public abstract class AbstractCommonCrawlFormat extends java.lang.Object implements CommonCrawlFormat
Modifier and Type | Field and Description |
---|---|
protected Configuration |
conf |
protected Content |
content |
protected java.util.List<java.lang.String> |
inLinks |
protected boolean |
jsonArray |
protected java.lang.String |
keyPrefix |
protected static org.slf4j.Logger |
LOG |
protected Metadata |
metadata |
protected boolean |
reverseKey |
protected java.lang.String |
reverseKeyValue |
protected boolean |
simpleDateFormat |
protected java.lang.String |
url |
Constructor and Description |
---|
AbstractCommonCrawlFormat(java.lang.String url,
Content content,
Metadata metadata,
Configuration nutchConf,
CommonCrawlConfig config) |
Modifier and Type | Method and Description |
---|---|
void |
close()
Optional method that could be implemented if the actual format needs some
close procedure.
|
protected abstract void |
closeArray(java.lang.String key,
boolean nested,
boolean newline) |
protected abstract void |
closeObject(java.lang.String key) |
protected abstract java.lang.String |
generateJson() |
protected java.lang.String |
getImported() |
java.util.List<java.lang.String> |
getInLinks()
Gets the list of inlinks.
|
java.lang.String |
getJsonData() |
java.lang.String |
getJsonData(java.lang.String url,
Content content,
Metadata metadata)
Returns a string representation of the JSON structure of the URL content
|
java.lang.String |
getJsonData(java.lang.String url,
Content content,
Metadata metadata,
ParseData parseData)
Returns a string representation of the JSON structure of the URL content,
taking into account the parsed metadata about the URL.
|
protected java.lang.String |
getKey() |
protected java.lang.String |
getMethod() |
protected java.lang.String |
getRequestAccept() |
protected java.lang.String |
getRequestAcceptEncoding() |
protected java.lang.String |
getRequestAcceptLanguage() |
protected java.lang.String |
getRequestContactEmail() |
protected java.lang.String |
getRequestContactName() |
protected java.lang.String |
getRequestHostAddress() |
protected java.lang.String |
getRequestHostName() |
protected java.lang.String |
getRequestRobots() |
protected java.lang.String |
getRequestSoftware() |
protected java.lang.String |
getRequestUserAgent() |
protected java.lang.String |
getResponseAddress() |
protected java.lang.String |
getResponseContent() |
protected java.lang.String |
getResponseContentEncoding() |
protected java.lang.String |
getResponseContentType() |
protected java.lang.String |
getResponseDate() |
protected java.lang.String |
getResponseHostName() |
protected java.lang.String |
getResponseServer() |
protected java.lang.String |
getResponseStatus() |
protected java.lang.String |
getTimestamp() |
protected java.lang.String |
getUrl() |
void |
setInLinks(java.util.List<java.lang.String> inLinks)
sets inlinks of this document
|
protected abstract void |
startArray(java.lang.String key,
boolean nested,
boolean newline) |
protected abstract void |
startObject(java.lang.String key) |
protected abstract void |
writeArrayValue(java.lang.String value) |
protected abstract void |
writeKeyNull(java.lang.String key) |
protected abstract void |
writeKeyValue(java.lang.String key,
java.lang.String value) |
protected static final org.slf4j.Logger LOG
protected java.lang.String url
protected Content content
protected Metadata metadata
protected Configuration conf
protected java.lang.String keyPrefix
protected boolean simpleDateFormat
protected boolean jsonArray
protected boolean reverseKey
protected java.lang.String reverseKeyValue
protected java.util.List<java.lang.String> inLinks
public AbstractCommonCrawlFormat(java.lang.String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws java.io.IOException
java.io.IOException
public java.lang.String getJsonData(java.lang.String url, Content content, Metadata metadata) throws java.io.IOException
CommonCrawlFormat
getJsonData
in interface CommonCrawlFormat
java.io.IOException
public java.lang.String getJsonData(java.lang.String url, Content content, Metadata metadata, ParseData parseData) throws java.io.IOException
CommonCrawlFormat
getJsonData
in interface CommonCrawlFormat
java.io.IOException
public java.lang.String getJsonData() throws java.io.IOException
getJsonData
in interface CommonCrawlFormat
java.io.IOException
protected abstract void writeKeyValue(java.lang.String key, java.lang.String value) throws java.io.IOException
java.io.IOException
protected abstract void writeKeyNull(java.lang.String key) throws java.io.IOException
java.io.IOException
protected abstract void startArray(java.lang.String key, boolean nested, boolean newline) throws java.io.IOException
java.io.IOException
protected abstract void closeArray(java.lang.String key, boolean nested, boolean newline) throws java.io.IOException
java.io.IOException
protected abstract void writeArrayValue(java.lang.String value) throws java.io.IOException
java.io.IOException
protected abstract void startObject(java.lang.String key) throws java.io.IOException
java.io.IOException
protected abstract void closeObject(java.lang.String key) throws java.io.IOException
java.io.IOException
protected abstract java.lang.String generateJson() throws java.io.IOException
java.io.IOException
protected java.lang.String getUrl()
protected java.lang.String getTimestamp()
protected java.lang.String getMethod()
protected java.lang.String getRequestHostName()
protected java.lang.String getRequestHostAddress()
protected java.lang.String getRequestSoftware()
protected java.lang.String getRequestRobots()
protected java.lang.String getRequestContactName()
protected java.lang.String getRequestContactEmail()
protected java.lang.String getRequestAccept()
protected java.lang.String getRequestAcceptEncoding()
protected java.lang.String getRequestAcceptLanguage()
protected java.lang.String getRequestUserAgent()
protected java.lang.String getResponseStatus()
protected java.lang.String getResponseHostName()
protected java.lang.String getResponseAddress()
protected java.lang.String getResponseContentEncoding()
protected java.lang.String getResponseContentType()
public java.util.List<java.lang.String> getInLinks()
CommonCrawlFormat
getInLinks
in interface CommonCrawlFormat
public void setInLinks(java.util.List<java.lang.String> inLinks)
CommonCrawlFormat
setInLinks
in interface CommonCrawlFormat
inLinks
- list of inlinks
protected java.lang.String getResponseDate()
protected java.lang.String getResponseServer()
protected java.lang.String getResponseContent()
protected java.lang.String getKey()
protected java.lang.String getImported()
public void close()
CommonCrawlFormat
close
in interface java.io.Closeable
close
in interface java.lang.AutoCloseable
close
in interface CommonCrawlFormat
Copyright © 2019 The Apache Software Foundation