public class CommonCrawlFormatWARC extends AbstractCommonCrawlFormat
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
MAX_WARC_FILE_SIZE |
static java.lang.String |
TEMPLATE |
conf, content, inLinks, jsonArray, keyPrefix, LOG, metadata, reverseKey, reverseKeyValue, simpleDateFormat, url
Constructor and Description |
---|
CommonCrawlFormatWARC(Configuration nutchConf,
CommonCrawlConfig config) |
CommonCrawlFormatWARC(java.lang.String url,
Content content,
Metadata metadata,
Configuration nutchConf,
CommonCrawlConfig config,
ParseData parseData) |
Modifier and Type | Method and Description |
---|---|
void |
close()
Optional method that could be implemented if the actual format needs some
close procedure.
|
protected void |
closeArray(java.lang.String key,
boolean nested,
boolean newline) |
protected void |
closeObject(java.lang.String key) |
protected java.lang.String |
generateJson() |
java.lang.String |
getJsonData() |
java.lang.String |
getJsonData(java.lang.String url,
Content content,
Metadata metadata,
ParseData parseData)
Returns a string representation of the JSON structure of the URL content,
taking into account the parsed metadata about the URL.
|
protected void |
startArray(java.lang.String key,
boolean nested,
boolean newline) |
protected void |
startObject(java.lang.String key) |
protected void |
writeArrayValue(java.lang.String value) |
protected void |
writeKeyNull(java.lang.String key) |
protected void |
writeKeyValue(java.lang.String key,
java.lang.String value) |
protected java.net.URI |
writeRequest(java.net.URI id) |
protected java.net.URI |
writeResponse() |
getImported, getInLinks, getJsonData, getKey, getMethod, getRequestAccept, getRequestAcceptEncoding, getRequestAcceptLanguage, getRequestContactEmail, getRequestContactName, getRequestHostAddress, getRequestHostName, getRequestRobots, getRequestSoftware, getRequestUserAgent, getResponseAddress, getResponseContent, getResponseContentEncoding, getResponseContentType, getResponseDate, getResponseHostName, getResponseServer, getResponseStatus, getTimestamp, getUrl, setInLinks
public static final java.lang.String MAX_WARC_FILE_SIZE
public static final java.lang.String TEMPLATE
public CommonCrawlFormatWARC(Configuration nutchConf, CommonCrawlConfig config) throws java.io.IOException
java.io.IOException
public CommonCrawlFormatWARC(java.lang.String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config, ParseData parseData) throws java.io.IOException
java.io.IOException
public java.lang.String getJsonData(java.lang.String url, Content content, Metadata metadata, ParseData parseData) throws java.io.IOException
CommonCrawlFormat
getJsonData
in interface CommonCrawlFormat
getJsonData
in class AbstractCommonCrawlFormat
java.io.IOException
public java.lang.String getJsonData() throws java.io.IOException
getJsonData
in interface CommonCrawlFormat
getJsonData
in class AbstractCommonCrawlFormat
java.io.IOException
protected java.net.URI writeResponse() throws java.io.IOException, java.text.ParseException
java.io.IOException
java.text.ParseException
protected java.net.URI writeRequest(java.net.URI id) throws java.io.IOException, java.text.ParseException
java.io.IOException
java.text.ParseException
protected java.lang.String generateJson() throws java.io.IOException
generateJson
in class AbstractCommonCrawlFormat
java.io.IOException
protected void writeKeyValue(java.lang.String key, java.lang.String value) throws java.io.IOException
writeKeyValue
in class AbstractCommonCrawlFormat
java.io.IOException
protected void writeKeyNull(java.lang.String key) throws java.io.IOException
writeKeyNull
in class AbstractCommonCrawlFormat
java.io.IOException
protected void startArray(java.lang.String key, boolean nested, boolean newline) throws java.io.IOException
startArray
in class AbstractCommonCrawlFormat
java.io.IOException
protected void closeArray(java.lang.String key, boolean nested, boolean newline) throws java.io.IOException
closeArray
in class AbstractCommonCrawlFormat
java.io.IOException
protected void writeArrayValue(java.lang.String value) throws java.io.IOException
writeArrayValue
in class AbstractCommonCrawlFormat
java.io.IOException
protected void startObject(java.lang.String key) throws java.io.IOException
startObject
in class AbstractCommonCrawlFormat
java.io.IOException
protected void closeObject(java.lang.String key) throws java.io.IOException
closeObject
in class AbstractCommonCrawlFormat
java.io.IOException
public void close()
CommonCrawlFormat
close
in interface java.io.Closeable
close
in interface java.lang.AutoCloseable
close
in interface CommonCrawlFormat
close
in class AbstractCommonCrawlFormat
Copyright © 2019 The Apache Software Foundation