public class WARCUtils
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
COLONSP |
static java.lang.String |
CONFORMS_TO |
static java.lang.String |
CRLF |
static java.lang.String |
FORMAT |
static org.archive.uid.UUIDGenerator |
generator |
static java.lang.String |
HOSTNAME |
static java.lang.String |
HTTP_HEADER_FROM |
static java.lang.String |
HTTP_HEADER_USER_AGENT |
static java.lang.String |
IP |
static java.lang.String |
OPERATOR |
protected static java.util.regex.Pattern |
PROBLEMATIC_HEADERS |
static java.lang.String |
ROBOTS |
static java.lang.String |
SOFTWARE |
protected static java.lang.String |
X_HIDE_HEADER |
Constructor and Description |
---|
WARCUtils() |
Modifier and Type | Method and Description |
---|---|
static org.archive.io.warc.WARCRecordInfo |
docToMetadata(NutchDocument doc) |
static java.lang.String |
fixHttpHeaders(java.lang.String headers,
int contentLength)
Modify verbatim HTTP response headers: fix, remove or replace headers
Content-Length, Content-Encoding and
Transfer-Encoding which may confuse WARC readers. |
static java.lang.String |
getAgentString(java.lang.String name,
java.lang.String version,
java.lang.String description,
java.lang.String URL,
java.lang.String email) |
static java.lang.String |
getHostname(Configuration conf) |
static java.lang.String |
getIPAddress(Configuration conf) |
static org.archive.util.anvl.ANVLRecord |
getWARCInfoContent(Configuration conf) |
static byte[] |
toByteArray(org.archive.format.http.HttpHeaders headers) |
public static final java.lang.String SOFTWARE
public static final java.lang.String HTTP_HEADER_FROM
public static final java.lang.String HTTP_HEADER_USER_AGENT
public static final java.lang.String HOSTNAME
public static final java.lang.String ROBOTS
public static final java.lang.String OPERATOR
public static final java.lang.String FORMAT
public static final java.lang.String CONFORMS_TO
public static final java.lang.String IP
public static final org.archive.uid.UUIDGenerator generator
public static final java.lang.String CRLF
public static final java.lang.String COLONSP
protected static final java.util.regex.Pattern PROBLEMATIC_HEADERS
protected static final java.lang.String X_HIDE_HEADER
public static final org.archive.util.anvl.ANVLRecord getWARCInfoContent(Configuration conf)
public static final java.lang.String getHostname(Configuration conf) throws java.net.UnknownHostException
Throws: java.net.UnknownHostException
public static final java.lang.String getIPAddress(Configuration conf) throws java.net.UnknownHostException
Throws: java.net.UnknownHostException
public static final byte[] toByteArray(org.archive.format.http.HttpHeaders headers) throws java.io.IOException
Throws: java.io.IOException
public static final java.lang.String getAgentString(java.lang.String name, java.lang.String version, java.lang.String description, java.lang.String URL, java.lang.String email)
public static final org.archive.io.warc.WARCRecordInfo docToMetadata(NutchDocument doc) throws java.io.UnsupportedEncodingException
Throws: java.io.UnsupportedEncodingException
public static final java.lang.String fixHttpHeaders(java.lang.String headers, int contentLength)
Modify verbatim HTTP response headers: fix, remove or replace headers
Content-Length, Content-Encoding and Transfer-Encoding which may confuse
WARC readers. Ensure that the returned headers end with a single empty
line (\r\n\r\n).
Parameters:
headers - HTTP 1.1 or 1.0 response header string, CR-LF-separated lines,
first line is status line
Copyright © 2019 The Apache Software Foundation