Modifier and Type | Field and Description |
---|---|
protected java.lang.String |
accept
The "Accept" request header value.
|
protected java.lang.String |
acceptCharset
The "Accept-Charset" request header value.
|
protected java.lang.String |
acceptLanguage
The "Accept-Language" request header value.
|
static int |
BUFFER_SIZE |
static Text |
COOKIE |
protected boolean |
enableCookieHeader
Controls whether or not to set Cookie HTTP header based on CrawlDatum metadata
|
protected boolean |
enableIfModifiedsinceHeader
Configuration directive for If-Modified-Since HTTP header
|
protected int |
maxContent
The length limit for downloaded content, in bytes.
|
protected long |
maxCrawlDelay
Skip page if Crawl-Delay longer than this value.
|
protected int |
maxDuration
The time limit to download the entire content, in seconds.
|
protected boolean |
partialAsTruncated
Whether to save partial fetches as truncated content.
|
protected java.util.HashMap<java.lang.String,java.lang.String> |
proxyException
The proxy exception list.
|
protected java.lang.String |
proxyHost
The proxy hostname.
|
protected int |
proxyPort
The proxy port.
|
protected java.net.Proxy.Type |
proxyType
The proxy type.
|
static Text |
RESPONSE_TIME |
protected boolean |
responseTime
Record response time in CrawlDatum's meta data, see property
http.store.responsetime.
|
protected boolean |
storeHttpHeaders
Record the HTTP response header in the metadata, see property
store.http.headers . |
protected boolean |
storeHttpRequest
Record the HTTP request in the metadata, see property
store.http.request . |
protected boolean |
storeIPAddress
Record the IP address of the responding server, see property
store.ip.address . |
protected int |
timeout
The network timeout in milliseconds.
|
protected boolean |
tlsCheckCertificate
Whether to check TLS/SSL certificates
|
protected java.util.Set<java.lang.String> |
tlsPreferredCipherSuites
Which TLS/SSL cipher suites to support
|
protected java.util.Set<java.lang.String> |
tlsPreferredProtocols
Which TLS/SSL protocols to support
|
protected boolean |
useHttp11
Do we use HTTP/1.1?
|
protected boolean |
useHttp2
Whether to use HTTP/2
|
protected boolean |
useProxy
Indicates if a proxy is used
|
protected java.lang.String |
userAgent
The Nutch 'User-Agent' request header
|
X_POINT_ID
Constructor and Description |
---|
HttpBase()
Creates a new instance of HttpBase
|
HttpBase(org.slf4j.Logger logger)
Creates a new instance of HttpBase
|
Modifier and Type | Method and Description |
---|---|
java.lang.String |
getAccept() |
java.lang.String |
getAcceptCharset() |
java.lang.String |
getAcceptLanguage()
Value of "Accept-Language" request header sent by Nutch.
|
Configuration |
getConf() |
java.lang.String |
getCookie(java.net.URL url)
If per-host cookies are configured, this method will look it up
for the given url.
|
int |
getMaxContent() |
int |
getMaxDuration()
The time limit to download the entire content, in seconds.
|
ProtocolOutput |
getProtocolOutput(Text url,
CrawlDatum datum)
Returns the
Content for a fetchlist entry. |
java.lang.String |
getProxyHost() |
int |
getProxyPort() |
protected abstract Response |
getResponse(java.net.URL url,
CrawlDatum datum,
boolean followRedirects) |
crawlercommons.robots.BaseRobotRules |
getRobotRules(Text url,
CrawlDatum datum,
java.util.List<Content> robotsTxtContent)
Retrieve robot rules applicable for this URL.
|
int |
getTimeout() |
java.util.Set<java.lang.String> |
getTlsPreferredCipherSuites() |
java.util.Set<java.lang.String> |
getTlsPreferredProtocols() |
boolean |
getUseHttp11() |
java.lang.String |
getUserAgent() |
boolean |
isCookieEnabled() |
boolean |
isIfModifiedSinceEnabled() |
boolean |
isStoreHttpHeaders() |
boolean |
isStoreHttpRequest() |
boolean |
isStoreIPAddress() |
boolean |
isStorePartialAsTruncated()
Whether to save partial fetches as truncated content, cf.
|
boolean |
isTlsCheckCertificates() |
protected void |
logConf() |
protected static void |
main(HttpBase http,
java.lang.String[] args) |
byte[] |
processDeflateEncoded(byte[] compressed,
java.net.URL url) |
byte[] |
processGzipEncoded(byte[] compressed,
java.net.URL url) |
void |
setConf(Configuration conf) |
boolean |
useProxy(java.lang.String host) |
boolean |
useProxy(java.net.URI uri) |
boolean |
useProxy(java.net.URL url) |
public static final Text RESPONSE_TIME
public static final Text COOKIE
public static final int BUFFER_SIZE
protected java.lang.String proxyHost
protected int proxyPort
protected java.net.Proxy.Type proxyType
protected java.util.HashMap<java.lang.String,java.lang.String> proxyException
protected boolean useProxy
protected int timeout
protected int maxContent
protected int maxDuration
protected boolean partialAsTruncated
protected java.lang.String userAgent
protected java.lang.String acceptLanguage
protected java.lang.String acceptCharset
protected java.lang.String accept
protected boolean useHttp11
protected boolean useHttp2
protected boolean responseTime
protected boolean storeIPAddress
store.ip.address.
protected boolean storeHttpRequest
store.http.request.
protected boolean storeHttpHeaders
store.http.headers.
protected long maxCrawlDelay
protected boolean tlsCheckCertificate
protected java.util.Set<java.lang.String> tlsPreferredProtocols
protected java.util.Set<java.lang.String> tlsPreferredCipherSuites
protected boolean enableIfModifiedsinceHeader
protected boolean enableCookieHeader
public HttpBase()
public HttpBase(org.slf4j.Logger logger)
public void setConf(Configuration conf)
setConf
in interface Configurable
public Configuration getConf()
getConf
in interface Configurable
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum)
Protocol
Content
for a fetchlist entry.
getProtocolOutput
in interface Protocol
public java.lang.String getProxyHost()
public int getProxyPort()
public boolean useProxy(java.net.URL url)
public boolean useProxy(java.net.URI uri)
public boolean useProxy(java.lang.String host)
public int getTimeout()
public boolean isIfModifiedSinceEnabled()
public boolean isCookieEnabled()
public boolean isStoreIPAddress()
public boolean isStoreHttpRequest()
public boolean isStoreHttpHeaders()
public int getMaxContent()
public int getMaxDuration()
http.time.limit.
public boolean isStorePartialAsTruncated()
http.partial.truncated.
public java.lang.String getUserAgent()
public java.lang.String getCookie(java.net.URL url)
url
- the url to look up a cookie for
public java.lang.String getAcceptLanguage()
public java.lang.String getAcceptCharset()
public java.lang.String getAccept()
public boolean getUseHttp11()
public boolean isTlsCheckCertificates()
public java.util.Set<java.lang.String> getTlsPreferredCipherSuites()
public java.util.Set<java.lang.String> getTlsPreferredProtocols()
protected void logConf()
public byte[] processGzipEncoded(byte[] compressed, java.net.URL url) throws java.io.IOException
java.io.IOException
public byte[] processDeflateEncoded(byte[] compressed, java.net.URL url) throws java.io.IOException
java.io.IOException
protected static void main(HttpBase http, java.lang.String[] args) throws java.lang.Exception
java.lang.Exception
protected abstract Response getResponse(java.net.URL url, CrawlDatum datum, boolean followRedirects) throws ProtocolException, java.io.IOException
ProtocolException
java.io.IOException
public crawlercommons.robots.BaseRobotRules getRobotRules(Text url, CrawlDatum datum, java.util.List<Content> robotsTxtContent)
Protocol
getRobotRules
in interface Protocol
url
- URL to check
datum
- page datum
robotsTxtContent
- container to store responses when fetching the robots.txt file for
debugging or archival purposes. Instead of a robots.txt file, it
may include redirects or an error page (404, etc.). Response
Content
is appended to the passed list. If null is passed,
nothing is stored.
Copyright © 2019 The Apache Software Foundation