// us.codecraft.webmagic.Site (Maven / Gradle / Ivy) - the newest version.
package us.codecraft.webmagic;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import us.codecraft.webmagic.utils.HttpConstant;
/**
* Object contains setting for crawler.
*
* @author [email protected]
* @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/
public class Site {
private String domain;
private String userAgent;
private Map defaultCookies = new LinkedHashMap();
private Map> cookies = new HashMap>();
private String charset;
private String defaultCharset;
private int sleepTime = 5000;
private int retryTimes = 0;
private int cycleRetryTimes = 0;
private int retrySleepTime = 1000;
private int timeOut = 5000;
private static final Set DEFAULT_STATUS_CODE_SET = new HashSet();
private Set acceptStatCode = DEFAULT_STATUS_CODE_SET;
private Map headers = new HashMap();
private boolean useGzip = true;
private boolean disableCookieManagement = false;
static {
DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
}
/**
* new a Site
*
* @return new site
*/
public static Site me() {
return new Site();
}
/**
* Add a cookie with domain {@link #getDomain()}
*
* @param name name
* @param value value
* @return this
*/
public Site addCookie(String name, String value) {
defaultCookies.put(name, value);
return this;
}
/**
* Add a cookie with specific domain.
*
* @param domain domain
* @param name name
* @param value value
* @return this
*/
public Site addCookie(String domain, String name, String value) {
if (!cookies.containsKey(domain)){
cookies.put(domain,new HashMap());
}
cookies.get(domain).put(name, value);
return this;
}
/**
* set user agent
*
* @param userAgent userAgent
* @return this
*/
public Site setUserAgent(String userAgent) {
this.userAgent = userAgent;
return this;
}
/**
* get cookies
*
* @return get cookies
*/
public Map getCookies() {
return defaultCookies;
}
/**
* get cookies of all domains
*
* @return get cookies
*/
public Map> getAllCookies() {
return cookies;
}
/**
* get user agent
*
* @return user agent
*/
public String getUserAgent() {
return userAgent;
}
/**
* get domain
*
* @return get domain
*/
public String getDomain() {
return domain;
}
/**
* set the domain of site.
*
* @param domain domain
* @return this
*/
public Site setDomain(String domain) {
this.domain = domain;
return this;
}
/**
* Set charset of page manually.
* When charset is not set or set to null, it can be auto detected by Http header.
*
* @param charset charset
* @return this
*/
public Site setCharset(String charset) {
this.charset = charset;
return this;
}
/**
* get charset set manually
*
* @return charset
*/
public String getCharset() {
return charset;
}
/**
* Set default charset of page.
*
* When charset detect failed, use this default charset.
*
* @param defaultCharset the default charset
* @return this
* @since 0.9.0
*/
public Site setDefaultCharset(String defaultCharset) {
this.defaultCharset = defaultCharset;
return this;
}
/**
* The default charset if charset detected failed.
*
* @return the defulat charset
* @since 0.9.0
*/
public String getDefaultCharset() {
return defaultCharset;
}
public int getTimeOut() {
return timeOut;
}
/**
* set timeout for downloader in ms
*
* @param timeOut timeOut
* @return this
*/
public Site setTimeOut(int timeOut) {
this.timeOut = timeOut;
return this;
}
/**
* Set acceptStatCode.
* When status code of http response is in acceptStatCodes, it will be processed.
* {200} by default.
* It is not necessarily to be set.
*
* @param acceptStatCode acceptStatCode
* @return this
*/
public Site setAcceptStatCode(Set acceptStatCode) {
this.acceptStatCode = acceptStatCode;
return this;
}
/**
* get acceptStatCode
*
* @return acceptStatCode
*/
public Set getAcceptStatCode() {
return acceptStatCode;
}
/**
* Set the interval between the processing of two pages.
* Time unit is milliseconds.
*
* @param sleepTime sleepTime
* @return this
*/
public Site setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
return this;
}
/**
* Get the interval between the processing of two pages.
* Time unit is milliseconds.
*
* @return the interval between the processing of two pages,
*/
public int getSleepTime() {
return sleepTime;
}
/**
* Get retry times immediately when download fail, 0 by default.
*
* @return retry times when download fail
*/
public int getRetryTimes() {
return retryTimes;
}
public Map getHeaders() {
return headers;
}
/**
* Put an Http header for downloader.
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
*
* @param key key of http header, there are some keys constant in {@link HttpConstant.Header}
* @param value value of header
* @return this
*/
public Site addHeader(String key, String value) {
headers.put(key, value);
return this;
}
/**
* Set retry times when download fail, 0 by default.
*
* @param retryTimes retryTimes
* @return this
*/
public Site setRetryTimes(int retryTimes) {
this.retryTimes = retryTimes;
return this;
}
/**
* When cycleRetryTimes is more than 0, it will add back to scheduler and try download again.
*
* @return retry times when download fail
*/
public int getCycleRetryTimes() {
return cycleRetryTimes;
}
/**
* Set cycleRetryTimes times when download fail, 0 by default.
*
* @param cycleRetryTimes cycleRetryTimes
* @return this
*/
public Site setCycleRetryTimes(int cycleRetryTimes) {
this.cycleRetryTimes = cycleRetryTimes;
return this;
}
public boolean isUseGzip() {
return useGzip;
}
public int getRetrySleepTime() {
return retrySleepTime;
}
/**
* Set retry sleep times when download fail, 1000 by default.
*
* @param retrySleepTime retrySleepTime
* @return this
*/
public Site setRetrySleepTime(int retrySleepTime) {
this.retrySleepTime = retrySleepTime;
return this;
}
/**
* Whether use gzip.
* Default is true, you can set it to false to disable gzip.
*
* @param useGzip useGzip
* @return this
*/
public Site setUseGzip(boolean useGzip) {
this.useGzip = useGzip;
return this;
}
public boolean isDisableCookieManagement() {
return disableCookieManagement;
}
/**
* Downloader is supposed to store response cookie.
* Disable it to ignore all cookie fields and stay clean.
* Warning: Set cookie will still NOT work if disableCookieManagement is true.
* @param disableCookieManagement disableCookieManagement
* @return this
*/
public Site setDisableCookieManagement(boolean disableCookieManagement) {
this.disableCookieManagement = disableCookieManagement;
return this;
}
public Task toTask() {
return new Task() {
@Override
public String getUUID() {
String uuid = Site.this.getDomain();
if (uuid == null) {
uuid = UUID.randomUUID().toString();
}
return uuid;
}
@Override
public Site getSite() {
return Site.this;
}
};
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Site site = (Site) o;
if (cycleRetryTimes != site.cycleRetryTimes) return false;
if (retryTimes != site.retryTimes) return false;
if (sleepTime != site.sleepTime) return false;
if (timeOut != site.timeOut) return false;
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
return false;
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null)
return false;
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
return true;
}
@Override
public int hashCode() {
int result = domain != null ? domain.hashCode() : 0;
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
result = 31 * result + (charset != null ? charset.hashCode() : 0);
result = 31 * result + sleepTime;
result = 31 * result + retryTimes;
result = 31 * result + cycleRetryTimes;
result = 31 * result + timeOut;
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
result = 31 * result + (headers != null ? headers.hashCode() : 0);
return result;
}
@Override
public String toString() {
return "Site{" +
"domain='" + domain + '\'' +
", userAgent='" + userAgent + '\'' +
", cookies=" + defaultCookies +
", charset='" + charset + '\'' +
", sleepTime=" + sleepTime +
", retryTimes=" + retryTimes +
", cycleRetryTimes=" + cycleRetryTimes +
", timeOut=" + timeOut +
", acceptStatCode=" + acceptStatCode +
", headers=" + headers +
'}';
}
}