
us.codecraft.webmagic.Site
package us.codecraft.webmagic;

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import org.apache.http.HttpHost;

import us.codecraft.webmagic.proxy.ProxyPool;
import us.codecraft.webmagic.utils.UrlUtils;

import java.util.*;

/**
 * Object that contains the settings for a crawler.
 *
 * @author [email protected]
 * @see us.codecraft.webmagic.processor.PageProcessor
 * @since 0.1.0
 */
public class Site {

    private String domain;

    private String userAgent;

    private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();

    private Table<String, String, String> cookies = HashBasedTable.create();

    private String charset;

    /**
     * startRequests are the requests the crawler starts with.
     */
    private List<Request> startRequests = new ArrayList<Request>();

    private int sleepTime = 5000;

    private int retryTimes = 0;

    private int cycleRetryTimes = 0;

    private int retrySleepTime = 1000;

    private int timeOut = 5000;

    private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();

    private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;

    private Map<String, String> headers = new HashMap<String, String>();

    private HttpHost httpProxy;

    private ProxyPool httpProxyPool;

    private boolean useGzip = true;

    /**
     * @see us.codecraft.webmagic.utils.HttpConstant.Header
     * @deprecated
     */
    public static interface HeaderConst {

        public static final String REFERER = "Referer";
    }

    static {
        DEFAULT_STATUS_CODE_SET.add(200);
    }

    /**
     * Create a new Site.
     *
     * @return new site
     */
    public static Site me() {
        return new Site();
    }

    /**
     * Add a cookie with domain {@link #getDomain()}.
     *
     * @param name  name
     * @param value value
     * @return this
     */
    public Site addCookie(String name, String value) {
        defaultCookies.put(name, value);
        return this;
    }

    /**
     * Add a cookie with a specific domain.
     *
     * @param domain domain
     * @param name   name
     * @param value  value
     * @return this
     */
    public Site addCookie(String domain, String name, String value) {
        cookies.put(domain, name, value);
        return this;
    }

    /**
     * Set the user agent.
     *
     * @param userAgent userAgent
     * @return this
     */
    public Site setUserAgent(String userAgent) {
        this.userAgent = userAgent;
        return this;
    }

    /**
     * Get cookies of the default domain.
     *
     * @return cookies
     */
    public Map<String, String> getCookies() {
        return defaultCookies;
    }

    /**
     * Get cookies of all domains.
     *
     * @return cookies
     */
    public Map<String, Map<String, String>> getAllCookies() {
        return cookies.rowMap();
    }

    /**
     * Get the user agent.
     *
     * @return user agent
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * Get the domain.
     *
     * @return domain
     */
    public String getDomain() {
        return domain;
    }

    /**
     * Set the domain of the site.
     *
     * @param domain domain
     * @return this
     */
    public Site setDomain(String domain) {
        this.domain = domain;
        return this;
    }

    /**
     * Set the charset of pages manually.
     * When the charset is not set or set to null, it will be auto-detected from the HTTP header.
     *
     * @param charset charset
     * @return this
     */
    public Site setCharset(String charset) {
        this.charset = charset;
        return this;
    }

    /**
     * Get the charset set manually.
     *
     * @return charset
     */
    public String getCharset() {
        return charset;
    }

    public int getTimeOut() {
        return timeOut;
    }

    /**
     * Set the timeout for the downloader, in milliseconds.
     *
     * @param timeOut timeOut
     * @return this
     */
    public Site setTimeOut(int timeOut) {
        this.timeOut = timeOut;
        return this;
    }

    /**
     * Set acceptStatCode.
     * When the status code of an HTTP response is in acceptStatCode, the page will be processed.
     * {200} by default.
     * It does not need to be set in most cases.
     *
     * @param acceptStatCode acceptStatCode
     * @return this
     */
    public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
        this.acceptStatCode = acceptStatCode;
        return this;
    }

    /**
     * Get acceptStatCode.
     *
     * @return acceptStatCode
     */
    public Set<Integer> getAcceptStatCode() {
        return acceptStatCode;
    }

    /**
     * Get start urls.
     *
     * @return start urls
     * @see #getStartRequests
     * @deprecated
     */
    @Deprecated
    public List<String> getStartUrls() {
        return UrlUtils.convertToUrls(startRequests);
    }

    public List<Request> getStartRequests() {
        return startRequests;
    }

    /**
     * Add a url to the start urls.
     * Because urls are more a Spider's property than a Site's, use {@link Spider#addUrl(String...)} instead.
     *
     * @param startUrl startUrl
     * @return this
     * @see Spider#addUrl(String...)
     * @deprecated
     */
    public Site addStartUrl(String startUrl) {
        return addStartRequest(new Request(startUrl));
    }

    /**
     * Add a request to the start requests.
     * Because requests are more a Spider's property than a Site's, use {@link Spider#addRequest(Request...)} instead.
     *
     * @param startRequest startRequest
     * @return this
     * @see Spider#addRequest(Request...)
     * @deprecated
     */
    public Site addStartRequest(Request startRequest) {
        this.startRequests.add(startRequest);
        if (domain == null && startRequest.getUrl() != null) {
            domain = UrlUtils.getDomain(startRequest.getUrl());
        }
        return this;
    }

    /**
     * Set the interval between the processing of two pages.
     * Time unit is milliseconds.
     *
     * @param sleepTime sleepTime
     * @return this
     */
    public Site setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Get the interval between the processing of two pages.
     * Time unit is milliseconds.
     *
     * @return the interval between the processing of two pages
     */
    public int getSleepTime() {
        return sleepTime;
    }

    /**
     * Get the number of immediate retries when a download fails, 0 by default.
     *
     * @return retry times when a download fails
     */
    public int getRetryTimes() {
        return retryTimes;
    }

    public Map<String, String> getHeaders() {
        return headers;
    }

    /**
     * Put an HTTP header for the downloader.
     * Use {@link #addCookie(String, String)} for cookies and {@link #setUserAgent(String)} for the user agent.
     *
     * @param key   key of the http header; some key constants are available in {@link HeaderConst}
     * @param value value of the header
     * @return this
     */
    public Site addHeader(String key, String value) {
        headers.put(key, value);
        return this;
    }

    /**
     * Set the number of immediate retries when a download fails, 0 by default.
     *
     * @param retryTimes retryTimes
     * @return this
     */
    public Site setRetryTimes(int retryTimes) {
        this.retryTimes = retryTimes;
        return this;
    }

    /**
     * When cycleRetryTimes is more than 0, a failed request will be added back to the scheduler and downloaded again.
     *
     * @return cycle retry times when a download fails
     */
    public int getCycleRetryTimes() {
        return cycleRetryTimes;
    }

    /**
     * Set cycleRetryTimes when a download fails, 0 by default.
     *
     * @param cycleRetryTimes cycleRetryTimes
     * @return this
     */
    public Site setCycleRetryTimes(int cycleRetryTimes) {
        this.cycleRetryTimes = cycleRetryTimes;
        return this;
    }

    public HttpHost getHttpProxy() {
        return httpProxy;
    }

    /**
     * Set up an httpProxy for this site.
     *
     * @param httpProxy httpProxy
     * @return this
     */
    public Site setHttpProxy(HttpHost httpProxy) {
        this.httpProxy = httpProxy;
        return this;
    }

    public boolean isUseGzip() {
        return useGzip;
    }

    public int getRetrySleepTime() {
        return retrySleepTime;
    }

    /**
     * Set the sleep time before retrying when a download fails, 1000 ms by default.
     *
     * @param retrySleepTime retrySleepTime
     * @return this
     */
    public Site setRetrySleepTime(int retrySleepTime) {
        this.retrySleepTime = retrySleepTime;
        return this;
    }

    /**
     * Whether to use gzip.
     * Default is true; set it to false to disable gzip.
     *
     * @param useGzip useGzip
     * @return this
     */
    public Site setUseGzip(boolean useGzip) {
        this.useGzip = useGzip;
        return this;
    }

    public Task toTask() {
        return new Task() {
            @Override
            public String getUUID() {
                return Site.this.getDomain();
            }

            @Override
            public Site getSite() {
                return Site.this;
            }
        };
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        Site site = (Site) o;

        if (cycleRetryTimes != site.cycleRetryTimes) return false;
        if (retryTimes != site.retryTimes) return false;
        if (sleepTime != site.sleepTime) return false;
        if (timeOut != site.timeOut) return false;
        if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
            return false;
        if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
        if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null)
            return false;
        if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
        if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
        if (startRequests != null ? !startRequests.equals(site.startRequests) : site.startRequests != null)
            return false;
        if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;

        return true;
    }

    @Override
    public int hashCode() {
        int result = domain != null ? domain.hashCode() : 0;
        result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
        result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
        result = 31 * result + (charset != null ? charset.hashCode() : 0);
        result = 31 * result + (startRequests != null ? startRequests.hashCode() : 0);
        result = 31 * result + sleepTime;
        result = 31 * result + retryTimes;
        result = 31 * result + cycleRetryTimes;
        result = 31 * result + timeOut;
        result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
        result = 31 * result + (headers != null ? headers.hashCode() : 0);
        return result;
    }

    @Override
    public String toString() {
        return "Site{" +
                "domain='" + domain + '\'' +
                ", userAgent='" + userAgent + '\'' +
                ", cookies=" + defaultCookies +
                ", charset='" + charset + '\'' +
                ", startRequests=" + startRequests +
                ", sleepTime=" + sleepTime +
                ", retryTimes=" + retryTimes +
                ", cycleRetryTimes=" + cycleRetryTimes +
                ", timeOut=" + timeOut +
                ", acceptStatCode=" + acceptStatCode +
                ", headers=" + headers +
                '}';
    }

    /**
     * Set httpProxyPool; each entry is a String[] with String[0] = ip and String[1] = port.
     *
     * @param httpProxyList httpProxyList
     * @return this
     */
    public Site setHttpProxyPool(List<String[]> httpProxyList) {
        this.httpProxyPool = new ProxyPool(httpProxyList);
        return this;
    }

    public Site enableHttpProxyPool() {
        this.httpProxyPool = new ProxyPool();
        return this;
    }

    public ProxyPool getHttpProxyPool() {
        return httpProxyPool;
    }

    public HttpHost getHttpProxyFromPool() {
        return httpProxyPool.getProxy();
    }

    public void returnHttpProxyToPool(HttpHost proxy, int statusCode) {
        httpProxyPool.returnProxy(proxy, statusCode);
    }

    public Site setProxyReuseInterval(int reuseInterval) {
        this.httpProxyPool.setReuseInterval(reuseInterval);
        return this;
    }
}
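
For context, the sketch below shows how a Site configured with the setters above is typically attached to a PageProcessor and run by a Spider, both of which are referenced in the Javadoc of this class. It is a minimal, illustrative example only: the class name, domain, URLs, header and cookie values, and the timing/retry numbers are placeholders and not part of the Site source.

package us.codecraft.webmagic.example;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Minimal usage sketch: a PageProcessor that exposes a fluently configured Site.
 * All concrete values below are placeholders.
 */
public class ExamplePageProcessor implements PageProcessor {

    // Every setter on Site returns "this", so the configuration can be chained.
    private final Site site = Site.me()
            .setDomain("example.com")            // placeholder domain
            .setCharset("UTF-8")                 // omit to auto-detect from the HTTP header
            .setSleepTime(1000)                  // interval between two pages, in ms
            .setTimeOut(10000)                   // downloader timeout, in ms
            .setRetryTimes(3)                    // immediate retries on download failure
            .setCycleRetryTimes(3)               // re-scheduled retries on download failure
            .addHeader("Referer", "http://example.com/")
            .addCookie("sessionId", "placeholder-value");

    @Override
    public void process(Page page) {
        // Extraction logic would go here; omitted in this sketch.
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new ExamplePageProcessor())
                .addUrl("http://example.com/")   // start URLs now belong to Spider, not Site
                .thread(2)
                .run();
    }
}

If a proxy pool is needed, setHttpProxyPool(List<String[]>) from the class above can be added to the same chain, with each String[] holding the ip at index 0 and the port at index 1.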