// us.codecraft.webmagic.downloader.FileCache Maven / Gradle / Ivy
package us.codecraft.webmagic.downloader;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.FilePersistentBase;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.*;
/**
 * Downloader that caches pages on disk, combined with the pipeline that fills the cache.
 * <p>
 * As a {@link Downloader} it first looks for a cache file belonging to the requested url and
 * only falls back to {@code downloaderWhenFileMiss} when no usable file is found. As a
 * {@link Pipeline} it writes the {@code "html"} field of the result items to that same file.
 * As a {@link PageProcessor} it delegates to an internal {@link SimplePageProcessor}.
 * <p>
 * Cache file layout (UTF-8 text):
 * <pre>
 * url:\t&lt;request url&gt;
 * html:\t&lt;first html line&gt;
 * &lt;remaining html lines&gt;
 * </pre>
 * The file is stored as
 * {@code path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR + md5Hex(url) + ".html"}.
 *
 * @author [email protected]
 * @since 0.2.1
 */
@Experimental
public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor {

    /** Prefix of the first line of a cache file; the request url follows it. */
    private static final String URL_LINE_PREFIX = "url:\t";

    /** Prefix of the second line of a cache file; the html content follows it. */
    private static final String HTML_LINE_PREFIX = "html:\t";

    /** Suffix of every cache file; used by both the reading and the writing side. */
    private static final String CACHE_FILE_SUFFIX = ".html";

    /** Charset used for reading and writing cache files, independent of the platform default. */
    private static final String CACHE_CHARSET = "UTF-8";

    private Downloader downloaderWhenFileMiss;

    private final PageProcessor pageProcessor;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Creates a cache that stores its files under {@code "/data/webmagic/temp/"}.
     *
     * @param startUrl   passed to the internal {@link SimplePageProcessor}
     * @param urlPattern passed to the internal {@link SimplePageProcessor}
     */
    public FileCache(String startUrl, String urlPattern) {
        this(startUrl, urlPattern, "/data/webmagic/temp/");
    }

    /**
     * Creates a cache that stores its files under {@code path} and uses a new
     * {@link HttpClientDownloader} as the fallback downloader.
     *
     * @param startUrl   passed to the internal {@link SimplePageProcessor}
     * @param urlPattern passed to the internal {@link SimplePageProcessor}
     * @param path       base directory of the cache files
     */
    public FileCache(String startUrl, String urlPattern, String path) {
        this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
        setPath(path);
        downloaderWhenFileMiss = new HttpClientDownloader();
    }

    /**
     * Replaces the downloader used when a page is not available from the cache.
     *
     * @param downloaderWhenFileMiss fallback downloader; when {@code null}, a cache miss
     *                               makes {@link #download(Request, Task)} return {@code null}
     * @return this instance, for call chaining
     */
    public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) {
        this.downloaderWhenFileMiss = downloaderWhenFileMiss;
        return this;
    }

    /**
     * Returns the page for {@code request} from the cache file if one exists and is usable,
     * otherwise delegates to the fallback downloader.
     * <p>
     * A cache file is usable when its first line names exactly the requested url and an html
     * line follows. Missing, unreadable, empty or truncated files are treated as a miss.
     *
     * @param request request to serve
     * @param task    task whose UUID selects the cache sub-directory
     * @return the cached or freshly downloaded page; {@code null} on a miss when no fallback
     *         downloader is set
     */
    @Override
    public Page download(Request request, Task task) {
        String url = request.getUrl();
        String fileName = cacheFileName(url, task);
        Page page = null;
        BufferedReader bufferedReader = null;
        try {
            File file = getFile(fileName);
            bufferedReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), CACHE_CHARSET));
            // Constant-first comparison: readLine() returns null for an empty file.
            if ((URL_LINE_PREFIX + url).equals(bufferedReader.readLine())) {
                String html = getHtml(bufferedReader);
                if (html != null) {
                    page = new Page();
                    page.setRequest(request);
                    page.setUrl(PlainText.create(url));
                    page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, url)));
                }
            }
        } catch (IOException e) {
            if (e instanceof FileNotFoundException) {
                logger.info("File not exist for url {}", url);
            } else {
                logger.warn("File read error for url " + url, e);
            }
        } finally {
            if (bufferedReader != null) {
                try {
                    bufferedReader.close();
                } catch (IOException ignored) {
                    // The content has already been read (or the read already failed and was
                    // logged); a failure while closing cannot change the outcome.
                    logger.debug("Failed to close cache file for url {}", url);
                }
            }
        }
        if (page == null) {
            page = downloadWhenMiss(request, task);
        }
        return page;
    }

    /**
     * Does nothing.
     * <p>
     * NOTE(review): the thread count is not forwarded to the fallback downloader - confirm
     * whether that is intended.
     *
     * @param thread ignored
     */
    @Override
    public void setThread(int thread) {
    }

    /**
     * Reads the html part of a cache file. The reader must be positioned right after the
     * url line.
     *
     * @param bufferedReader reader over the cache file
     * @return the html with its lines joined by {@code '\n'}, or {@code null} when the file
     *         ends before the html line
     * @throws IOException if reading fails
     */
    private String getHtml(BufferedReader bufferedReader) throws IOException {
        String line = bufferedReader.readLine();
        if (line == null) {
            return null;
        }
        StringBuilder htmlBuilder = new StringBuilder(StringUtils.removeStart(line, HTML_LINE_PREFIX));
        while ((line = bufferedReader.readLine()) != null) {
            // readLine() strips the terminator; put one back so that line-sensitive content
            // (scripts, preformatted text) is not glued together.
            htmlBuilder.append('\n').append(line);
        }
        return htmlBuilder.toString();
    }

    /**
     * Downloads the page with the fallback downloader.
     *
     * @return the downloaded page, or {@code null} when no fallback downloader is set
     */
    private Page downloadWhenMiss(Request request, Task task) {
        Page page = null;
        if (downloaderWhenFileMiss != null) {
            page = downloaderWhenFileMiss.download(request, task);
        }
        return page;
    }

    /**
     * Builds the full name of the cache file for {@code url} within {@code task}. Used by
     * both the reading and the writing side so that they always agree on the location.
     */
    private String cacheFileName(String url, Task task) {
        return this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR
                + DigestUtils.md5Hex(url) + CACHE_FILE_SUFFIX;
    }

    /**
     * Writes the {@code "html"} field of {@code resultItems} to the cache file of its request.
     * Nothing is written when the field is absent. Write failures are logged, not thrown.
     *
     * @param resultItems result whose request url and {@code "html"} field are stored
     * @param task        task whose UUID selects the cache sub-directory
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String url = resultItems.getRequest().getUrl();
        Object html = resultItems.get("html");
        if (html == null) {
            // Writing the literal text "null" would later be served as the page content.
            logger.debug("No html field for url {}, skip caching", url);
            return;
        }
        PrintWriter printWriter = null;
        try {
            File file = getFile(cacheFileName(url, task));
            printWriter = new PrintWriter(
                    new OutputStreamWriter(new FileOutputStream(file), CACHE_CHARSET));
            printWriter.println(URL_LINE_PREFIX + url);
            printWriter.println(HTML_LINE_PREFIX + html);
            // PrintWriter never throws on write; its error flag is the only failure signal.
            if (printWriter.checkError()) {
                logger.warn("write file error for url " + url);
            }
        } catch (IOException e) {
            logger.warn("write file error", e);
        } finally {
            if (printWriter != null) {
                printWriter.close();
            }
        }
    }

    /**
     * Delegates page processing to the internal page processor.
     *
     * @param page page to process
     */
    @Override
    public void process(Page page) {
        pageProcessor.process(page);
    }

    /**
     * Returns the site of the internal page processor.
     *
     * @return the site configuration
     */
    @Override
    public Site getSite() {
        return pageProcessor.getSite();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy