All Downloads are FREE. Search and download functionalities are using the official Maven repository.

us.codecraft.webmagic.pipeline.FilePipeline Maven / Gradle / Ivy

There is a newer version: 1.0.2
Show newest version
package us.codecraft.webmagic.pipeline;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.annotation.ThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Map;

/**
 * Store results in files.
* * @author [email protected]
* @since 0.1.0 */ @ThreadSafe public class FilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = LoggerFactory.getLogger(getClass()); /** * create a FilePipeline with default path"/data/webmagic/" */ public FilePipeline() { setPath("/data/webmagic/"); } public FilePipeline(String path) { setPath(path); } @Override public void process(ResultItems resultItems, Task task) { String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; try { PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { if (entry.getValue() instanceof Iterable) { Iterable value = (Iterable) entry.getValue(); printWriter.println(entry.getKey() + ":"); for (Object o : value) { printWriter.println(o); } } else { printWriter.println(entry.getKey() + ":\t" + entry.getValue()); } } printWriter.close(); } catch (IOException e) { logger.warn("write file error", e); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy