All downloads are free. Search and download functionality uses the official Maven repository.

us.codecraft.webmagic.pipeline.MultiPagePipeline Maven / Gradle / Ivy

package us.codecraft.webmagic.pipeline;

import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.DoubleKeyMap;

import java.util.*;
import java.util.concurrent.ConcurrentHashMap;

/**
 * A pipeline combines the result in more than one page together.
* Used for news and articles containing more than one web page.
* MultiPagePipeline will store parts of object and output them when all parts are extracted.
* * @author [email protected]
* @since 0.2.0 */ @Experimental public class MultiPagePipeline implements Pipeline { private DoubleKeyMap pageMap = new DoubleKeyMap(ConcurrentHashMap.class); private DoubleKeyMap objectMap = new DoubleKeyMap(ConcurrentHashMap.class); @Override public void process(ResultItems resultItems, Task task) { Map resultItemsAll = resultItems.getAll(); Iterator> iterator = resultItemsAll.entrySet().iterator(); while (iterator.hasNext()) { handleObject(iterator); } } private void handleObject(Iterator> iterator) { Map.Entry objectEntry = iterator.next(); Object o = objectEntry.getValue(); //需要拼凑 if (o instanceof MultiPageModel) { MultiPageModel multiPageModel = (MultiPageModel) o; //这次处理的部分,设置为完成 pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.FALSE); //每个key单独加锁 synchronized (pageMap.get(multiPageModel.getPageKey())) { pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.TRUE); //其他需要拼凑的部分 if (multiPageModel.getOtherPages() != null) { for (String otherPage : multiPageModel.getOtherPages()) { Boolean aBoolean = pageMap.get(multiPageModel.getPageKey(), otherPage); if (aBoolean == null) { pageMap.put(multiPageModel.getPageKey(), otherPage, Boolean.FALSE); } } } //check if all pages are processed Map booleanMap = pageMap.get(multiPageModel.getPageKey()); objectMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), multiPageModel); if (booleanMap == null) { return; } // /过滤,这次完成的page item中,还未拼凑完整的item,不进入下一个pipeline for (Map.Entry stringBooleanEntry : booleanMap.entrySet()) { if (!stringBooleanEntry.getValue()) { iterator.remove(); return; } } List> entryList = new ArrayList>(); entryList.addAll(objectMap.get(multiPageModel.getPageKey()).entrySet()); if (entryList.size() != 0) { Collections.sort(entryList, new Comparator>() { @Override public int compare(Map.Entry o1, Map.Entry o2) { try { int i1 = Integer.parseInt(o1.getKey()); int i2 = Integer.parseInt(o2.getKey()); return i1 - i2; } catch (NumberFormatException e) { 
return o1.getKey().compareTo(o2.getKey()); } } }); // 合并 MultiPageModel value = entryList.get(0).getValue(); for (int i = 1; i < entryList.size(); i++) { value = value.combine(entryList.get(i).getValue()); } objectEntry.setValue(value); } } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy