// us.codecraft.webmagic.pipeline.MultiPagePipeline (Maven / Gradle / Ivy)
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.DoubleKeyMap;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
* A pipeline combines the result in more than one page together.
* Used for news and articles containing more than one web page.
* MultiPagePipeline will store parts of object and output them when all parts are extracted.
*
* @author [email protected]
* @since 0.2.0
*/
@Experimental
public class MultiPagePipeline implements Pipeline {
private DoubleKeyMap pageMap = new DoubleKeyMap(ConcurrentHashMap.class);
private DoubleKeyMap objectMap = new DoubleKeyMap(ConcurrentHashMap.class);
@Override
public void process(ResultItems resultItems, Task task) {
Map resultItemsAll = resultItems.getAll();
Iterator> iterator = resultItemsAll.entrySet().iterator();
while (iterator.hasNext()) {
handleObject(iterator);
}
}
private void handleObject(Iterator> iterator) {
Map.Entry objectEntry = iterator.next();
Object o = objectEntry.getValue();
//需要拼凑
if (o instanceof MultiPageModel) {
MultiPageModel multiPageModel = (MultiPageModel) o;
//这次处理的部分,设置为完成
pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.FALSE);
//每个key单独加锁
synchronized (pageMap.get(multiPageModel.getPageKey())) {
pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.TRUE);
//其他需要拼凑的部分
if (multiPageModel.getOtherPages() != null) {
for (String otherPage : multiPageModel.getOtherPages()) {
Boolean aBoolean = pageMap.get(multiPageModel.getPageKey(), otherPage);
if (aBoolean == null) {
pageMap.put(multiPageModel.getPageKey(), otherPage, Boolean.FALSE);
}
}
}
//check if all pages are processed
Map booleanMap = pageMap.get(multiPageModel.getPageKey());
objectMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), multiPageModel);
if (booleanMap == null) {
return;
}
// /过滤,这次完成的page item中,还未拼凑完整的item,不进入下一个pipeline
for (Map.Entry stringBooleanEntry : booleanMap.entrySet()) {
if (!stringBooleanEntry.getValue()) {
iterator.remove();
return;
}
}
List> entryList = new ArrayList>();
entryList.addAll(objectMap.get(multiPageModel.getPageKey()).entrySet());
if (entryList.size() != 0) {
Collections.sort(entryList, new Comparator>() {
@Override
public int compare(Map.Entry o1, Map.Entry o2) {
try {
int i1 = Integer.parseInt(o1.getKey());
int i2 = Integer.parseInt(o2.getKey());
return i1 - i2;
} catch (NumberFormatException e) {
return o1.getKey().compareTo(o2.getKey());
}
}
});
// 合并
MultiPageModel value = entryList.get(0).getValue();
for (int i = 1; i < entryList.size(); i++) {
value = value.combine(entryList.get(i).getValue());
}
objectEntry.setValue(value);
}
}
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy