// com.github.dennisit.vplus.data.utils.SpiderUtils (Maven / Gradle / Ivy)
package com.github.dennisit.vplus.data.utils;
import com.google.common.collect.Lists;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import us.codecraft.webmagic.Page;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
/**
 * Static helpers for blog crawlers built on WebMagic: an enumeration of the
 * supported source sites, URL regular expressions for article pages on those
 * sites, and utilities for collecting the links found in a fetched {@link Page}.
 *
 * @deprecated this class is marked {@code @Deprecated}; no replacement is named in this file.
 */
@Deprecated
public class SpiderUtils {

    /**
     * Source sites an article can come from. Each constant carries a numeric
     * type code, a display name and the site URL.
     */
    @Getter
    @AllArgsConstructor
    public enum FromEnum {
        CNBLOG(1, "博客园", "https://www.cnblogs.com/"),
        CSDN(2, "CSDN", "https://www.csdn.net/"),
        OSCHINA(3, "开源中国", "https://www.oschina.net/"),
        JUEJIN(4, "掘金", "https://juejin.im/"),
        // NOTE(review): presumably spelled W1CTO because a Java identifier cannot start with a digit.
        W1CTO(5, "51CTO", "http://blog.51cto.com/");

        /** Numeric code identifying the site. */
        private final int type;

        /** Display name of the site. */
        private final String name;

        /** URL of the site. */
        private final String site;
    }

    /**
     * Regular expressions describing article URLs on each supported site.
     */
    public static final class Regex {

        /**
         * OSChina blog article URL: {@code https://my.oschina.net/<user>/blog/<digits>}.
         */
        public static final String CLAW_REGEX_OSCHINA_ARTICLE = "https://my\\.oschina\\.net/[\\S]+/blog/\\d+";

        /**
         * Cnblogs article detail URL: {@code https://www.cnblogs.com/<user>/p/<digits>.html}.
         */
        public static final String CLAW_REGEX_CNBLOG_ARTICLE = "https://www\\.cnblogs\\.com/[\\S]+/p/\\d+\\.html";

        /**
         * CSDN article detail URL: {@code https://blog.csdn.net/<user>/article/details/<digits>}.
         */
        public static final String CLAW_REGEX_CSDN_ARTICLE = "https://blog\\.csdn\\.net/[\\S]+/article/details/\\d+";

        /**
         * Juejin post URL: {@code https://juejin.im/post/<id>}.
         */
        public static final String CLAW_REGEX_JUEJIN_ARTICLE = "https://juejin\\.im/post/[\\S]+";

        /**
         * 51CTO blog article URL: {@code http://blog.51cto.com/<user>/<digits>}.
         */
        public static final String CLAW_REGEX_51CTO_ARTICLE = "http://blog\\.51cto\\.com/[\\S]+/\\d+";
    }

    /**
     * Collects every link found under the {@code <body>} element of the page.
     *
     * @param page the fetched page model; must not be {@code null}
     * @return the links extracted from the page body, or a new empty list when
     *         the selector yields {@code null}; never {@code null}
     */
    public static List<String> getBodyLinks(Page page) {
        List<String> links = page.getHtml().xpath("//body").links().all();
        if (links == null) {
            // Keep the original null fallback, but only allocate it when it is actually needed.
            return Lists.newArrayList();
        }
        return links;
    }

    /**
     * Passes every link found in the page body to {@link Page#addTargetRequests}.
     * Nothing is added when the page body contains no links.
     *
     * @param page the fetched page model; must not be {@code null}
     */
    public static void pushBodyLinks(Page page) {
        List<String> links = getBodyLinks(page);
        if (CollectionUtils.isNotEmpty(links)) {
            page.addTargetRequests(links);
        }
    }
}