All downloads are free. Search and download functionality uses the official Maven repository.

com.github.dennisit.vplus.data.utils.SpiderUtils Maven / Gradle / Ivy

There is a newer version: 2.0.8
Show newest version
package com.github.dennisit.vplus.data.utils;

import com.google.common.collect.Lists;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import us.codecraft.webmagic.Page;

import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

/**
 * Static helpers and constants for crawling technical-blog sites with WebMagic.
 *
 * @deprecated this class is marked deprecated; no replacement is named in this file.
 */
@Deprecated
public class SpiderUtils {

    /**
     * Source sites an article can come from. Each constant carries a numeric
     * type, a display name and the site's home URL.
     */
    @Getter
    @AllArgsConstructor
    public enum FromEnum {

        CNBLOG(1, "博客园", "https://www.cnblogs.com/"),
        CSDN(2, "CSDN", "https://www.csdn.net/"),
        OSCHINA(3, "开源中国", "https://www.oschina.net/"),
        JUEJIN(4, "掘金", "https://juejin.im/"),
        W1CTO(5, "51CTO", "http://blog.51cto.com/");

        /** Numeric identifier of the source site. */
        private final int type;

        /** Display name of the source site. */
        private final String name;

        /** Home page URL of the source site. */
        private final String site;
    }

    /**
     * Regular expressions matching article URLs on the supported sites.
     */
    public static final class Regex {

        /**
         * Regex for OSChina technical-blog article URLs.
         */
        public static final String CLAW_REGEX_OSCHINA_ARTICLE = "https://my\\.oschina\\.net/[\\S]+/blog/\\d+";

        /**
         * Regex for CNBlogs article detail page URLs.
         */
        public static final String CLAW_REGEX_CNBLOG_ARTICLE = "https://www\\.cnblogs\\.com/[\\S]+/p/\\d+\\.html";

        /**
         * Regex for CSDN blog article detail page URLs
         * (the pattern matches {@code /article/details/<id>}, not list pages).
         */
        public static final String CLAW_REGEX_CSDN_ARTICLE = "https://blog\\.csdn\\.net/[\\S]+/article/details/\\d+";

        /**
         * Regex for Juejin post URLs.
         */
        public static final String CLAW_REGEX_JUEJIN_ARTICLE = "https://juejin\\.im/post/[\\S]+";

        /**
         * Regex for 51CTO blog article URLs.
         */
        public static final String CLAW_REGEX_51CTO_ARTICLE = "http://blog\\.51cto\\.com/[\\S]+/\\d+";
    }

    /**
     * Collects every link found inside the {@code <body>} of the page.
     *
     * @param page the crawled page model; must not be {@code null} because it is
     *             dereferenced directly
     * @return the links found in the body; a new empty list if the selector
     *         yields {@code null}, never {@code null} itself
     */
    public static List<String> getBodyLinks(Page page) {
        // orElseGet defers creating the fallback list until it is actually needed.
        return Optional.ofNullable(page.getHtml().xpath("//body").links().all())
                .orElseGet(Lists::newArrayList);
    }

    /**
     * Adds every link found in the page body to the page's target requests.
     * Nothing is added when the body contains no links.
     *
     * @param page the crawled page model; must not be {@code null}
     */
    public static void pushBodyLinks(Page page) {
        List<String> links = getBodyLinks(page);
        if (CollectionUtils.isNotEmpty(links)) {
            page.addTargetRequests(links);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy