All downloads are free. The search and download features use the official Maven repository.

com.github.houbb.special.chars.test.data.FhdqCrawlTest Maven / Gradle / Ivy

There is a newer version: 0.0.2
Show newest version
package com.github.houbb.special.chars.test.data;

import com.github.houbb.heaven.util.guava.Guavas;
import com.github.houbb.heaven.util.io.FileUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Ignore;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Manual crawling helpers for {@code http://www.fhdq.net}, written as JUnit tests.
 *
 * <p>The class is annotated with {@link Ignore}, so none of the tests run as part of a normal
 * build. Every test performs network requests through jsoup and most of them read or write
 * hard-coded Windows paths, so they are meant to be run by hand.
 *
 * <p>TODO: enhance, extend and actually use the crawl results.
 *
 * @author binbin.hou
 * @since 0.0.1
 */
@Ignore
public class FhdqCrawlTest {

    /** Root URL of the crawled site, without a trailing slash. */
    private static final String BASE_URL = "http://www.fhdq.net";

    /**
     * Fetches the home page and prints the text of every table row.
     *
     * @throws IOException if the home page cannot be fetched
     */
    @Test
    public void indexGetTest() throws IOException {
        Document document = Jsoup.connect(BASE_URL + "/").get();

        for (Element row : document.select("tbody > tr")) {
            System.out.println(row.text());
        }
    }

    /**
     * Fetches the home page, collects the {@code href} of every navigation link,
     * prints each one and writes them all to {@code index_navigation.txt}.
     *
     * @throws IOException if the home page cannot be fetched
     */
    @Test
    public void indexNavigationTest() throws IOException {
        Document document = Jsoup.connect(BASE_URL + "/").get();
        Elements links = document.select(".nav > ul > li > a");
        List<String> urls = Guavas.newArrayList(links.size());

        for (Element link : links) {
            final String url = link.attr("href");
            System.out.println(url);
            urls.add(url);
        }

        FileUtil.write("D:\\github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\index_navigation.txt", urls);
    }

    /**
     * Builds one index file per navigation section.
     *
     * <p>Each line of {@code index_navigation.txt} (except the first, which is skipped) is
     * expected to hold a section path and an offset separated by whitespace. An offset of
     * {@code 0} means the section has a single page; otherwise it has the form
     * {@code prefix,lastPage} and pages {@code list_prefix_1.html} to
     * {@code list_prefix_lastPage.html} are crawled.
     * NOTE(review): {@link #indexNavigationTest()} only writes the path column, so the offset
     * column is presumably added by hand - confirm.
     *
     * @throws IOException declared for compatibility; fetch failures surface as runtime exceptions
     * @since 0.0.1
     */
    @Test
    public void navIndexTest() throws IOException {
        List<String> lines = FileUtil.readAllLines("D:\\github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\index_navigation.txt");

        final String filePrefix = "D:\\github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\index\\index_nav_";

        // Starts at 1 on purpose: the first line of the navigation file is not crawled.
        for (int i = 1; i < lines.size(); i++) {
            String line = lines.get(i);
            String[] fields = line.trim().split("\\s+");
            if (fields.length < 2) {
                // Blank or incomplete line: nothing to crawl for it.
                continue;
            }

            final String sectionPath = fields[0];
            final String sectionUrl = BASE_URL + sectionPath;
            final String offset = fields[1];

            List<String> indexLines = Guavas.newArrayList();
            if ("0".equals(offset)) {
                // The section consists of the current page only.
                indexLines.addAll(getOnePageIndexList(sectionUrl));
            } else {
                String[] offsets = offset.split(",");
                if (offsets.length < 2) {
                    throw new IllegalArgumentException("Expected offset 'prefix,lastPage' but got: " + line);
                }
                String prefix = offsets[0];
                int endIndex = Integer.parseInt(offsets[1].trim());

                for (int page = 1; page <= endIndex; page++) {
                    String pageUrl = sectionUrl + String.format("list_%s_%d.html", prefix, page);
                    indexLines.addAll(getOnePageIndexList(pageUrl));
                }
            }

            String fileName = filePrefix + sectionPath.replace("/", "") + ".txt";
            FileUtil.write(fileName, indexLines);
        }
    }

    /**
     * Gets the index entries of a single page.
     *
     * <p>Every link under the first {@code .viewbox} element becomes one {@code text,href}
     * line. Commas are removed from the link text so the comma stays usable as a separator,
     * and pagination links (href starting with {@code list_}) are left out.
     *
     * @param page URL of the page to fetch
     * @return the index lines found on the page
     * @throws IllegalStateException if the page has no {@code .viewbox} element
     * @throws RuntimeException if the page cannot be fetched
     * @since 0.0.1
     */
    private List<String> getOnePageIndexList(final String page) {
        try {
            Document document = Jsoup.connect(page).get();
            Element viewbox = document.select(".viewbox").first();
            if (viewbox == null) {
                throw new IllegalStateException("No .viewbox element found on page: " + page);
            }

            Elements links = viewbox.select("ul > li > a");
            List<String> list = Guavas.newArrayList(links.size());
            for (Element link : links) {
                String href = link.attr("href");
                if (href.startsWith("list_")) {
                    // Pagination link, not a detail page.
                    continue;
                }

                String text = link.text().replace(",", "");
                list.add(text + "," + href);
            }

            return list;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Downloads the detail page of every entry in every index file.
     *
     * <p>Each regular file in the index directory yields a sub-directory of the detail
     * directory named after the part of the file name before the first dot.
     */
    @Test
    public void detailTest() {
        final String dirPath = "D:\\github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\index\\";

        final String detailPrefixDirPath = "D:\\github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\detail\\";
        File[] files = new File(dirPath).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalStateException("Index directory cannot be listed: " + dirPath);
        }

        for (File file : files) {
            if (!file.isFile()) {
                continue;
            }
            String name = file.getName().split("\\.")[0];

            List<String> lines = FileUtil.readAllLines(file);
            writeDetails(lines, detailPrefixDirPath + name + "\\");
        }
    }

    /**
     * Downloads the detail page of every entry in the single index file
     * {@code index_nav_wm_2.txt} into the {@code detail\wm\} directory.
     */
    @Test
    public void detail2Test() {
        File file = new File("D:\\_github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\index\\index_nav_wm_2.txt");
        final String detailPrefixDirPath = "D:\\_github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\detail\\wm\\";

        List<String> lines = FileUtil.readAllLines(file);
        writeDetails(lines, detailPrefixDirPath);
    }

    /**
     * Fetches the detail page of each {@code text,href} index line and writes its content to
     * {@code targetDir + text + ".txt"}, creating the target directory first when needed.
     *
     * @param lines     index lines in {@code text,href} form; lines without a comma are skipped
     * @param targetDir directory path ending with a path separator
     * @throws IllegalStateException if the target directory cannot be created
     */
    private void writeDetails(final List<String> lines, final String targetDir) {
        File dir = new File(targetDir);
        if (!dir.mkdirs() && !dir.isDirectory()) {
            throw new IllegalStateException("Cannot create directory: " + targetDir);
        }

        for (String line : lines) {
            // Limit 2 keeps any comma inside the href intact.
            String[] fields = line.split(",", 2);
            if (fields.length < 2) {
                continue;
            }

            String fileName = FileUtil.trimWindowsSpecialChars(fields[0]);
            String targetPath = targetDir + fileName + ".txt";

            final String content = getHtmlContent(BASE_URL + fields[1]);
            FileUtil.write(targetPath, content);
        }
    }

    /**
     * Gets the text of the first {@code .content} element of an HTML page.
     *
     * @param htmlP URL of the page to fetch
     * @return the text of the content element
     * @throws IllegalStateException if the page has no {@code .content} element
     * @throws RuntimeException if the page cannot be fetched
     * @since 0.0.1
     */
    private String getHtmlContent(final String htmlP) {
        try {
            Document document = Jsoup.connect(htmlP).get();
            Element content = document.select(".content").first();
            if (content == null) {
                throw new IllegalStateException("No .content element found on page: " + htmlP);
            }

            return content.text();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy