// com.github.houbb.special.chars.test.data.FhdqCrawlTest Maven / Gradle / Ivy
package com.github.houbb.special.chars.test.data;
import com.github.houbb.heaven.util.guava.Guavas;
import com.github.houbb.heaven.util.io.FileUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Ignore;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
 * Manual crawler for the site fhdq.net, written as a set of JUnit tests.
 *
 * <p>The class is annotated with {@code @Ignore}: every test here performs live HTTP
 * requests and reads/writes files under hard-coded Windows paths, so the tests are meant
 * to be run by hand, one step at a time:
 * <ol>
 *   <li>{@link #indexNavigationTest()} writes the navigation links to a file;</li>
 *   <li>{@link #navIndexTest()} reads that file and writes one index file per navigation entry;</li>
 *   <li>{@link #detailTest()} / {@link #detail2Test()} read index files and write one text
 *       file per detail page.</li>
 * </ol>
 *
 * <p>TODO: enhance, extend and make use of the crawl.
 *
 * @author binbin.hou
 * @since 0.0.1
 */
@Ignore
public class FhdqCrawlTest {

    /** Site root without a trailing slash; site-relative paths are appended to it. */
    private static final String SITE_ROOT = "http://www.fhdq.net";

    /**
     * Fetches the home page and prints the text of every {@code tbody > tr} row.
     *
     * @throws IOException if the home page cannot be fetched
     */
    @Test
    public void indexGetTest() throws IOException {
        final String index = SITE_ROOT + "/";
        Document document = Jsoup.connect(index).get();
        Elements elements = document.select("tbody > tr");
        for (Element element : elements) {
            System.out.println(element.text());
        }
    }

    /**
     * Fetches the home page, collects the {@code href} of every navigation link
     * ({@code .nav > ul > li > a}), prints each one and writes them all to
     * {@code index_navigation.txt}.
     *
     * @throws IOException if the home page cannot be fetched
     */
    @Test
    public void indexNavigationTest() throws IOException {
        final String index = SITE_ROOT + "/";
        Document document = Jsoup.connect(index).get();
        Elements elements = document.select(".nav > ul > li > a");
        List<String> urls = Guavas.newArrayList(elements.size());
        for (Element element : elements) {
            final String url = element.attr("href");
            System.out.println(url);
            urls.add(url);
        }
        FileUtil.write("D:\\github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\index_navigation.txt", urls);
    }

    /**
     * Builds the index file for each navigation entry.
     *
     * <p>Each line of {@code index_navigation.txt} is read as {@code "<path> <offset>"}, where
     * {@code <offset>} is either {@code 0} (the entry has a single page) or
     * {@code "<prefix>,<pageCount>"}, in which case the pages
     * {@code list_<prefix>_1.html .. list_<prefix>_<pageCount>.html} are crawled.
     * Blank or malformed lines are reported on {@code System.err} and skipped.
     *
     * @throws IOException declared for interface compatibility
     * @since 0.0.1
     */
    @Test
    public void navIndexTest() throws IOException {
        List<String> lines = FileUtil.readAllLines("D:\\github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\index_navigation.txt");
        final String filePrefix = "D:\\github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\index\\index_nav_";
        // The loop starts at 1, so the first line of the file is never processed.
        // NOTE(review): presumably that line is the home-page link -- confirm.
        for (int i = 1; i < lines.size(); i++) {
            String line = lines.get(i).trim();
            if (line.isEmpty()) {
                continue;
            }
            String[] strings = line.split(" ");
            if (strings.length < 2) {
                System.err.println("Skipping navigation line without offset: " + line);
                continue;
            }
            final String index = SITE_ROOT + strings[0];
            List<String> indexLines = Guavas.newArrayList();
            String offset = strings[1];
            if ("0".equals(offset)) {
                // Only the current single page exists.
                indexLines.addAll(getOnePageIndexList(index));
            } else {
                String[] offsets = offset.split(",");
                if (offsets.length < 2) {
                    System.err.println("Skipping navigation line with malformed offset: " + line);
                    continue;
                }
                String prefix = offsets[0];
                int endIndex = Integer.parseInt(offsets[1]);
                for (int j = 1; j <= endIndex; j++) {
                    String ix = index + String.format("list_%s_%d.html", prefix, j);
                    indexLines.addAll(getOnePageIndexList(ix));
                }
            }
            // Slashes are dropped so the navigation path can be used as a file name.
            String fileName = filePrefix + strings[0].replace("/", "") + ".txt";
            FileUtil.write(fileName, indexLines);
        }
    }

    /**
     * Collects the index entries of a single page.
     *
     * <p>Every {@code ul > li > a} link inside the first {@code .viewbox} element becomes one
     * {@code "<text>,<href>"} line; commas are removed from the text so the first comma is
     * always the separator. Links whose href starts with {@code list_} are skipped.
     *
     * @param page URL of the page to fetch
     * @return the index lines; empty when the page has no {@code .viewbox} element
     * @throws RuntimeException wrapping the {@link IOException} if the page cannot be fetched
     * @since 0.0.1
     */
    private List<String> getOnePageIndexList(final String page) {
        try {
            Document document = Jsoup.connect(page).get();
            Element viewbox = document.select(".viewbox").first();
            if (viewbox == null) {
                // first() yields null when nothing matched; there is nothing to index.
                return Guavas.newArrayList();
            }
            Elements elements = viewbox.select("ul > li > a");
            List<String> list = Guavas.newArrayList(elements.size());
            for (Element element : elements) {
                String href = element.attr("href");
                if (href.startsWith("list_")) {
                    continue;
                }
                String text = element.text().replace(",", "");
                list.add(text + "," + href);
            }
            return list;
        } catch (IOException e) {
            throw new RuntimeException("Failed to fetch index page: " + page, e);
        }
    }

    /**
     * Crawls the detail pages listed in every index file of the {@code index} directory.
     *
     * <p>For an index file {@code <name>.txt} the detail texts are written below
     * {@code detail\<name>\}. Entries of the directory that are not regular files are skipped;
     * nothing is done when the directory cannot be listed.
     */
    @Test
    public void detailTest() {
        final String dirPath = "D:\\github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\index\\";
        final String detailPrefixDirPath = "D:\\github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\detail\\";
        File dir = new File(dirPath);
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.err.println("Index directory cannot be listed: " + dirPath);
            return;
        }
        for (File file : files) {
            if (!file.isFile()) {
                continue;
            }
            // File name up to the first dot.
            String name = file.getName().split("\\.")[0];
            List<String> lines = FileUtil.readAllLines(file);
            crawlDetails(lines, detailPrefixDirPath + name + "\\");
        }
    }

    /**
     * Crawls the detail pages listed in the single index file {@code index_nav_wm_2.txt}
     * and writes the detail texts below {@code detail\wm\}.
     */
    @Test
    public void detail2Test() {
        File file = new File("D:\\_github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\index\\index_nav_wm_2.txt");
        final String detailPrefixDirPath = "D:\\_github\\special-char\\special-char-test\\src\\main\\resources\\fhdq\\detail\\wm\\";
        List<String> lines = FileUtil.readAllLines(file);
        crawlDetails(lines, detailPrefixDirPath);
    }

    /**
     * Downloads the detail page of every {@code "<text>,<href>"} index line and stores its
     * text as {@code <targetDirPath><text>.txt}.
     *
     * <p>The target directory is created when missing. Lines without an href and pages
     * without any content are reported on {@code System.err} and skipped.
     *
     * @param lines         index lines in {@code "<text>,<href>"} form
     * @param targetDirPath directory path, ending with a separator, that receives the files
     * @throws IllegalStateException if the target directory cannot be created
     */
    private void crawlDetails(final List<String> lines, final String targetDirPath) {
        File targetDir = new File(targetDirPath);
        if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
            throw new IllegalStateException("Cannot create directory: " + targetDirPath);
        }
        for (String line : lines) {
            // Limit 2: only the first comma separates text from href.
            String[] strings = line.split(",", 2);
            if (strings.length < 2 || strings[1].isEmpty()) {
                System.err.println("Skipping malformed index line: " + line);
                continue;
            }
            String fileName = FileUtil.trimWindowsSpecialChars(strings[0]);
            String targetP = targetDirPath + fileName + ".txt";
            String htmlP = SITE_ROOT + strings[1];
            final String content = getHtmlContent(htmlP);
            if (content.isEmpty()) {
                System.err.println("No content found at: " + htmlP);
                continue;
            }
            FileUtil.write(targetP, content);
        }
    }

    /**
     * Fetches a page and returns the text of its first {@code .content} element.
     *
     * @param htmlP URL of the page
     * @return the element's text, or an empty string when the page has no {@code .content} element
     * @throws RuntimeException wrapping the {@link IOException} if the page cannot be fetched
     * @since 0.0.1
     */
    private String getHtmlContent(final String htmlP) {
        try {
            Document document = Jsoup.connect(htmlP).get();
            Element content = document.select(".content").first();
            if (content == null) {
                return "";
            }
            return content.text();
        } catch (IOException e) {
            throw new RuntimeException("Failed to fetch detail page: " + htmlP, e);
        }
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy