tech.tablesaw.io.html.HtmlReader Maven / Gradle / Ivy
package tech.tablesaw.io.html;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;
import tech.tablesaw.api.Table;
import tech.tablesaw.io.DataReader;
import tech.tablesaw.io.ReaderRegistry;
import tech.tablesaw.io.Source;
import tech.tablesaw.io.TableBuildingUtils;
/**
 * Reads a single HTML {@code <table>} element into a {@link Table}.
 *
 * <p>When this class is initialized, a shared instance is registered with {@code
 * Table.defaultReaderRegistry} for the {@code html} file extension, the {@code text/html} MIME type
 * and the {@link HtmlReadOptions} options class.
 */
public class HtmlReader implements DataReader<HtmlReadOptions> {

  private static final HtmlReader INSTANCE = new HtmlReader();

  static {
    register(Table.defaultReaderRegistry);
  }

  /**
   * Registers the shared reader instance with the given registry.
   *
   * @param registry the registry that should route HTML sources to this reader
   */
  public static void register(ReaderRegistry registry) {
    registry.registerExtension("html", INSTANCE);
    registry.registerMimeType("text/html", INSTANCE);
    registry.registerOptions(HtmlReadOptions.class, INSTANCE);
  }

  /**
   * Parses the HTML document described by {@code options} and converts one of its tables.
   *
   * <p>If the document contains exactly one table, that table is used and {@code
   * options.tableIndex()} is not consulted. Otherwise {@code options.tableIndex()} selects the
   * table. A table without any {@code <tr>} rows yields an empty table carrying only the configured
   * table name.
   *
   * @param options the source, table name, header flag and optional table index
   * @return the table built from the selected HTML table
   * @throws IOException if the underlying source cannot be read
   * @throws IllegalStateException if the document does not contain exactly one table and no table
   *     index was supplied
   * @throws IndexOutOfBoundsException if the supplied table index does not address a table in the
   *     document
   */
  @Override
  public Table read(HtmlReadOptions options) throws IOException {
    Document doc = parseDocument(options);
    Element htmlTable = selectTable(doc.select("table"), options.tableIndex());
    List<String[]> rows = extractRows(htmlTable);

    if (rows.isEmpty()) {
      return Table.create(options.tableName());
    }

    List<String> columnNames = new ArrayList<>();
    if (options.header()) {
      // The first row supplies the column names and is not part of the data.
      for (String name : rows.remove(0)) {
        columnNames.add(name);
      }
    } else {
      // No header row: generate C0, C1, ... sized by the first data row.
      int columnCount = rows.get(0).length;
      for (int i = 0; i < columnCount; i++) {
        columnNames.add("C" + i);
      }
    }
    return TableBuildingUtils.build(columnNames, rows, options);
  }

  /**
   * Reads the table from {@code source} using default {@link HtmlReadOptions}.
   *
   * @param source the HTML source
   * @return the table built from the source
   * @throws IOException if the underlying source cannot be read
   */
  @Override
  public Table read(Source source) throws IOException {
    return read(HtmlReadOptions.builder(source).build());
  }

  /** Parses the source into a jsoup document, preferring the raw input stream when one exists. */
  private static Document parseDocument(HtmlReadOptions options) throws IOException {
    InputStream inputStream = options.source().inputStream();
    if (inputStream != null) {
      // Reader must support mark, so can't use InputStreamReader.
      // Parse the InputStream directly; the null charset leaves encoding detection to jsoup.
      return Jsoup.parse(inputStream, null, "");
    }
    return Parser.htmlParser().parseInput(options.source().createReader(null), "");
  }

  /** Picks the table to read, or throws when the choice is missing, ambiguous or out of range. */
  private static Element selectTable(Elements tables, Integer requestedIndex) {
    if (tables.size() == 1) {
      // A lone table is unambiguous, so any requested index is ignored.
      return tables.get(0);
    }
    if (requestedIndex == null) {
      if (tables.isEmpty()) {
        throw new IllegalStateException(
            "0 tables found. The document does not contain an html table to read from.");
      }
      throw new IllegalStateException(
          tables.size()
              + " tables found. When more than one html table is present on the page you must specify the index of the table to read from.");
    }
    if (requestedIndex < 0 || requestedIndex >= tables.size()) {
      throw new IndexOutOfBoundsException(
          "Table index outside bounds. The URL has " + tables.size() + " tables");
    }
    return tables.get(requestedIndex);
  }

  /** Converts every {@code <tr>} selected under the table into an array of cell texts. */
  private static List<String[]> extractRows(Element htmlTable) {
    List<String[]> rows = new ArrayList<>();
    for (Element row : htmlTable.select("tr")) {
      Elements headerCells = row.getElementsByTag("th");
      Elements cells = row.getElementsByTag("td");
      // Within a row, all <th> texts are emitted before all <td> texts.
      String[] nextLine =
          Stream.concat(headerCells.stream(), cells.stream())
              .map(Element::text)
              .toArray(String[]::new);
      rows.add(nextLine);
    }
    return rows;
  }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy