// uk.ac.shef.dcs.sti.parser.table.TableParser (Maven / Gradle / Ivy)
// The newest version!
package uk.ac.shef.dcs.sti.parser.table;
import cern.colt.matrix.ObjectMatrix2D;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.commons.io.FileUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import uk.ac.shef.dcs.sti.STIException;
import uk.ac.shef.dcs.sti.core.model.Table;
import uk.ac.shef.dcs.sti.core.model.TContext;
import uk.ac.shef.dcs.sti.parser.table.hodetector.TableHODetector;
import uk.ac.shef.dcs.sti.parser.table.normalizer.TableNormalizer;
import uk.ac.shef.dcs.sti.parser.table.creator.TableObjCreator;
import uk.ac.shef.dcs.sti.parser.table.validator.TableValidator;
import java.io.*;
import java.util.List;
/**
* Author: Ziqi Zhang ([email protected])
* Date: 03/10/12
* Time: 12:05
*
* Abstract base class for extracting tables from certain RAW input strings.
*
* WARNING: this class is not meant for READING serialised Table objects; its purpose is to read raw input and create Table objects from it.
*/
public abstract class TableParser {

    /** Step 1 of {@link #extractTable}: turns a table node into a regular grid of cells. */
    protected TableNormalizer normalizer;
    /** Step 2 of {@link #extractTable}: finds the header and orientation of the normalized table. */
    protected TableHODetector hoDetector;
    /** Step 3 of {@link #extractTable}: builds the {@link Table} object from the detected layout. */
    protected TableObjCreator creator;
    /** Step 4 of {@link #extractTable}: every validator must accept a table for it to be returned. */
    protected TableValidator[] validators;
    /** Parser created by the most recent {@link #createDocument} call; not set by the constructor. */
    protected TagSoupParser parser;

    /**
     * Wires together the components used by {@link #extractTable}.
     *
     * @param normalizer normalizes a table node into rows of cells
     * @param detector   detects header and orientation of the normalized table
     * @param creator    creates the {@link Table} object
     * @param validators zero or more validators applied, in order, to each created table
     */
    public TableParser(TableNormalizer normalizer,
                       TableHODetector detector, TableObjCreator creator,
                       TableValidator... validators) {
        this.normalizer = normalizer;
        this.hoDetector = detector;
        this.creator = creator;
        this.validators = validators;
    }

    /**
     * Extracts tables from raw input. How {@code input} is interpreted is defined by the subclass.
     *
     * @param input    the raw input to extract tables from
     * @param sourceId identifier of the source the input belongs to
     * @return the extracted tables
     * @throws STIException if extraction fails
     */
    public abstract List<Table> extract(String input, String sourceId) throws STIException;

    /**
     * Processes a table element following the basic principles:
     * <ol>
     * <li>normalize the table element into a regular n x m table</li>
     * <li>find header and orientation of the table</li>
     * <li>extract text values in each table cell</li>
     * <li>validate the extracted table</li>
     * </ol>
     *
     * Examples of tables that will be discarded by this method (in addition to the
     * {@link TableValidator} rules): tables that only contain images but no texts;
     * tables that only have "tr" which has no "td".
     *
     * @param tableNode the table element to process
     * @param tableId   identifier passed on to the created table
     * @param sourceId  identifier of the source, passed on to the created table
     * @param contexts  optional contexts passed on to the created table
     * @return null if no valid table is extracted; the Table object otherwise
     */
    public Table extractTable(Node tableNode, String tableId, String sourceId, TContext... contexts) {
        List<List<Node>> norm = normalizer.normalize(tableNode);
        // Nothing usable came out of normalization: discard the table.
        if (norm == null || norm.isEmpty()) {
            return null;
        }

        ObjectMatrix2D preTable = hoDetector.detect(norm);
        Table table = creator.create(preTable, tableId, sourceId, contexts);

        // A single rejecting validator is enough to discard the table.
        for (TableValidator tv : validators) {
            if (!tv.validate(table)) {
                return null;
            }
        }
        return table;
    }

    /**
     * Writes {@code table} to {@code targetDir} using Java object serialization.
     * The file name is the table's source id (every character that is not a digit or
     * word character replaced by '_'), followed by '_' and the table id.
     *
     * @param table     the table to serialize
     * @param targetDir directory to write into; created if it does not exist
     * @throws IOException if the directory cannot be created or the file cannot be written
     */
    public static void serialize(Table table, String targetDir) throws IOException {
        File dir = new File(targetDir);
        // The second exists() check tolerates another thread/process creating the directory meanwhile.
        if (!dir.exists() && !dir.mkdirs() && !dir.exists()) {
            throw new IOException("Unable to create target directory: " + targetDir);
        }
        String filename = targetDir + File.separator
                + table.getSourceId().replaceAll("[^\\d\\w]", "_") + "_" + table.getTableId();
        // try-with-resources closes both streams even when writing fails.
        try (FileOutputStream fileOut = new FileOutputStream(filename);
             ObjectOutputStream out = new ObjectOutputStream(fileOut)) {
            out.writeObject(table);
        }
    }

    /**
     * Reads a {@link Table} previously written with Java object serialization.
     * NOTE(review): Java-native deserialization should only be applied to trusted files.
     *
     * @param filename path of the serialized table
     * @return the deserialized table
     * @throws IOException            if the file cannot be read
     * @throws ClassNotFoundException if a class of the serialized object graph cannot be found
     */
    public static Table deserialize(String filename) throws IOException, ClassNotFoundException {
        // try-with-resources closes both streams even when reading fails.
        try (FileInputStream fileIn = new FileInputStream(filename);
             ObjectInputStream in = new ObjectInputStream(fileIn)) {
            return (Table) in.readObject();
        }
    }

    /**
     * Parses the file {@code inFile} into a DOM document and stores the parser in {@link #parser}.
     * The file's bytes are handed to the parser unchanged and declared as UTF-8, so the
     * platform default charset plays no role.
     *
     * @param inFile   path of the file to parse
     * @param sourceId passed to the parser as the second constructor argument
     * @return the parsed document
     * @throws STIException if the file cannot be read or the parser fails to build the DOM
     */
    protected Document createDocument(String inFile, String sourceId) throws STIException {
        try {
            byte[] raw = FileUtils.readFileToByteArray(new File(inFile));
            parser = new TagSoupParser(new ByteArrayInputStream(raw), sourceId, "UTF-8");
            return parser.getDOM();
        } catch (IOException e) {
            // Report parse failures the same way as read failures instead of returning null.
            throw new STIException(e);
        }
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy