// uk.ac.shef.dcs.sti.parser.table.TableParserIMDB (Maven / Gradle / Ivy)
// The newest version!
package uk.ac.shef.dcs.sti.parser.table;
import org.apache.any23.extractor.html.DomUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import uk.ac.shef.dcs.sti.STIException;
import uk.ac.shef.dcs.sti.core.model.Table;
import uk.ac.shef.dcs.sti.core.model.TContext;
import uk.ac.shef.dcs.sti.parser.table.creator.TableObjCreatorIMDB;
import uk.ac.shef.dcs.sti.parser.table.hodetector.TableHODetector;
import uk.ac.shef.dcs.sti.parser.table.hodetector.TableHODetectorByHTMLTag;
import uk.ac.shef.dcs.sti.parser.table.normalizer.TableNormalizer;
import uk.ac.shef.dcs.sti.parser.table.creator.TableObjCreator;
import uk.ac.shef.dcs.sti.parser.table.context.TableContextExtractorIMDB;
import uk.ac.shef.dcs.sti.parser.table.normalizer.TableNormalizerDiscardIrregularRows;
import uk.ac.shef.dcs.sti.parser.table.validator.TableValidatorGeneric;
import uk.ac.shef.dcs.sti.parser.table.validator.TableValidator;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
* Parses the cast table found on IMDB movie pages, e.g.,
* http://www.imdb.com/title/tt0371746/
*
* See TableContextExtractorIMDB for how the context of the table is extracted.
*/
public class TableParserIMDB extends TableParser implements Browsable {

    /** XPath selecting every TABLE element whose class attribute is exactly "cast_list". */
    private static final String CAST_TABLE_XPATH = "//TABLE[@class='cast_list']";

    /**
     * Creates a parser with the default collaborators: a
     * {@link TableNormalizerDiscardIrregularRows} constructed with {@code true}, a
     * {@link TableHODetectorByHTMLTag}, a {@link TableObjCreatorIMDB} and a
     * {@link TableValidatorGeneric}.
     */
    public TableParserIMDB() {
        super(new TableNormalizerDiscardIrregularRows(true),
                new TableHODetectorByHTMLTag(),
                new TableObjCreatorIMDB(),
                new TableValidatorGeneric());
    }

    /**
     * Creates a parser with caller-supplied collaborators; all arguments are
     * forwarded unchanged to the {@link TableParser} constructor.
     *
     * @param normalizer the table normalizer to use
     * @param detector   the header/orientation detector to use
     * @param creator    the table object creator to use
     * @param validators the validators to use
     */
    public TableParserIMDB(TableNormalizer normalizer, TableHODetector detector, TableObjCreator creator, TableValidator... validators) {
        super(normalizer, detector, creator, validators);
    }

    /**
     * Extracts every {@code cast_list} table found in the given input.
     *
     * <p>The document is built with {@code createDocument(inFile, sourceId)}, the
     * matching TABLE nodes are located with {@link #CAST_TABLE_XPATH}, and each node
     * is handed to {@code extractTable} together with the contexts produced by
     * {@link TableContextExtractorIMDB}. Nodes for which {@code extractTable}
     * returns {@code null} are skipped.
     *
     * <p>Context extraction is best effort: if it throws an {@link STIException},
     * the stack trace is printed and the tables are extracted with no contexts.
     *
     * @param inFile   passed to {@code createDocument} as its first argument
     * @param sourceId passed to {@code createDocument} and {@code extractTable}, and
     *                 also wrapped in a {@link File} for the context extractor
     * @return the non-null tables produced by {@code extractTable}, in document
     *         order of the matched nodes; empty if nothing matched
     * @throws STIException declared by the overridden method; propagated from
     *                      {@code createDocument} or {@code extractTable} if they throw it
     */
    @Override
    public List<Table> extract(String inFile, String sourceId) throws STIException {
        Document doc = createDocument(inFile, sourceId);
        List<Node> tables = DomUtils.findAll(doc, CAST_TABLE_XPATH);

        List<TContext> contexts = new ArrayList<>();
        try {
            contexts = new TableContextExtractorIMDB().extract(new File(sourceId), doc);
        } catch (STIException e) {
            // Deliberately not rethrown: a failure to extract contexts must not
            // prevent the tables themselves from being extracted.
            e.printStackTrace();
        }

        List<Table> rs = new ArrayList<>();
        int tableCount = 0;
        for (Node n : tables) {
            tableCount++;
            // A fresh array per table, so no two extractTable calls share an array instance.
            TContext[] contextsArray = contexts.toArray(new TContext[0]);
            // The 1-based position of the node among the matches is passed as a string
            // (presumably used as the table id - confirm against TableParser.extractTable).
            Table table = extractTable(n, String.valueOf(tableCount),
                    sourceId, contextsArray);
            if (table != null) {
                rs.add(table);
            }
        }
        return rs;
    }

    /**
     * Browsable variant of the extraction: locates the same {@code cast_list}
     * tables as {@link #extract(String, String)}, passes them to
     * {@code BrowsableHelper.createBrowsableElements}, then calls
     * {@code BrowsableHelper.output(inFile, outputFolder, doc)}.
     *
     * <p>NOTE(review): presumably the helper annotates the document so that the
     * tables can be selected in a preview HTML file written to
     * {@code outputFolder}; confirm against {@code BrowsableHelper}.
     *
     * @param inFile       passed to {@code createDocument} and to {@code BrowsableHelper.output}
     * @param sourceId     passed to {@code createDocument}
     * @param outputFolder passed to {@code BrowsableHelper.output}
     * @return the list returned by {@code BrowsableHelper.createBrowsableElements}
     *         (named {@code xpaths} here)
     * @throws STIException declared by the overridden method; propagated from
     *                      {@code createDocument} if it throws it
     */
    @Override
    public List<String> extract(String inFile, String sourceId, String outputFolder) throws STIException {
        Document doc = createDocument(inFile, sourceId);
        List<Node> tables = DomUtils.findAll(doc, CAST_TABLE_XPATH);
        List<String> xpaths = BrowsableHelper.createBrowsableElements(tables, doc);
        BrowsableHelper.output(inFile, outputFolder, doc);
        return xpaths;
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy