uk.ac.shef.dcs.sti.parser.table.TableParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sti-main Show documentation
The newest version!
package uk.ac.shef.dcs.sti.parser.table;

import cern.colt.matrix.ObjectMatrix2D;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.commons.io.FileUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import uk.ac.shef.dcs.sti.STIException;
import uk.ac.shef.dcs.sti.core.model.Table;
import uk.ac.shef.dcs.sti.core.model.TContext;
import uk.ac.shef.dcs.sti.parser.table.hodetector.TableHODetector;
import uk.ac.shef.dcs.sti.parser.table.normalizer.TableNormalizer;
import uk.ac.shef.dcs.sti.parser.table.creator.TableObjCreator;
import uk.ac.shef.dcs.sti.parser.table.validator.TableValidator;

import java.io.*;
import java.util.List;

/**
 * Author: Ziqi Zhang ([email protected])
 * Date: 03/10/12
 * Time: 12:05
 * 
 * Abstract base class for extracting tables from certain RAW input strings.
 *
 * WARNING: this class should not be used to READ serialised Table objects; it is meant to read raw input and create Table objects from it.
 */
public abstract class TableParser {
    /** Normalizes a table node into a regular grid before header detection. */
    protected TableNormalizer normalizer;
    /** Detects header and orientation on the normalized grid. */
    protected TableHODetector hoDetector;
    /** Builds the {@link Table} object from the detected pre-table matrix. */
    protected TableObjCreator creator;
    /** Validators applied in order; the first one that rejects a table discards it. */
    protected TableValidator[] validators;

    /** Parser created by the most recent call to {@link #createDocument(String, String)}. */
    protected TagSoupParser parser;

    /**
     * Creates a parser that runs the given pipeline components.
     *
     * @param normalizer normalizes a table node into a regular grid
     * @param detector   detects header and orientation of the normalized grid
     * @param creator    creates the {@link Table} object
     * @param validators zero or more validators every extracted table must pass
     */
    public TableParser(TableNormalizer normalizer,
                       TableHODetector detector, TableObjCreator creator,
                       TableValidator... validators) {
        this.normalizer = normalizer;
        this.hoDetector = detector;
        this.creator = creator;
        this.validators = validators;
    }

    /**
     * Extracts tables from a raw input.
     *
     * NOTE(review): the return type is a raw {@code List}; the element type is presumably
     * {@link Table} (the type argument looks lost). Left unchanged here to keep the
     * signature identical for existing subclasses and callers; confirm and parameterize.
     *
     * @param input    the raw input to extract tables from
     * @param sourceId identifier of the source the input comes from
     * @return the extracted tables
     * @throws STIException if extraction fails
     */
    public abstract List extract(String input, String sourceId) throws STIException;


    /**
     * Processes a table element following the basic principles:
     * 1. normalize the table element into a regular n x m table
     * 2. find header and orientation of the table
     * 3. create the Table object from the detected structure
     * 4. validate the extracted table with every configured validator
     *
     * Examples of tables that will be discarded by this method (in addition to the
     * table validator rules):
     * tables that only contain images but no texts;
     * tables that only have "tr" which has no "td".
     *
     * @param tableNode must be the {@code <table>} element
     * @param tableId   identifier given to the created table
     * @param sourceId  identifier of the source the table comes from
     * @param contexts  optional contexts passed through to the table creator
     * @return null if no valid table is extracted; the Table object otherwise
     */
    public Table extractTable(Node tableNode, String tableId, String sourceId, TContext... contexts) {
        // NOTE(review): element type restored as List<List<Node>> after the type arguments
        // were lost from this declaration; confirm against TableNormalizer#normalize.
        List<List<Node>> norm = normalizer.normalize(tableNode);
        if (norm == null || norm.isEmpty()) {
            return null;
        }
        ObjectMatrix2D preTable = hoDetector.detect(norm);
        Table table = creator.create(preTable, tableId, sourceId, contexts);
        for (TableValidator tv : validators) {
            if (!tv.validate(table)) {
                return null;
            }
        }
        return table;
    }


    /**
     * Writes a table to {@code targetDir} using Java object serialization.
     * The file name is the table's source id with every character that is not a digit or
     * word character replaced by "_", followed by "_" and the table id.
     *
     * @param table     the table to serialize
     * @param targetDir directory to write into; created if it does not exist
     * @throws IOException if the directory cannot be created or the file cannot be written
     */
    public static void serialize(Table table, String targetDir) throws IOException {
        File dir = new File(targetDir);
        // Re-check isDirectory() after mkdirs() so a concurrent creation is not reported as failure.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create target directory: " + dir);
        }
        String filename = targetDir + File.separator
                + table.getSourceId().replaceAll("[^\\d\\w]", "_") + "_" + table.getTableId();

        // try-with-resources closes both streams even when writeObject fails.
        try (FileOutputStream fileOut = new FileOutputStream(filename);
             ObjectOutputStream out = new ObjectOutputStream(fileOut)) {
            out.writeObject(table);
        }
    }

    /**
     * Reads a table previously written by {@link #serialize(Table, String)}.
     *
     * NOTE(review): this uses native Java deserialization; only call it on files from a
     * trusted location.
     *
     * @param filename path of the serialized table file
     * @return the deserialized table
     * @throws IOException            if the file cannot be read
     * @throws ClassNotFoundException if a class of the serialized object cannot be found
     */
    public static Table deserialize(String filename) throws IOException, ClassNotFoundException {
        // try-with-resources closes both streams even when readObject fails.
        try (FileInputStream fileIn = new FileInputStream(filename);
             ObjectInputStream in = new ObjectInputStream(fileIn)) {
            return (Table) in.readObject();
        }
    }

    /**
     * Reads the file at {@code inFile} and parses it into a DOM document. The created
     * parser is kept in {@link #parser}.
     *
     * The file bytes are handed to the parser unchanged, with "UTF-8" declared as the
     * encoding, so the result does not depend on the platform default charset.
     *
     * @param inFile   path of the file to parse
     * @param sourceId identifier passed to the parser for this document
     * @return the parsed document
     * @throws STIException if the file cannot be read or the DOM cannot be built
     */
    protected Document createDocument(String inFile, String sourceId) throws STIException {
        try {
            byte[] raw = FileUtils.readFileToByteArray(new File(inFile));
            parser = new TagSoupParser(new ByteArrayInputStream(raw), sourceId, "UTF-8");
            return parser.getDOM();
        } catch (IOException e) {
            // Report parse failures the same way as read failures instead of returning null.
            throw new STIException(e);
        }
    }
}
    

    

    
            
    
            

    
        
            
                Related Artifacts
                
                     mysql-connector-java mysql
 facebook-messenger com.github.codedrinker
 selenium-java org.seleniumhq.selenium
 instagram-java com.github.sola92
 gson com.google.code.gson
 poi org.apache.poi
 httpclient org.apache.httpcomponents
 json org.json
 facebook-java-api com.google.code.facebook-java-api
 poi-ooxml org.apache.poi
 jackson-databind com.fasterxml.jackson.core
 junit junit
 primefaces org.primefaces
 ojdbc7 com.github.noraui
 jfoenix com.jfoenix
 testng org.testng
 json-simple com.googlecode.json-simple
 selenium-server org.seleniumhq.selenium
 itextpdf com.itextpdf
 spring-core org.springframework
                
            
        
        
            
                Related Groups
                
                     org.springframework
 org.apache.poi
 org.hibernate
 org.springframework.boot
 com.fasterxml.jackson.core
 com.itextpdf
 org.seleniumhq.selenium
 mysql
 org.finos.legend.engine
 org.apache.httpcomponents
 org.apache.logging.log4j
 org.openjfx
 org.apache.commons
 org.json
 com.google.guava
 com.google.zxing
 net.sf.jasperreports
 javax.xml.bind
 ojdbc
 com.google.code.facebook-java-api
                
            
        
    
    





    © 2015 - 2024 Weber Informatics LLC | Privacy Policy