uk.ac.shef.dcs.sti.parser.table.TableParserRottenTomato Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sti-main Show documentation
The newest version!
package uk.ac.shef.dcs.sti.parser.table;

import org.apache.any23.extractor.html.DomUtils;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.commons.io.FileUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import uk.ac.shef.dcs.sti.STIEnum;
import uk.ac.shef.dcs.sti.STIException;
import uk.ac.shef.dcs.sti.core.model.TCell;
import uk.ac.shef.dcs.sti.core.model.TColumnHeader;
import uk.ac.shef.dcs.sti.core.model.TContext;
import uk.ac.shef.dcs.sti.core.model.Table;
import uk.ac.shef.dcs.sti.parser.table.creator.TableObjCreatorGoodreads;
import uk.ac.shef.dcs.sti.parser.table.hodetector.TableHODetector;
import uk.ac.shef.dcs.sti.parser.table.hodetector.TableHODetectorByHTMLTag;
import uk.ac.shef.dcs.sti.parser.table.normalizer.TableNormalizer;
import uk.ac.shef.dcs.sti.parser.table.creator.TableObjCreator;
import uk.ac.shef.dcs.sti.parser.table.context.TableContextExtractorGeneric;
import uk.ac.shef.dcs.sti.parser.table.normalizer.TableNormalizerDiscardIrregularRows;
import uk.ac.shef.dcs.sti.parser.table.validator.TableValidatorGeneric;
import uk.ac.shef.dcs.sti.parser.table.validator.TableValidator;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Parses a locally saved HTML page and turns its cast list into single-column {@link Table}s.
 *
 * <p>Every {@code DIV} whose id is {@code cast-info} and that has a direct {@code UL} child yields
 * one table: one row per {@code LI} matched under that {@code UL}, and a single cell per row
 * holding the text of the first {@code DIV/A} link found inside the item (empty string when the
 * item has no such link).
 *
 * <p>Note that {@link #extract(String, String)} builds the tables directly from the DOM; it does
 * not itself call the normalizer, header detector, creator or validators handed to the
 * constructors.
 *
 * <p>Originally created by zqz on 13/06/14.
 */
public class TableParserRottenTomato extends TableParser {

    /** XPath selecting the container element(s) that hold the cast list. */
    private static final String CAST_INFO_XPATH = "//DIV[@id='cast-info']";

    /** XPath, relative to a cast list item, selecting the link whose text becomes the cell value. */
    private static final String CAST_LINK_XPATH = "DIV/A";

    /** Encoding announced to the HTML parser for the page bytes. */
    private static final String PAGE_ENCODING = "UTF-8";

    /**
     * Creates a parser wired with the default collaborators.
     *
     * <p>NOTE(review): the default object creator is {@link TableObjCreatorGoodreads}; confirm
     * that reusing the Goodreads creator here is intentional.
     */
    public TableParserRottenTomato() {
        super(new TableNormalizerDiscardIrregularRows(true),
                new TableHODetectorByHTMLTag(),
                new TableObjCreatorGoodreads(),
                new TableValidatorGeneric());
    }

    /**
     * Creates a parser with caller-supplied collaborators, all forwarded to {@link TableParser}.
     *
     * @param normalizer table normalizer passed to the superclass
     * @param detector   header/orientation detector passed to the superclass
     * @param creator    table object creator passed to the superclass
     * @param validators validators passed to the superclass
     */
    public TableParserRottenTomato(TableNormalizer normalizer, TableHODetector detector, TableObjCreator creator, TableValidator... validators) {
        super(normalizer, detector, creator, validators);
    }

    /**
     * Reads the HTML file at {@code inFile} and extracts one table per cast list found in it.
     *
     * @param inFile   path of the HTML file to read
     * @param sourceId identifier of the page; used as the document URI for the HTML parser, as
     *                 the path handed to the context extractor, and for both identifier
     *                 arguments of each created {@link Table}
     * @return the extracted tables; empty when the page cannot be parsed into a DOM or contains
     *         no usable cast list
     * @throws STIException if {@code inFile} cannot be read
     */
    @Override
    public List<Table> extract(String inFile, String sourceId) throws STIException {
        // Hand the raw file bytes to the parser. Decoding to a String with the platform default
        // charset and re-encoding it could corrupt bytes before the parser reads them as UTF-8.
        byte[] html;
        try {
            html = FileUtils.readFileToByteArray(new File(inFile));
        } catch (IOException e) {
            throw new STIException(e);
        }

        List<Table> rs = new ArrayList<>();
        parser = new TagSoupParser(new ByteArrayInputStream(html), sourceId, PAGE_ENCODING);
        Document doc;
        try {
            doc = parser.getDOM();
        } catch (IOException e) {
            // Best effort: a page that cannot be turned into a DOM simply yields no tables.
            return rs;
        }

        List<Node> castBlocks = DomUtils.findAll(doc, CAST_INFO_XPATH);
        if (castBlocks.isEmpty()) {
            return rs;
        }

        List<TContext> contexts = extractContexts(sourceId, doc);
        for (Node castBlock : castBlocks) {
            Node ul = firstChildNamed(castBlock, "UL");
            if (ul == null) {
                continue; // no list directly under this cast-info block
            }
            rs.add(buildCastTable(sourceId, ul, contexts));
        }
        return rs;
    }

    /** Extracts the page-level contexts; a failure is reported and results in no contexts. */
    private List<TContext> extractContexts(String sourceId, Document doc) {
        try {
            return new TableContextExtractorGeneric().extract(new File(sourceId), doc);
        } catch (STIException e) {
            // Contexts are optional extras: report the problem and carry on without them.
            e.printStackTrace();
            return new ArrayList<>();
        }
    }

    /** Returns the first direct child of {@code parent} with the given node name, or null. */
    private static Node firstChildNamed(Node parent, String nodeName) {
        NodeList children = parent.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (nodeName.equals(child.getNodeName())) {
                return child;
            }
        }
        return null;
    }

    /** Builds the one-column table for a cast list, one row per {@code LI} matched under {@code ul}. */
    private static Table buildCastTable(String sourceId, Node ul, List<TContext> contexts) {
        List<Node> items = DomUtils.findAll(ul, "LI");
        Table table = new Table(sourceId, sourceId, items.size(), 1);
        for (TContext context : contexts) {
            table.addContext(context);
        }

        table.setColumnHeader(0, new TColumnHeader(STIEnum.TABLE_HEADER_UNKNOWN.getValue()));
        for (int row = 0; row < items.size(); row++) {
            table.setContentCell(row, 0, new TCell(linkText(items.get(row))));
        }
        return table;
    }

    /** Returns the text of the first {@code DIV/A} link inside {@code item}, or "" when absent. */
    private static String linkText(Node item) {
        // An item without a matching link gives an empty result list, so test for emptiness
        // instead of calling get(0), which would throw IndexOutOfBoundsException.
        List<Node> links = DomUtils.findAll(item, CAST_LINK_XPATH);
        return links.isEmpty() ? "" : links.get(0).getTextContent();
    }
}
    

    

    
            
    
            

    
        
            
                Related Artifacts
                
                     mysql-connector-java mysql
 facebook-messenger com.github.codedrinker
 selenium-java org.seleniumhq.selenium
 instagram-java com.github.sola92
 gson com.google.code.gson
 poi org.apache.poi
 httpclient org.apache.httpcomponents
 json org.json
 facebook-java-api com.google.code.facebook-java-api
 poi-ooxml org.apache.poi
 jackson-databind com.fasterxml.jackson.core
 junit junit
 primefaces org.primefaces
 ojdbc7 com.github.noraui
 jfoenix com.jfoenix
 testng org.testng
 json-simple com.googlecode.json-simple
 selenium-server org.seleniumhq.selenium
 itextpdf com.itextpdf
 spring-core org.springframework
                
            
        
        
            
                Related Groups
                
                     org.springframework
 org.apache.poi
 org.hibernate
 org.springframework.boot
 com.fasterxml.jackson.core
 com.itextpdf
 org.seleniumhq.selenium
 mysql
 org.finos.legend.engine
 org.apache.httpcomponents
 org.apache.logging.log4j
 org.openjfx
 org.apache.commons
 org.json
 com.google.guava
 com.google.zxing
 net.sf.jasperreports
 javax.xml.bind
 ojdbc
 com.google.code.facebook-java-api
                
            
        
    
    





    © 2015 - 2024 Weber Informatics LLC | Privacy Policy