uk.ac.shef.dcs.sti.TODO.TestSubjectColumnDetector Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sti-main Show documentation
The newest version!
package uk.ac.shef.dcs.sti.TODO;

import javafx.util.Pair;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.xml.sax.SAXException;
import uk.ac.shef.dcs.sti.STIException;
import uk.ac.shef.dcs.sti.core.subjectcol.SubjectColumnDetector;
import uk.ac.shef.dcs.sti.core.model.Table;
import uk.ac.shef.dcs.sti.parser.table.TableParserMusicBrainz;
import uk.ac.shef.dcs.sti.parser.table.normalizer.TableNormalizerSimple;
import uk.ac.shef.dcs.sti.parser.table.validator.TableValidatorGeneric;
import uk.ac.shef.dcs.sti.parser.table.hodetector.TableHODetectorByHTMLTag;
import uk.ac.shef.dcs.sti.parser.table.creator.TableObjCreatorMusicBrainz;
import uk.ac.shef.dcs.sti.util.FileUtils;

import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;

/**
 * Command-line driver that runs a {@link SubjectColumnDetector} over every file in a folder
 * and writes one CSV row per file to an output file. It also hosts two small helper utilities
 * for post-processing log and gold-standard files.
 *
 * <p><b>Status:</b> the code that builds the detector and its embedded Solr server is currently
 * commented out (see the todo in {@link #main(String[])}), so {@code main} stops with an
 * {@link IllegalStateException} until that wiring is restored.
 */
public class TestSubjectColumnDetector {

    /** Number of command-line arguments read by {@link #main(String[])}. */
    private static final int EXPECTED_ARG_COUNT = 5;

    /**
     * Extracts the first table from every file in the input folder, runs the subject column
     * detector on it, and writes a row {@code "<file>",<key>,<flag>} taken from the first
     * element of the detector's result list.
     *
     * <p>The helper utilities in this class were historically invoked from here with hard-coded
     * paths; call {@link #which_table_has_no_acronym_columns(String)} or
     * {@link #gs_rewrite(String, String)} directly if they are needed.
     *
     * @param args {@code [0]} input folder, {@code [1]} output CSV file, {@code [2]} NLP resource
     *             folder (must contain {@code stoplist.txt}), {@code [3]} cache folder (expected
     *             to hold {@code solr.xml}), {@code [4]} properties file
     * @throws IllegalArgumentException if fewer than five arguments are supplied
     * @throws IllegalStateException    if the detector has not been configured (always the case
     *                                  until the todo below is resolved)
     * @throws IOException              if the properties, stop list or input files cannot be
     *                                  read, or the input folder cannot be listed
     */
    public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException, STIException {
        if (args.length < EXPECTED_ARG_COUNT) {
            throw new IllegalArgumentException("Usage: TestSubjectColumnDetector "
                    + "<inFolder> <outFile> <nlpResources> <cacheFolder> <propertiesFile>");
        }
        String inFolder = args[0];
        String outFile = args[1];
        String nlpResources = args[2];
        String cacheFolder = args[3];

        // properties, stopWords and configFile are only consumed by the disabled wiring below;
        // they are kept so that code can be restored without further changes.
        Properties properties = new Properties();
        try (FileInputStream propertyStream = new FileInputStream(args[4])) {
            properties.load(propertyStream);
        }
        List<String> stopWords = FileUtils.readList(nlpResources + "/stoplist.txt", true);
        File configFile = new File(cacheFolder + File.separator + "solr.xml");

        //todo: this class will not work, the following code must be resumed and corrected
        /*CoreContainer container = new CoreContainer(cacheFolder,
                configFile);
        SolrServer server = new EmbeddedSolrServer(container, "collection1");
        SubjectColumnDetector finder = new SubjectColumnDetector(new TContentTContentRowRankerImpl(),
                IInf.class.getName(),
                new String[]{"0.0", "1", "0.01"},
                server,
                nlpResources, true, stopWords,
                MultiKeyStringSplitter.split(properties.getProperty("BING_API_KEYS")) //lodie

        );*/

        EmbeddedSolrServer server = null;
        SubjectColumnDetector finder = null;

        // Fail fast with an explanation instead of parsing every input file only to hit a
        // NullPointerException on each detector call and again when closing the server.
        if (finder == null) {
            throw new IllegalStateException("SubjectColumnDetector is not configured: restore the "
                    + "disabled Solr/detector wiring in TestSubjectColumnDetector.main before running");
        }

        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] inputFiles = new File(inFolder).listFiles();
        if (inputFiles == null) {
            throw new IOException("Cannot list input folder: " + inFolder);
        }
        // Process files in their natural (path) order so runs are reproducible.
        Arrays.sort(inputFiles);

        TableParserMusicBrainz xtractor = new TableParserMusicBrainz(new TableNormalizerSimple(),
                new TableHODetectorByHTMLTag(),
                new TableObjCreatorMusicBrainz(),
                new TableValidatorGeneric());

        try (PrintWriter p = new PrintWriter(outFile)) {
            int count = 0;
            for (File f : inputFiles) {
                count++;
                System.out.println(count + "_" + f);

                String fileContent = org.apache.any23.util.FileUtils.readFileContent(f);
                List<Table> tables = xtractor.extract(fileContent, f.toString());
                if (tables.isEmpty()) {
                    continue;
                }
                // Only the first extracted table of each file is evaluated.
                Table table = tables.get(0);

                try {
                    // NOTE(review): the type arguments were lost in this copy of the source and
                    // are reconstructed as Pair<Integer, Pair<Double, Boolean>> - confirm against
                    // SubjectColumnDetector.compute.
                    List<Pair<Integer, Pair<Double, Boolean>>> result = finder.compute(table);
                    Pair<Integer, Pair<Double, Boolean>> first = result.get(0);
                    p.println("\"" + f + "\"," + first.getKey() + "," + first.getValue().getValue());
                } catch (Exception e) {
                    // Best effort per file: report the failure with its cause and carry on.
                    System.err.println("FAILED:" + f);
                    System.err.println("  cause: " + e);
                }
            }
        } finally {
            if (server != null) {
                server.close();
            }
        }
        System.exit(0);
    }

    /**
     * Scans a log file and prints certain header lines to standard output.
     *
     * <p>A line containing {@code "_E:"} is treated as a header line; every other line sets a
     * flag that is reset at each header. A header line is printed when it is not the first
     * header and no non-header line occurred since the previous header.
     *
     * <p>NOTE(review): the line printed is the header that <em>follows</em> an empty section,
     * not the header of the empty section itself, and a trailing empty section is never
     * reported. This looks like an off-by-one with respect to the method name; behaviour is
     * left unchanged until the intent is confirmed.
     *
     * @param logFile path of the log file to scan
     * @throws IOException if the log file cannot be read
     */
    public static void which_table_has_no_acronym_columns(String logFile) throws IOException {
        boolean headerSeen = false;
        boolean contentSinceHeader = false;
        for (String line : FileUtils.readList(logFile, false)) {
            if (!line.contains("_E:")) {
                contentSinceHeader = true;
                continue;
            }
            if (headerSeen && !contentSinceHeader) {
                System.out.println(line);
            }
            contentSinceHeader = false;
            headerSeen = true;
        }
    }

    /**
     * Rewrites a gold-standard CSV so that each row refers to a bare file name instead of a
     * full path, and writes the rows sorted in natural string order.
     *
     * <p>Each input line is split at its last comma. The part before it is treated as a path
     * (backslashes and forward slashes are both accepted as separators); only the final path
     * segment is kept, with one surrounding double quote removed on each side if present. The
     * part after the comma is copied unchanged (trimmed). Output rows have the form
     * {@code "<fileName>",<value>}. Lines without a comma are reported on standard error and
     * skipped.
     *
     * @param inGsFile  path of the gold-standard file to read
     * @param outGsFile path of the rewritten file to create (overwritten if it exists)
     * @throws IOException if the input cannot be read or the output cannot be created
     */
    public static void gs_rewrite(String inGsFile, String outGsFile) throws IOException {
        List<String> lines = FileUtils.readList(inGsFile, false);
        List<String> newGs = new ArrayList<>(lines.size());
        for (String line : lines) {
            int lastComma = line.lastIndexOf(',');
            if (lastComma < 0) {
                // Previously this crashed with StringIndexOutOfBoundsException.
                System.err.println("Skipping gold standard line without a comma: " + line);
                continue;
            }
            String value = line.substring(lastComma + 1).trim();

            String path = line.substring(0, lastComma).trim().replace('\\', '/');
            String fileName = path.substring(path.lastIndexOf('/') + 1).trim();
            // A leading quote survives when the quoted path has no directory part; strip it
            // so the output is not double-quoted at the start.
            if (fileName.startsWith("\"")) {
                fileName = fileName.substring(1).trim();
            }
            if (fileName.endsWith("\"")) {
                fileName = fileName.substring(0, fileName.length() - 1).trim();
            }
            newGs.add("\"" + fileName + "\"," + value);
        }
        Collections.sort(newGs);
        try (PrintWriter p = new PrintWriter(outGsFile)) {
            for (String row : newGs) {
                p.println(row);
            }
        }
    }
}
    

    

    
            
    
            

    
        
            
                Related Artifacts
                
                     mysql-connector-java mysql
 facebook-messenger com.github.codedrinker
 selenium-java org.seleniumhq.selenium
 instagram-java com.github.sola92
 gson com.google.code.gson
 poi org.apache.poi
 httpclient org.apache.httpcomponents
 json org.json
 facebook-java-api com.google.code.facebook-java-api
 poi-ooxml org.apache.poi
 jackson-databind com.fasterxml.jackson.core
 junit junit
 primefaces org.primefaces
 ojdbc7 com.github.noraui
 jfoenix com.jfoenix
 testng org.testng
 json-simple com.googlecode.json-simple
 selenium-server org.seleniumhq.selenium
 itextpdf com.itextpdf
 spring-core org.springframework
                
            
        
        
            
                Related Groups
                
                     org.springframework
 org.apache.poi
 org.hibernate
 org.springframework.boot
 com.fasterxml.jackson.core
 com.itextpdf
 org.seleniumhq.selenium
 mysql
 org.finos.legend.engine
 org.apache.httpcomponents
 org.apache.logging.log4j
 org.openjfx
 org.apache.commons
 org.json
 com.google.guava
 com.google.zxing
 net.sf.jasperreports
 javax.xml.bind
 ojdbc
 com.google.code.facebook-java-api
                
            
        
    
    





    © 2015 - 2024 Weber Informatics LLC | Privacy Policy