All downloads are FREE. The search and download functionality uses the official Maven repository.
Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download/modify it how often you want.
uk.ac.shef.dcs.sti.TODO.TestSubjectColumnDetector Maven / Gradle / Ivy
package uk.ac.shef.dcs.sti.TODO;
import javafx.util.Pair;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.xml.sax.SAXException;
import uk.ac.shef.dcs.sti.STIException;
import uk.ac.shef.dcs.sti.core.subjectcol.SubjectColumnDetector;
import uk.ac.shef.dcs.sti.core.model.Table;
import uk.ac.shef.dcs.sti.parser.table.TableParserMusicBrainz;
import uk.ac.shef.dcs.sti.parser.table.normalizer.TableNormalizerSimple;
import uk.ac.shef.dcs.sti.parser.table.validator.TableValidatorGeneric;
import uk.ac.shef.dcs.sti.parser.table.hodetector.TableHODetectorByHTMLTag;
import uk.ac.shef.dcs.sti.parser.table.creator.TableObjCreatorMusicBrainz;
import uk.ac.shef.dcs.sti.util.FileUtils;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
/**
 * Command-line driver that runs a {@link SubjectColumnDetector} over the first table extracted
 * from every file of a folder and writes one CSV line per table, plus two small utilities for
 * post-processing the log and gold-standard files used alongside it.
 *
 * <p>NOTE: the detector construction is currently disabled (see the todo in {@link #main}), so
 * {@code main} fails fast until that code is restored.
 */
public class TestSubjectColumnDetector {

    private static final String USAGE =
            "Usage: TestSubjectColumnDetector <inFolder> <outFile> <nlpResources> <cacheFolder> <propertiesFile>";

    /**
     * Extracts the first table from each file in {@code args[0]} (files are processed in sorted
     * order), runs the subject column detector on it and appends
     * {@code "<file>",<key>,<inner value>} of the first result entry to the file {@code args[1]}.
     *
     * @param args {@code [0]} input folder, {@code [1]} output CSV file, {@code [2]} NLP resources
     *             folder (must contain {@code stoplist.txt}), {@code [3]} cache folder (expected to
     *             hold {@code solr.xml}), {@code [4]} properties file
     * @throws IllegalArgumentException if fewer than five arguments are given
     * @throws IllegalStateException    if the detector has not been constructed
     * @throws IOException              if an input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException, STIException {
        if (args.length < 5) {
            throw new IllegalArgumentException(USAGE);
        }
        String inFolder = args[0];
        String outFile = args[1];
        String nlpResources = args[2];
        String cacheFolder = args[3];

        // properties, stopWords and configFile are only consumed by the disabled detector
        // construction below; they are still loaded so that missing inputs are reported early.
        Properties properties = new Properties();
        try (FileInputStream propertiesIn = new FileInputStream(args[4])) {
            properties.load(propertiesIn);
        }
        List<String> stopWords = FileUtils.readList(nlpResources + "/stoplist.txt", true);
        File configFile = new File(cacheFolder + File.separator + "solr.xml");

        //todo: this class will not work, the following code must be resumed and corrected
        /*CoreContainer container = new CoreContainer(cacheFolder,
                configFile);
        SolrServer server = new EmbeddedSolrServer(container, "collection1");
        SubjectColumnDetector finder = new SubjectColumnDetector(new TContentTContentRowRankerImpl(),
                IInf.class.getName(),
                new String[]{"0.0", "1", "0.01"},
                server,
                nlpResources, true, stopWords,
                MultiKeyStringSplitter.split(properties.getProperty("BING_API_KEYS")) //lodie
        );*/
        EmbeddedSolrServer server = null;
        SubjectColumnDetector finder = null;

        // Without a detector every table would be reported as FAILED and the run would end in a
        // NullPointerException; stop here, before the output file is truncated.
        if (finder == null) {
            throw new IllegalStateException(
                    "SubjectColumnDetector is not initialised: restore the detector construction in main() first");
        }

        File[] inputFiles = new File(inFolder).listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read
            throw new IOException("Cannot list files of input folder: " + inFolder);
        }
        Arrays.sort(inputFiles);

        TableParserMusicBrainz xtractor = new TableParserMusicBrainz(new TableNormalizerSimple(),
                new TableHODetectorByHTMLTag(),
                new TableObjCreatorMusicBrainz(),
                new TableValidatorGeneric());

        try (PrintWriter p = new PrintWriter(outFile)) {
            int count = 0;
            for (File f : inputFiles) {
                count++;
                System.out.println(count + "_" + f);

                String fileContent = org.apache.any23.util.FileUtils.readFileContent(f);
                List<Table> tables = xtractor.extract(fileContent, f.toString());
                if (tables.isEmpty()) {
                    continue;
                }
                Table table = tables.get(0);

                try {
                    // The concrete type arguments are not visible in this file; only the
                    // Pair-of-Pair shape used below is relied on.
                    List<? extends Pair<?, ? extends Pair<?, ?>>> result = finder.compute(table);
                    p.println("\"" + f + "\"," + result.get(0).getKey() + "," + result.get(0).getValue().getValue());
                } catch (Exception e) {
                    // Best effort per file: report the failure with its cause and carry on.
                    System.err.println("FAILED:" + f + " -> " + e);
                }
            }
        } finally {
            if (server != null) {
                server.close();
            }
        }
        System.exit(0);
    }

    /**
     * Scans a log file and prints the header line of every section that contains no further
     * lines. A line containing {@code "_E:"} starts a new section; any other line marks the
     * current section as having an acronym entry.
     *
     * @param logFile path of the log file to scan
     * @throws IOException if the log file cannot be read
     */
    public static void which_table_has_no_acronym_columns(String logFile) throws IOException {
        String currentHeader = null;
        boolean hasAcronym = false;
        for (String line : FileUtils.readList(logFile, false)) {
            if (line.contains("_E:")) {
                // A new section starts: report the section that just ended, not the new one.
                if (currentHeader != null && !hasAcronym) {
                    System.out.println(currentHeader);
                }
                currentHeader = line;
                hasAcronym = false;
            } else {
                hasAcronym = true;
            }
        }
        // The last section is not followed by another header, so check it explicitly.
        if (currentHeader != null && !hasAcronym) {
            System.out.println(currentHeader);
        }
    }

    /**
     * Rewrites a gold-standard file whose lines have the form {@code <path>,<value>}: each path is
     * reduced to its file name, wrapped in double quotes, and the resulting lines are written to
     * {@code outGsFile} in sorted order. Blank lines are skipped.
     *
     * @param inGsFile  gold-standard file to read
     * @param outGsFile file to write the rewritten lines to
     * @throws IOException              if reading or writing fails
     * @throws IllegalArgumentException if a non-blank line contains no comma
     */
    public static void gs_rewrite(String inGsFile, String outGsFile) throws IOException {
        List<String> lines = FileUtils.readList(inGsFile, false);
        List<String> newGs = new ArrayList<String>();
        for (String line : lines) {
            if (line.trim().isEmpty()) {
                continue;
            }
            newGs.add(rewriteGsLine(line));
        }
        Collections.sort(newGs);
        try (PrintWriter p = new PrintWriter(outGsFile)) {
            for (String line : newGs) {
                p.println(line);
            }
        }
    }

    /** Converts one {@code <path>,<value>} line into {@code "<file name>",<value>}. */
    private static String rewriteGsLine(String line) {
        int comma = line.lastIndexOf(',');
        if (comma < 0) {
            throw new IllegalArgumentException("Expected '<path>,<value>' but found no comma in line: " + line);
        }
        // Accept both Windows and Unix separators, then keep only the last path segment.
        String path = line.substring(0, comma).trim().replace('\\', '/');
        path = path.substring(path.lastIndexOf('/') + 1).trim();
        // A quoted path without any separator still carries its opening quote here.
        if (path.startsWith("\"")) {
            path = path.substring(1).trim();
        }
        if (path.endsWith("\"")) {
            path = path.substring(0, path.length() - 1).trim();
        }
        return "\"" + path + "\"," + line.substring(comma + 1).trim();
    }
}