// eu.fbk.twm.classifier.OneExamplePerSenseClassifier4 -- Maven / Gradle / Ivy
// The newest version!
package eu.fbk.twm.classifier;
import eu.fbk.twm.utils.Defaults;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import eu.fbk.utils.lsa.LSI;
import eu.fbk.utils.lsa.BOW;
import eu.fbk.utils.math.Node;
import eu.fbk.twm.index.OneExamplePerSenseSearcher;
import eu.fbk.twm.utils.WikipediaExtractor;
import java.io.File;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
/**
 * Classifier that ranks the entries found for a surface form against the
 * bag-of-words vector of its context.
 * <p>
 * For a given form, the entries returned by
 * {@code OneExamplePerSenseSearcher.search(form)} are each scored with the
 * inherited {@code dot(...)} against the context vector produced by
 * {@code lsi.mapDocument(...)}, wrapped in a {@link ContextualSense} and
 * returned sorted by decreasing {@code getCombo()}.
 * <p>
 * The latent-semantic kernel is not computed by this class: every
 * {@link ContextualSense} is built with an {@code lsKernel} of {@code 0}.
 * <p>
 * Created by giuliano on 12/21/13, 10:00 PM.
 */
public class OneExamplePerSenseClassifier4 extends OneExamplePerSenseClassifier3 {
    /**
     * Define a static logger variable so that it references the
     * Logger instance named after this class.
     */
    static Logger logger = Logger.getLogger(OneExamplePerSenseClassifier4.class.getName());

    /** LSI dimension used by {@link #main(String[])} when {@code --dim} is not given. */
    private static final int DEFAULT_LSI_DIM = 100;

    /**
     * Orders senses by decreasing {@code getCombo()}.
     * <p>
     * Plain {@code >} / {@code <} comparisons are used instead of the sign of a
     * subtraction, and NaN scores are handled explicitly (all NaN are equal to
     * each other and sort after every real score). Without that, a NaN would
     * compare "equal" to everything, which is not transitive and may make
     * {@link Arrays#sort(Object[], Comparator)} fail.
     */
    private static final Comparator<Sense> BY_COMBO_DESCENDING = new Comparator<Sense>() {
        @Override
        public int compare(Sense left, Sense right) {
            double a = left.getCombo();
            double b = right.getCombo();
            boolean aIsNaN = Double.isNaN(a);
            boolean bIsNaN = Double.isNaN(b);
            if (aIsNaN || bIsNaN) {
                if (aIsNaN && bIsNaN) {
                    return 0;
                }
                // The undefined score goes last.
                return aIsNaN ? 1 : -1;
            }
            if (a > b) {
                return -1;
            }
            if (a < b) {
                return 1;
            }
            return 0;
        }
    };

    /**
     * Creates a classifier without a term map ({@code null} is passed to the superclass).
     *
     * @param lsi                        model used to map a context to a vector
     * @param oneExamplePerSenseSearcher searcher that returns the entries of a form
     */
    public OneExamplePerSenseClassifier4(LSI lsi, OneExamplePerSenseSearcher oneExamplePerSenseSearcher) {
        super(lsi, oneExamplePerSenseSearcher, null);
    }

    /**
     * Creates a classifier with an explicit term map.
     *
     * @param lsi                        model used to map a context to a vector
     * @param oneExamplePerSenseSearcher searcher that returns the entries of a form
     * @param termMap                    forwarded unchanged to the superclass; the raw type is kept
     *                                   because its element types are declared by the superclass
     */
    public OneExamplePerSenseClassifier4(LSI lsi, OneExamplePerSenseSearcher oneExamplePerSenseSearcher, Map termMap) {
        super(lsi, oneExamplePerSenseSearcher, termMap);
    }

    //todo: use the PageIncomingOutgoingSearcher strategy
    //todo: form must be tokenized

    /**
     * Ranks the entries found for {@code form} against the context {@code bow}.
     *
     * @param bow  context of the form
     * @param form surface form to look up in the searcher
     * @return one {@link ContextualSense} per entry, sorted by decreasing {@code getCombo()}
     */
    public Sense[] classify(BOW bow, String form) {
        // The search is performed before the context is mapped, as callers may rely on that order.
        OneExamplePerSenseSearcher.Entry[] entries = oneExamplePerSenseSearcher.search(form);
        Node[] bowVector = lsi.mapDocument(bow);
        return rankEntriesByCombo(entries, bowVector, form);
    }

    /**
     * Ranks the entries found for {@code form} against the context {@code bow}.
     *
     * @param bow  context of the form, in the map representation accepted by {@code lsi.mapDocument}
     * @param form surface form to look up in the searcher
     * @return one {@link ContextualSense} per entry, sorted by decreasing {@code getCombo()}
     */
    public Sense[] classify(Map bow, String form) {
        // The search is performed before the context is mapped, as callers may rely on that order.
        OneExamplePerSenseSearcher.Entry[] entries = oneExamplePerSenseSearcher.search(form);
        Node[] bowVector = lsi.mapDocument(bow);
        return rankEntriesByCombo(entries, bowVector, form);
    }

    /**
     * Scores every entry against {@code bowVector} and sorts the resulting senses.
     *
     * @param entries   entries returned by the searcher for {@code form}
     * @param bowVector vector of the context
     * @param form      surface form, forwarded to {@code dot(...)}
     * @return a {@code ContextualSense[]} sorted with {@link #BY_COMBO_DESCENDING}
     */
    private Sense[] rankEntriesByCombo(OneExamplePerSenseSearcher.Entry[] entries, Node[] bowVector, String form) {
        Sense[] senses = new ContextualSense[entries.length];
        for (int i = 0; i < entries.length; i++) {
            OneExamplePerSenseSearcher.Entry entry = entries[i];
            double bowKernel = dot(bowVector, entry.getBowVector(), termMap, form, entry.getPage());
            // The latent-semantic kernel is intentionally not computed here.
            double lsKernel = 0;
            senses[i] = new ContextualSense(entry.getPage(), entry.getFreq(), bowKernel, lsKernel);
        }
        Arrays.sort(senses, BY_COMBO_DESCENDING);
        return senses;
    }

    /**
     * Returns the integer value of a command-line option, or a default when it is absent.
     *
     * @param line         parsed command line
     * @param longOpt      long name of the option
     * @param defaultValue value returned when the option was not given
     * @return the parsed value or {@code defaultValue}
     * @throws ParseException if the option is present but its value is not an integer
     */
    private static int intOptionOrDefault(CommandLine line, String longOpt, int defaultValue) throws ParseException {
        if (!line.hasOption(longOpt)) {
            return defaultValue;
        }
        String value = line.getOptionValue(longOpt);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            // Reported as a parse error so that main prints the usage instead of a stack trace.
            ParseException failure = new ParseException("invalid integer for --" + longOpt + ": " + value);
            failure.initCause(e);
            throw failure;
        }
    }

    /**
     * Command-line entry point.
     * <p>
     * Logging is configured from the file named by the {@code log-config} system
     * property ({@code configuration/log-config.txt} when unset). The options
     * {@code --index} and {@code --lsi} are required; {@code --instance-file}
     * classifies the instances of a file and {@code --interactive-mode} starts the
     * interactive loop. Any parse error prints the usage.
     *
     * @param args command-line arguments
     * @throws Exception if loading the model, opening the index or classifying fails
     */
    public static void main(String[] args) throws Exception {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);

        Options options = new Options();
        try {
            Option indexNameOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("open an index with the specified name").isRequired().withLongOpt("index").create("i");
            Option interactiveModeOpt = OptionBuilder.withDescription("enter in the interactive mode").withLongOpt("interactive-mode").create("t");
            Option instanceFileOpt = OptionBuilder.withArgName("file").hasArg().withDescription("read the instances to classify from the specified file").withLongOpt("instance-file").create("f");
            Option lsmDirOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("lsi dir").isRequired().withLongOpt("lsi").create("l");
            Option lsmDimOpt = OptionBuilder.withArgName("int").hasArg().withDescription("lsi dim").withLongOpt("dim").create("d");
            Option normalizedOpt = OptionBuilder.withDescription("normalize vectors (default is " + WikipediaExtractor.DEFAULT_NORMALIZE + ")").withLongOpt("normalized").create();
            // Registered so that the value read below can actually be supplied.
            Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg().withDescription("notification point (default is " + Defaults.DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create();

            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            options.addOption(indexNameOpt);
            options.addOption(interactiveModeOpt);
            options.addOption(instanceFileOpt);
            options.addOption(lsmDirOpt);
            options.addOption(lsmDimOpt);
            options.addOption(normalizedOpt);
            options.addOption(notificationPointOpt);

            CommandLineParser parser = new PosixParser();
            CommandLine line = parser.parse(options, args);

            // Help and version share the error path: the empty message suppresses
            // the "Parsing failed" line, so only the usage is printed.
            if (line.hasOption("help") || line.hasOption("version")) {
                throw new ParseException("");
            }

            int notificationPoint = intOptionOrDefault(line, "notification-point", Defaults.DEFAULT_NOTIFICATION_POINT);
            int dim = intOptionOrDefault(line, "dim", DEFAULT_LSI_DIM);

            String lsmDirName = line.getOptionValue("lsi");
            if (!lsmDirName.endsWith(File.separator)) {
                lsmDirName += File.separator;
            }

            boolean normalized = WikipediaExtractor.DEFAULT_NORMALIZE;
            if (line.hasOption("normalized")) {
                normalized = true;
            }

            File fileUt = new File(lsmDirName + "X-Ut");
            File fileSk = new File(lsmDirName + "X-S");
            File fileR = new File(lsmDirName + "X-row");
            File fileC = new File(lsmDirName + "X-col");
            File fileDf = new File(lsmDirName + "X-df");

            logger.debug(line.getOptionValue("lsi") + "\t" + line.getOptionValue("dim"));

            LSI lsi = new LSI(fileUt, fileSk, fileR, fileC, fileDf, dim, true, normalized);
            OneExamplePerSenseSearcher oneExamplePerSenseSearcher = new OneExamplePerSenseSearcher(line.getOptionValue("index"));
            oneExamplePerSenseSearcher.setNotificationPoint(notificationPoint);

            if (line.hasOption("instance-file")) {
                OneExamplePerSenseClassifier4 oneExamplePerSenseClassifier = new OneExamplePerSenseClassifier4(lsi, oneExamplePerSenseSearcher);
                oneExamplePerSenseClassifier.classify(new File(line.getOptionValue("instance-file")), false);
            }
            if (line.hasOption("interactive-mode")) {
                OneExamplePerSenseClassifier4 oneExamplePerSenseClassifier = new OneExamplePerSenseClassifier4(lsi, oneExamplePerSenseSearcher);
                oneExamplePerSenseClassifier.interactive();
            }
        } catch (ParseException e) {
            // oops, something went wrong
            String message = e.getMessage();
            if (message != null && message.length() > 0) {
                System.out.println("Parsing failed: " + message + "\n");
            }
            HelpFormatter formatter = new HelpFormatter();
            // The class name is derived from this class so the usage line cannot drift.
            formatter.printHelp(400, "java -cp dist/thewikimachine.jar " + OneExamplePerSenseClassifier4.class.getName(), "\n", options, "\n", true);
        }
    }
}