All downloads are free. The search and download functionality uses the official Maven repository.

eu.fbk.twm.classifier.OneExamplePerSenseClassifier4 Maven / Gradle / Ivy

The newest version!
package eu.fbk.twm.classifier;

import eu.fbk.twm.utils.Defaults;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import eu.fbk.utils.lsa.LSI;
import eu.fbk.utils.lsa.BOW;
import eu.fbk.utils.math.Node;
import eu.fbk.twm.index.OneExamplePerSenseSearcher;
import eu.fbk.twm.utils.WikipediaExtractor;

import java.io.File;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;

/**
 * Variant of {@link OneExamplePerSenseClassifier3} that ranks the candidate
 * senses of a surface form using the bag-of-words kernel only: the latent
 * semantic kernel passed to every {@link ContextualSense} is fixed to 0.
 * <p>
 * Candidates come from the {@link OneExamplePerSenseSearcher}; the context is
 * mapped to a vector with {@link LSI#mapDocument} and compared with each
 * candidate's stored bag-of-words vector through the {@code dot} helper
 * (not declared in this class; presumably inherited from the superclass).
 * <p>
 * Created by giuliano on 12/21/13.
 */
public class OneExamplePerSenseClassifier4 extends OneExamplePerSenseClassifier3 {
	/**
	 * Define a static logger variable so that it references the
	 * Logger instance named OneExamplePerSenseClassifier4.
	 */
	static Logger logger = Logger.getLogger(OneExamplePerSenseClassifier4.class.getName());

	/**
	 * LSI dimension used by {@link #main} when {@code --dim} is not given.
	 */
	private static final int DEFAULT_LSI_DIM = 100;

	/**
	 * Orders senses by decreasing {@code getCombo()} score.
	 * <p>
	 * NaN scores are placed after every real score. A plain difference-based
	 * comparison reports NaN as "equal" to everything, which is not a consistent
	 * ordering and may make {@link Arrays#sort} fail with
	 * "Comparison method violates its general contract".
	 */
	private static final Comparator<Sense> BY_COMBO_DESCENDING = new Comparator<Sense>() {
		@Override
		public int compare(Sense first, Sense second) {
			double firstScore = first.getCombo();
			double secondScore = second.getCombo();
			boolean firstIsNaN = Double.isNaN(firstScore);
			boolean secondIsNaN = Double.isNaN(secondScore);
			if (firstIsNaN || secondIsNaN) {
				if (firstIsNaN && secondIsNaN) {
					return 0;
				}
				return firstIsNaN ? 1 : -1;
			}
			if (firstScore > secondScore) {
				return -1;
			}
			if (firstScore < secondScore) {
				return 1;
			}
			return 0;
		}
	};

	/**
	 * Creates a classifier without a term map ({@code null} is passed to the superclass).
	 *
	 * @param lsi                        model used to map the context to a vector
	 * @param oneExamplePerSenseSearcher index searched for the candidate senses of a form
	 */
	public OneExamplePerSenseClassifier4(LSI lsi, OneExamplePerSenseSearcher oneExamplePerSenseSearcher) {
		super(lsi, oneExamplePerSenseSearcher, null);
	}

	/**
	 * Creates a classifier that forwards {@code termMap} to the superclass.
	 *
	 * @param lsi                        model used to map the context to a vector
	 * @param oneExamplePerSenseSearcher index searched for the candidate senses of a form
	 * @param termMap                    term map handed to the superclass; its key/value
	 *                                   types are not visible here (raw type kept on purpose)
	 */
	public OneExamplePerSenseClassifier4(LSI lsi, OneExamplePerSenseSearcher oneExamplePerSenseSearcher, Map termMap) {
		super(lsi, oneExamplePerSenseSearcher, termMap);
	}


	//todo: use the PageIncomingOutgoingSearcher strategy
	//todo: form must be tokenized

	/**
	 * Ranks the candidate senses of {@code form} against the given context.
	 *
	 * @param bow  bag of words describing the context of the form
	 * @param form surface form to disambiguate
	 * @return one sense per entry returned by the searcher, best score first
	 */
	public Sense[] classify(BOW bow, String form) {
		// The index is queried before the context is mapped, as in the original flow.
		OneExamplePerSenseSearcher.Entry[] entries = oneExamplePerSenseSearcher.search(form);
		Node[] bowVector = lsi.mapDocument(bow);
		return rank(entries, bowVector, form);
	}

	/**
	 * Ranks the candidate senses of {@code form} against a context given as a map.
	 *
	 * @param bow  context representation accepted by {@code lsi.mapDocument}; the
	 *             key/value types are not visible here (raw type kept on purpose)
	 * @param form surface form to disambiguate
	 * @return one sense per entry returned by the searcher, best score first
	 */
	public Sense[] classify(Map bow, String form) {
		OneExamplePerSenseSearcher.Entry[] entries = oneExamplePerSenseSearcher.search(form);
		Node[] bowVector = lsi.mapDocument(bow);
		return rank(entries, bowVector, form);
	}

	/**
	 * Scores every candidate entry against the context vector and sorts the result.
	 *
	 * @param entries   candidates found for {@code form}
	 * @param bowVector context vector produced by {@code lsi.mapDocument}
	 * @param form      surface form being disambiguated
	 * @return the scored senses ordered by decreasing {@code getCombo()}
	 */
	private Sense[] rank(OneExamplePerSenseSearcher.Entry[] entries, Node[] bowVector, String form) {
		Sense[] senses = new ContextualSense[entries.length];
		for (int i = 0; i < entries.length; i++) {
			OneExamplePerSenseSearcher.Entry entry = entries[i];
			double bowKernel = dot(bowVector, entry.getBowVector(), termMap, form, entry.getPage());
			// The latent semantic kernel is intentionally disabled in this variant.
			double lsKernel = 0;
			senses[i] = new ContextualSense(entry.getPage(), entry.getFreq(), bowKernel, lsKernel);
		}
		Arrays.sort(senses, BY_COMBO_DESCENDING);
		return senses;
	}

	/**
	 * Reads an optional integer command line option.
	 *
	 * @param line         parsed command line
	 * @param name         long name of the option
	 * @param defaultValue value returned when the option is absent
	 * @return the parsed value, or {@code defaultValue} when the option is absent
	 * @throws ParseException if the option value is not a valid integer, so that the
	 *                        caller reports it together with the usage message
	 */
	private static int intOption(CommandLine line, String name, int defaultValue) throws ParseException {
		if (!line.hasOption(name)) {
			return defaultValue;
		}
		String raw = line.getOptionValue(name);
		try {
			return Integer.parseInt(raw);
		} catch (NumberFormatException e) {
			ParseException failure = new ParseException("option '" + name + "' expects an integer but got '" + raw + "'");
			failure.initCause(e);
			throw failure;
		}
	}

	/**
	 * Command line entry point: loads the LSI model and the sense index, then
	 * classifies the instances of a file and/or starts the interactive mode.
	 * On a parsing problem (or {@code --help}/{@code --version}) the usage
	 * message is printed instead.
	 *
	 * @param args command line arguments
	 * @throws Exception if loading the resources or classifying fails
	 */
	public static void main(String[] args) throws Exception {
		String logConfig = System.getProperty("log-config");
		if (logConfig == null) {
			logConfig = "configuration/log-config.txt";
		}

		PropertyConfigurator.configure(logConfig);
		Options options = new Options();
		try {
			Option indexNameOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("open an index with the specified name").isRequired().withLongOpt("index").create("i");
			Option interactiveModeOpt = OptionBuilder.withDescription("enter in the interactive mode").withLongOpt("interactive-mode").create("t");
			Option instanceFileOpt = OptionBuilder.withArgName("file").hasArg().withDescription("read the instances to classify from the specified file").withLongOpt("instance-file").create("f");
			Option lsmDirOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("lsi dir").isRequired().withLongOpt("lsi").create("l");
			Option lsmDimOpt = OptionBuilder.withArgName("int").hasArg().withDescription("lsi dim").withLongOpt("dim").create("d");
			Option normalizedOpt = OptionBuilder.withDescription("normalize vectors (default is " + WikipediaExtractor.DEFAULT_NORMALIZE + ")").withLongOpt("normalized").create();
			// This option was read below but never declared, so it could not be set.
			Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg().withDescription("notification point passed to the searcher (default is " + Defaults.DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create();

			options.addOption("h", "help", false, "print this message");
			options.addOption("v", "version", false, "output version information and exit");

			options.addOption(indexNameOpt);
			options.addOption(interactiveModeOpt);
			options.addOption(instanceFileOpt);
			options.addOption(lsmDirOpt);
			options.addOption(lsmDimOpt);
			options.addOption(normalizedOpt);
			options.addOption(notificationPointOpt);

			CommandLineParser parser = new PosixParser();
			CommandLine line = parser.parse(options, args);

			if (line.hasOption("help") || line.hasOption("version")) {
				// An empty message makes the handler below print the usage only.
				throw new ParseException("");
			}

			int notificationPoint = intOption(line, "notification-point", Defaults.DEFAULT_NOTIFICATION_POINT);

			String lsmDirName = line.getOptionValue("lsi");
			if (!lsmDirName.endsWith(File.separator)) {
				lsmDirName += File.separator;
			}

			boolean normalized = WikipediaExtractor.DEFAULT_NORMALIZE;
			if (line.hasOption("normalized")) {
				normalized = true;
			}

			File fileUt = new File(lsmDirName + "X-Ut");
			File fileSk = new File(lsmDirName + "X-S");
			File fileR = new File(lsmDirName + "X-row");
			File fileC = new File(lsmDirName + "X-col");
			File fileDf = new File(lsmDirName + "X-df");
			int dim = intOption(line, "dim", DEFAULT_LSI_DIM);
			logger.debug(line.getOptionValue("lsi") + "\t" + line.getOptionValue("dim"));

			LSI lsi = new LSI(fileUt, fileSk, fileR, fileC, fileDf, dim, true, normalized);
			OneExamplePerSenseSearcher oneExamplePerSenseSearcher = new OneExamplePerSenseSearcher(line.getOptionValue("index"));
			oneExamplePerSenseSearcher.setNotificationPoint(notificationPoint);


			if (line.hasOption("instance-file")) {
				OneExamplePerSenseClassifier4 oneExamplePerSenseClassifier = new OneExamplePerSenseClassifier4(lsi, oneExamplePerSenseSearcher);
				oneExamplePerSenseClassifier.classify(new File(line.getOptionValue("instance-file")), false);
			}

			if (line.hasOption("interactive-mode")) {
				OneExamplePerSenseClassifier4 oneExamplePerSenseClassifier = new OneExamplePerSenseClassifier4(lsi, oneExamplePerSenseSearcher);
				oneExamplePerSenseClassifier.interactive();
			}
		} catch (ParseException e) {
			// oops, something went wrong
			String message = e.getMessage();
			if (message != null && message.length() > 0) {
				System.out.println("Parsing failed: " + message + "\n");
			}
			HelpFormatter formatter = new HelpFormatter();
			formatter.printHelp(400, "java -cp dist/thewikimachine.jar eu.fbk.twm.classifier.OneExamplePerSenseClassifier4", "\n", options, "\n", true);
		}
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy