All downloads are free. Search and download functionality uses the official Maven repository.

cc.mallet.classify.tui.Vectors2Info Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */





package cc.mallet.classify.tui;

import java.util.logging.*;
import java.io.*;

import cc.mallet.types.*;
import cc.mallet.util.*;
/**
 * Diagnostic facilities for a vector file.
   @author Andrew McCallum [email protected]
 */

public class Vectors2Info
{
	/** Logger for this tool, named after this class. */
	private static final Logger logger = MalletLogger.getLogger(Vectors2Info.class.getName());

	// Command-line options. Each one is declared against Vectors2Info.class, the same
	// class that main passes to CommandOption.process, and main reads the parsed
	// result from the option's value field.

	/** --input: file holding the instance list to inspect; defaults to "-", documented below as stdin. */
	static CommandOption.File inputFile = new CommandOption.File(Vectors2Info.class, "input", "FILE", true, new File("-"),
	 "Read the instance list from this file; Using - indicates stdin.", null);

	/** --print-instances: when true, main prints name, target and data of every instance. */
	static CommandOption.Boolean printInstances = new CommandOption.Boolean(Vectors2Info.class, "print-instances", "[TRUE|FALSE]", false, false,
	 "Print labels and contents for all instances.", null);

	/** --print-infogain: number of top-ranked words to print; the default 0 disables the report. */
	static CommandOption.Integer printInfogain = new CommandOption.Integer(Vectors2Info.class, "print-infogain", "N", false, 0,
	 "Print top N words by information gain, sorted.", null);

	/** --print-labels: when true, main prints every entry of the target alphabet. */
	static CommandOption.Boolean printLabels = new CommandOption.Boolean(Vectors2Info.class, "print-labels", "[TRUE|FALSE]", false, false,
	 "Print class labels known to instance list, one per line.", null);

	/** --print-features: when true, main prints every entry of the data alphabet. */
	static CommandOption.Boolean printFeatures = new CommandOption.Boolean(Vectors2Info.class, "print-features", "[TRUE|FALSE]", false, false,
	 "Print the data alphabet, one feature per line.", null);

	/** --print-feature-counts: when true, main runs a FeatureCountTool over the instances and prints its counts. */
	static CommandOption.Boolean printFeatureCounts = new CommandOption.Boolean(Vectors2Info.class, "print-feature-counts", "[TRUE|FALSE]", false, false,
	 "Print feature names, feature counts (ie term frequency), and feature index counts (ie document frequency).", null);

	/**
	 * --print-matrix: a three-character format code selecting how the word/document
	 * matrix is printed. Position 0 is 'a' (all) or 's' (sparse), position 1 is
	 * 'b' (binary) or 'i' (integer), and position 2 is 'n' (number), 'w' (word),
	 * 'c' (combined) or 'e' (empty). The default is "sic".
	 */
	static CommandOption.String printMatrix = new CommandOption.String(Vectors2Info.class, "print-matrix", "STRING", false, "sic",
	 "Print word/document matrix in the specified format (a|s)(b|i)(n|w|c|e), for (all vs. sparse), (binary vs. integer), (number vs. word vs. combined vs. empty)", null)
	{
		/**
		 * Validates the format code and stores it in {@code value}.
		 *
		 * @param arg the raw option argument; {@code null} selects the default value
		 * @throws IllegalArgumentException if the code is not exactly three characters
		 *         long or any position holds a character outside its allowed set
		 */
		public void parseArg(java.lang.String arg) {

			if (arg == null) arg = this.defaultValue;

			// sanity check the raw printing options (a la Rainbow).
			// The length test must come first: && short-circuits, so charAt is never
			// reached for a too-short argument and the caller gets the descriptive
			// IllegalArgumentException rather than a StringIndexOutOfBoundsException.
			boolean wellFormed = arg.length() == 3
			        && (arg.charAt(0) == 's' || arg.charAt(0) == 'a')
			        && (arg.charAt(1) == 'b' || arg.charAt(1) == 'i')
			        && (arg.charAt(2) == 'n' || arg.charAt(2) == 'w' || arg.charAt(2) == 'c' || arg.charAt(2) == 'e');

			if (!wellFormed) {
				throw new IllegalArgumentException("Illegal argument = " + arg + " in --print-matrix=" +arg);
			}

			value = arg;
		}
	};

	/**
	 * Command-line entry point: loads an instance list and prints whichever
	 * reports were requested through the options declared on this class.
	 * Reports are written to standard output in a fixed order: labels, instances,
	 * feature counts, features, information gain, matrix.
	 *
	 * @param args command-line arguments; when empty, usage is printed and the
	 *             process exits with status -1
	 * @throws FileNotFoundException declared for callers; raised by the underlying load if it occurs
	 * @throws IOException declared for callers; raised by the underlying load if it occurs
	 */
	public static void main (String[] args) throws FileNotFoundException, IOException {

		// Process the command-line options
		CommandOption.setSummary (Vectors2Info.class,
								  "A tool for printing information about instance lists of feature vectors.");
		CommandOption.process (Vectors2Info.class, args);

		// Print some helpful messages for error cases
		if (args.length == 0) {
			CommandOption.getList(Vectors2Info.class).printUsage(false);
			System.exit (-1);
		}
		// --input is deliberately not enforced here: the option defaults to "-",
		// which its help text documents as stdin.

		// Read the InstanceList
		InstanceList instances = InstanceList.load (inputFile.value);

		if (printLabels.value) {
			Alphabet labelAlphabet = instances.getTargetAlphabet ();
			for (int i = 0; i < labelAlphabet.size(); i++) {
				System.out.println (labelAlphabet.lookupObject (i));
			}
			System.out.print ("\n");
		}

		if (printInstances.value) {
			for (Instance instance: instances) {
				System.out.println(instance.getName() + "\t" + instance.getTarget() + "\t" + instance.getData());
			}
		}

		if (printFeatureCounts.value) {
			FeatureCountTool counter = new FeatureCountTool(instances);
			counter.count();
			counter.printCounts();
		}

		if (printFeatures.value) {
			Alphabet alphabet = instances.getDataAlphabet();
			for (int i = 0; i < alphabet.size(); i++) {
				System.out.println(alphabet.lookupObject(i));
			}
		}

		if (printInfogain.value > 0) {
			InfoGain ig = new InfoGain (instances);
			// Cap the request at the number of known features so that asking for
			// more words than exist prints the full ranking instead of indexing
			// past its end.
			int numToPrint = Math.min (printInfogain.value, instances.getDataAlphabet().size());
			for (int i = 0; i < numToPrint; i++) {
				System.out.println (""+i+" "+ig.getObjectAtRank(i));
			}
			System.out.print ("\n");
		}

		if (printMatrix.wasInvoked()) {
			 printInstanceList(instances, printMatrix.value);
		}

	}

	/**
	 * Print an instance list according to the format string.
	 *
	 * For each instance whose data is a FeatureVector, the visible code prints the
	 * instance name and target separated by a space, then branches on the first
	 * character of the format string ('a' selects the dense form that prints every
	 * feature, including those with value 0).
	 *
	 * NOTE(review): this method is only partly readable in this copy of the file;
	 * see the note inside the loop.
	 *
	 * @param instances    the instance list to print
	 * @param formatString the three-character format code; main passes printMatrix.value
	 */
	private static void printInstanceList(InstanceList instances, String formatString) {

		// NOTE(review): numInstances, numClasses, counts and count are not used in
		// the visible portion of this method — confirm against the full source
		// before removing them.
		int numInstances = instances.size();
		int numClasses = instances.getTargetAlphabet().size();
		int numFeatures = instances.getDataAlphabet().size();

		Alphabet dataAlphabet = instances.getDataAlphabet();
		double[] counts = new double[numFeatures];
		double count;

		for (int i = 0; i < instances.size(); i++) {
			Instance instance = instances.get(i);

			if (instance.getData() instanceof FeatureVector) {
				FeatureVector fv = (FeatureVector) instance.getData ();
				
				System.out.print(instance.getName() + " " + instance.getTarget());
				
				if (formatString.charAt(0) == 'a') {
					// Dense: Print all features, even those with value 0.
					// NOTE(review): the next line is corrupted in this copy — the text
					// between "fvi" and "0.5)" is missing (it looks as if everything from
					// a '<' up to a later '>' was stripped). The rest of this method, and
					// whatever followed it, cannot be read here, and the braces below do
					// not balance. Restore from the upstream MALLET source before compiling.
					for (int fvi=0; fvi0.5) ? "1" : "0"));
		}

	}


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy