All downloads are free. Search and download functionality uses the official Maven repository.

cc.mallet.classify.tui.SvmLight2Vectors Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

There is a newer version: 2.0.12
Show newest version
/* Copyright (C) 2010 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package cc.mallet.classify.tui;

import java.util.ArrayList;
import java.util.logging.*;
import java.io.*;
import java.nio.charset.Charset;

import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.types.*;
import cc.mallet.util.*;

/**
 * Command line import tool for loading a sequence of 
 *  instances from an SVMLight feature-value pair file, with one instance 
 *  per line of the input file. 
 *  

* * The expected format is * * target feature:value feature:value ... * * targets and features can be indices, as in * SVMLight, or Strings. * * Note that if targets and features are indices, * their indices in the data and target Alphabets * may be different, though the data will be * equivalent. * * Note that the input and output args can take multiple files. * * @author Gregory Druck */ public class SvmLight2Vectors { private static Logger logger = MalletLogger.getLogger(SvmLight2Vectors.class.getName()); static CommandOption.SpacedStrings inputFiles = new CommandOption.SpacedStrings (SvmLight2Vectors.class, "input", "FILE", true, null, "The files containing data to be classified, one instance per line", null); static CommandOption.SpacedStrings outputFiles = new CommandOption.SpacedStrings (SvmLight2Vectors.class, "output", "FILE", true, null, "Write the instance list to this file; Using - indicates stdout.", null); static CommandOption.File usePipeFromVectorsFile = new CommandOption.File (SvmLight2Vectors.class, "use-pipe-from", "FILE", true, new File("text.vectors"), "Use the pipe and alphabets from a previously created vectors file.\n" + " Allows the creation, for example, of a test set of vectors that are\n" + " compatible with a previously created set of training vectors", null); static CommandOption.Boolean printOutput = new CommandOption.Boolean (SvmLight2Vectors.class, "print-output", "[TRUE|FALSE]", false, false, "If true, print a representation of the processed data\n" + " to standard output. 
This option is intended for debugging.", null); static CommandOption.String encoding = new CommandOption.String (SvmLight2Vectors.class, "encoding", "STRING", true, Charset.defaultCharset().displayName(), "Character encoding for input file", null); public static void main (String[] args) throws FileNotFoundException, IOException { // Process the command-line options CommandOption.setSummary (SvmLight2Vectors.class, "A tool for creating instance lists of feature vectors from comma-separated-values"); CommandOption.process (SvmLight2Vectors.class, args); // Print some helpful messages for error cases if (args.length == 0) { CommandOption.getList(SvmLight2Vectors.class).printUsage(false); System.exit (-1); } if (inputFiles == null) { throw new IllegalArgumentException ("You must include `--input FILE FILE ...' in order to specify "+ "files containing the instances, one per line."); } Pipe instancePipe; InstanceList previousInstanceList = null; if (usePipeFromVectorsFile.wasInvoked()) { // Ignore all options, use a previously created pipe previousInstanceList = InstanceList.load (usePipeFromVectorsFile.value); instancePipe = previousInstanceList.getPipe(); } else { // Build a new pipe ArrayList pipeList = new ArrayList(); pipeList.add(new SvmLight2FeatureVectorAndLabel()); if (printOutput.value) { pipeList.add(new PrintInputAndTarget()); } instancePipe = new SerialPipes(pipeList); } if (inputFiles.value.length != outputFiles.value.length) { throw new RuntimeException("Number of input and output files must be the same."); } InstanceList[] instances = new InstanceList[inputFiles.value.length]; for (int fileIndex = 0; fileIndex < inputFiles.value.length; fileIndex++) { // Create the instance list and open the input file instances[fileIndex] = new InstanceList (instancePipe); Reader fileReader; if (inputFiles.value[fileIndex].equals ("-")) { fileReader = new InputStreamReader (System.in); } else { fileReader = new InputStreamReader(new 
FileInputStream(inputFiles.value[fileIndex]), encoding.value); } // Read instances from the file instances[fileIndex].addThruPipe (new SelectiveFileLineIterator (fileReader, "^\\s*#.+")); } // [email protected] // If we have multiple files, the data or target alphabet may have new // elements added to it with each new file. If we save each InstanceList // immediately after processing each file, then Alphabets won't be the // same. Instead, process all files before writing the InstanceLists. for (int fileIndex = 0; fileIndex < inputFiles.value.length; fileIndex++) { // Save instances to output file instances[fileIndex].save(new File(outputFiles.value[fileIndex])); } // If we are reusing a pipe from an instance list // created earlier, we may have extended the label // or feature alphabets. To maintain compatibility, // we now save that original instance list back to disk // with the new alphabet. if (usePipeFromVectorsFile.wasInvoked()) { logger.info(" Rewriting extended pipe from " + usePipeFromVectorsFile.value); logger.info(" Instance ID = " + previousInstanceList.getPipe().getInstanceId()); ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(usePipeFromVectorsFile.value)); oos.writeObject(previousInstanceList); oos.close(); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy