cc.mallet.classify.tui.SvmLight2Vectors Maven / Gradle / Ivy
Show all versions of mallet Show documentation
/* Copyright (C) 2010 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.classify.tui;
import java.util.ArrayList;
import java.util.logging.*;
import java.io.*;
import java.nio.charset.Charset;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.types.*;
import cc.mallet.util.*;
/**
 * Command line import tool for loading a sequence of
 * instances from an SVMLight feature-value pair file, with one instance
 * per line of the input file.
 * <p>
 * The expected format is
 * <pre>
 * target feature:value feature:value ...
 * </pre>
 * Targets and features can be indices, as in
 * SVMLight, or Strings.
 * <p>
 * Note that if targets and features are indices,
 * their indices in the data and target Alphabets
 * may be different, though the data will be
 * equivalent.
 * <p>
 * Note that the input and output args can take multiple files.
 *
 * @author Gregory Druck
 */
public class SvmLight2Vectors {
// Logger for this tool, obtained from MalletLogger under this class's name.
private static Logger logger = MalletLogger.getLogger(SvmLight2Vectors.class.getName());
// Command-line options. Each option is constructed with this class as its
// first argument; main hands the same class to CommandOption.process to parse
// the arguments. NOTE(review): the remaining constructor arguments appear to be
// (name, argument label, argument-required flag, default value, help text,
// long help text) -- confirm against CommandOption.

// "input": one or more input file names. main reads the entry "-" from
// standard input instead of opening a file.
static CommandOption.SpacedStrings inputFiles = new CommandOption.SpacedStrings
(SvmLight2Vectors.class, "input", "FILE", true, null,
"The files containing data to be classified, one instance per line", null);
// "output": one output file name per input file; main throws if the two
// counts differ. NOTE(review): the help text says "-" means stdout, but the
// visible save loop passes every name to new File(...) -- confirm whether "-"
// is actually special-cased anywhere.
static CommandOption.SpacedStrings outputFiles = new CommandOption.SpacedStrings
(SvmLight2Vectors.class, "output", "FILE", true, null,
"Write the instance list to this file; Using - indicates stdout.", null);
// "use-pipe-from": a previously saved instance list. When this option was
// invoked, main loads the list from this file and reuses its pipe instead of
// building a new one, and later opens the same file for writing again.
static CommandOption.File usePipeFromVectorsFile = new CommandOption.File
(SvmLight2Vectors.class, "use-pipe-from", "FILE", true, new File("text.vectors"),
"Use the pipe and alphabets from a previously created vectors file.\n" +
" Allows the creation, for example, of a test set of vectors that are\n" +
" compatible with a previously created set of training vectors", null);
// "print-output": debugging aid. When true, main appends a PrintInputAndTarget
// step to the pipe it builds. NOTE(review): this appears to apply only when a
// new pipe is built, not when one is reused via "use-pipe-from" -- confirm.
static CommandOption.Boolean printOutput = new CommandOption.Boolean
(SvmLight2Vectors.class, "print-output", "[TRUE|FALSE]", false, false,
"If true, print a representation of the processed data\n" +
" to standard output. This option is intended for debugging.", null);
// "encoding": charset name used when main opens an input file; initialised
// from the platform default charset's display name. The standard-input reader
// in main is created without this value.
static CommandOption.String encoding = new CommandOption.String
(SvmLight2Vectors.class, "encoding", "STRING", true, Charset.defaultCharset().displayName(),
"Character encoding for input file", null);
public static void main (String[] args) throws FileNotFoundException, IOException
// Process the command-line options
CommandOption.setSummary (SvmLight2Vectors.class,
"A tool for creating instance lists of feature vectors from comma-separated-values");
CommandOption.process (SvmLight2Vectors.class, args);
// Print some helpful messages for error cases
if (args.length == 0) {
System.exit (-1);
if (inputFiles == null) {
throw new IllegalArgumentException ("You must include `--input FILE FILE ...' in order to specify "+
"files containing the instances, one per line.");
Pipe instancePipe;
InstanceList previousInstanceList = null;
if (usePipeFromVectorsFile.wasInvoked()) {
// Ignore all options, use a previously created pipe
previousInstanceList = InstanceList.load (usePipeFromVectorsFile.value);
instancePipe = previousInstanceList.getPipe();
else {
// Build a new pipe
ArrayList pipeList = new ArrayList();
pipeList.add(new SvmLight2FeatureVectorAndLabel());
if (printOutput.value) {
pipeList.add(new PrintInputAndTarget());
instancePipe = new SerialPipes(pipeList);
if (inputFiles.value.length != outputFiles.value.length) {
throw new RuntimeException("Number of input and output files must be the same.");
InstanceList[] instances = new InstanceList[inputFiles.value.length];
for (int fileIndex = 0; fileIndex < inputFiles.value.length; fileIndex++) {
// Create the instance list and open the input file
instances[fileIndex] = new InstanceList (instancePipe);
Reader fileReader;
if (inputFiles.value[fileIndex].equals ("-")) {
fileReader = new InputStreamReader (System.in);
else {
fileReader = new InputStreamReader(new FileInputStream(inputFiles.value[fileIndex]), encoding.value);
// Read instances from the file
instances[fileIndex].addThruPipe (new SelectiveFileLineIterator (fileReader, "^\\s*#.+"));
// Note on handling multiple input files:
// If we have multiple files, the data or target alphabet may have new
// elements added to it with each new file. If we save each InstanceList
// immediately after processing each file, then Alphabets won't be the
// same. Instead, process all files before writing the InstanceLists.
for (int fileIndex = 0; fileIndex < inputFiles.value.length; fileIndex++) {
// Save instances to output file
instances[fileIndex].save(new File(outputFiles.value[fileIndex]));
// If we are reusing a pipe from an instance list
// created earlier, we may have extended the label
// or feature alphabets. To maintain compatibility,
// we now save that original instance list back to disk
// with the new alphabet.
if (usePipeFromVectorsFile.wasInvoked()) {
logger.info(" Rewriting extended pipe from " + usePipeFromVectorsFile.value);
logger.info(" Instance ID = " + previousInstanceList.getPipe().getInstanceId());
ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(usePipeFromVectorsFile.value));