All downloads are free. Search and download functionality uses the official Maven repository.

cc.mallet.pipe.Csv2FeatureVector Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

There is a newer version: 2.0.12
Show newest version
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package cc.mallet.pipe;


import java.util.logging.*;
import java.util.*;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Labeling;
import cc.mallet.util.MalletLogger;


/**
 * Converts a string of the form
 * feature_1:val_1 feature_2:val_2 ... feature_k:val_k
 * into a (sparse) FeatureVector.
 *
 * Features with no ":" character are assumed to have value 1.0.
 * 
 * @author Gary Huang
 */
public class Csv2FeatureVector extends Pipe {

    private static Logger logger = MalletLogger.getLogger(Csv2FeatureVector.class.getName());

    public Csv2FeatureVector(int capacity) {
        this.dataAlphabet = new Alphabet(capacity);
    }
    
    public Csv2FeatureVector() {
        this(1000);
    }
    
    /**
     * Convert the data in the given Instance from a CharSequence 
     * of sparse feature-value pairs to a FeatureVector
     */
    public Instance pipe(Instance carrier) {

        CharSequence c = (CharSequence) carrier.getData();
        String[] pairs = c.toString().trim().split("\\s+");
        int[] keys = new int[pairs.length];
        double[] values = new double[pairs.length];

        for (int i = 0; i < pairs.length; i++) {
			int delimIndex = pairs[i].lastIndexOf(":");
			if (delimIndex <= 0 || delimIndex == (pairs[i].length()-1)) {
				keys[i] = dataAlphabet.lookupIndex(pairs[i], true);
				values[i] = 1.0;
			}
			else {
				keys[i] = dataAlphabet.lookupIndex(pairs[i].substring(0, delimIndex), true);
				values[i] = Double.parseDouble(pairs[i].substring(delimIndex+1));
			}
        }

		// [removed code that sorted indices but NOT values -DM]

        FeatureVector fv = new FeatureVector(dataAlphabet, keys, values);
        carrier.setData( fv );
        return carrier;
    }
    
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy