// cmu.arktweetnlp.impl.features.FeatureExtractor — Maven / Gradle / Ivy listing header
// (non-Java text from the code-hosting page; kept as a comment so the file compiles)
package cmu.arktweetnlp.impl.features;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
import cmu.arktweetnlp.impl.Model;
import cmu.arktweetnlp.impl.ModelSentence;
import cmu.arktweetnlp.impl.Sentence;
import cmu.arktweetnlp.util.Util;
import edu.stanford.nlp.util.Pair;
/**
 * Extracts features and numberizes them.
 *
 * Also numberizes other things if necessary (e.g. label numberizations for MEMM training).
 */
public class FeatureExtractor {

    /** Only use the model for vocabulary and dimensionality info. **/
    private final Model model;

    /** Pipeline of extractors run, in order, over every sentence. */
    private ArrayList<FeatureExtractorInterface> allFeatureExtractors;

    /** In training mode, gold labels are numberized and cheating edge features are filled in. */
    public boolean isTrainingTime;

    /** When true, print each token's extracted feature names to stdout after extraction. */
    public boolean dumpMode = false;

    public FeatureExtractor(Model model, boolean isTrainingTime) throws IOException {
        this.model = model;
        this.isTrainingTime = isTrainingTime;
        assert model.labelVocab.isLocked();
        initializeFeatureExtractors();
    }

    public static Logger log = Logger.getLogger("FeatureExtractor");

    /**
     * Does feature extraction on one sentence.
     *
     * Input: textual representation of sentence
     * Output: fills up modelSentence with numberized features
     */
    public void computeFeatures(Sentence linguisticSentence, ModelSentence modelSentence) {
        int T = linguisticSentence.T();
        assert linguisticSentence.T() > 0; // TODO: handle this when assertions are off
        computeObservationFeatures(linguisticSentence, modelSentence);
        if (isTrainingTime) {
            // Numberize the gold labels; only available (and only needed) at training time.
            for (int t = 0; t < T; t++) {
                modelSentence.labels[t] = model.labelVocab.num(linguisticSentence.labels.get(t));
            }
            computeCheatingEdgeFeatures(linguisticSentence, modelSentence);
        }
    }

    /**
     * Peek at the modelSentence to see its labels -- for training only!
     *
     * Edge feature at position t is the (already numberized) gold label at t-1;
     * position 0 uses the model's start marker.
     *
     * @param sentence the linguistic sentence (used only for its length)
     * @param modelSentence sentence whose labels[] must already be numberized
     */
    private void computeCheatingEdgeFeatures(Sentence sentence, ModelSentence modelSentence) {
        assert isTrainingTime;
        modelSentence.edgeFeatures[0] = model.startMarker();
        for (int t = 1; t < sentence.T(); t++) {
            modelSentence.edgeFeatures[t] = modelSentence.labels[t - 1];
        }
    }

    /**
     * Runs every registered extractor over the sentence, then numberizes the
     * resulting (position, featureName, value) triples into
     * modelSentence.observationFeatures. OOV features are skipped at test time.
     */
    private void computeObservationFeatures(Sentence sentence, ModelSentence modelSentence) {
        PositionFeaturePairs pairs = new PositionFeaturePairs();
        // Extract in featurename form
        for (FeatureExtractorInterface fe : allFeatureExtractors) {
            fe.addFeatures(sentence.tokens, pairs);
        }
        // Numberize. This should be melded with the addFeatures() loop above, so no wasteful
        // temporaries that later turn out to be OOV... but is this really an issue?
        for (int i = 0; i < pairs.size(); i++) {
            int t = pairs.labelIndexes.get(i);
            String fName = pairs.featureNames.get(i);
            int fID = model.featureVocab.num(fName);
            if (!isTrainingTime && fID == -1) {
                // Skip OOV features at test time.
                // Note we have implicit conjunctions from base features, so
                // these are base features that weren't seen for *any* label at training time
                // -- of course they will be useless for us...
                continue;
            }
            double fValue = pairs.featureValues.get(i);
            modelSentence.observationFeatures.get(t).add(new Pair<Integer, Double>(fID, fValue));
        }
        if (dumpMode) {
            Util.p("");
            for (int t = 0; t < sentence.T(); t++) {
                System.out.printf("%s\n\t", sentence.tokens.get(t));
                for (Pair<Integer, Double> fv : modelSentence.observationFeatures.get(t)) {
                    System.out.printf("%s ", model.featureVocab.name(fv.first));
                }
                System.out.printf("\n");
            }
        }
    }

    public interface FeatureExtractorInterface {
        /**
         * Input: sentence
         * Output: labelIndexes, featureIDs/Values through positionFeaturePairs
         *
         * We want to yield a sequence of (t, featID, featValue) pairs,
         * to be conjuncted against label IDs at position t.
         * Represent as parallel arrays. Ick yes, but we want to save object
         * allocations (is this crazy?)
         * This method should append to them.
         */
        public void addFeatures(List<String> tokens, PositionFeaturePairs positionFeaturePairs);
    }

    /**
     * Parallel-array accumulator of (position, featureName, featureValue) triples
     * appended to by the feature extractors before numberization.
     */
    public static class PositionFeaturePairs {
        public ArrayList<Integer> labelIndexes;
        public ArrayList<String> featureNames;
        public ArrayList<Double> featureValues;

        public PositionFeaturePairs() {
            labelIndexes = new ArrayList<Integer>();
            featureNames = new ArrayList<String>();
            featureValues = new ArrayList<Double>();
        }

        /** Adds a binary-valued (1.0) feature at the given token position. */
        public void add(int labelIndex, String featureID) {
            add(labelIndex, featureID, 1.0);
        }

        /** Adds a real-valued feature at the given token position. */
        public void add(int labelIndex, String featureID, double featureValue) {
            labelIndexes.add(labelIndex);
            featureNames.add(featureID);
            featureValues.add(featureValue);
        }

        public int size() { return featureNames.size(); }
    }

    ///////////////////////////////////////////////////////////////////////////
    //
    // Actual feature extractors

    private void initializeFeatureExtractors() throws IOException {
        allFeatureExtractors = new ArrayList<FeatureExtractorInterface>();
        allFeatureExtractors.add(new WordClusterPaths());
        allFeatureExtractors.add(new WordListFeatures.POSTagDict());
        allFeatureExtractors.add(new WordListFeatures.MetaphonePOSDict());
        allFeatureExtractors.add(new MiscFeatures.NgramSuffix(20));
        allFeatureExtractors.add(new MiscFeatures.NgramPrefix(20));
        allFeatureExtractors.add(new MiscFeatures.PrevWord());
        allFeatureExtractors.add(new MiscFeatures.NextWord());
        allFeatureExtractors.add(new MiscFeatures.WordformFeatures());
        allFeatureExtractors.add(new MiscFeatures.CapitalizationFeatures());
        allFeatureExtractors.add(new MiscFeatures.SimpleOrthFeatures());
        allFeatureExtractors.add(new MiscFeatures.PrevNext());
        allFeatureExtractors.add(new WordListFeatures.Listofnames("proper_names"));
        allFeatureExtractors.add(new WordListFeatures.Listofnames("celebs"));     // 2012-08-09 version of freebase celebrity list
        allFeatureExtractors.add(new WordListFeatures.Listofnames("videogame"));  // june 22 version of freebase video game list
        allFeatureExtractors.add(new WordListFeatures.Listofnames("mobyplaces")); // moby dictionary of US locations
        allFeatureExtractors.add(new WordListFeatures.Listofnames("family"));
        allFeatureExtractors.add(new WordListFeatures.Listofnames("male"));
        allFeatureExtractors.add(new WordListFeatures.Listofnames("female"));
        allFeatureExtractors.add(new MiscFeatures.Positions());
        //allFeatureExtractors.add(new Prev2Words());
        //allFeatureExtractors.add(new Next2Words());
        //allFeatureExtractors.add(new MiscFeatures.URLFeatures());
    }

    // for performance, figuring out a numberization approach faster than string concatenation might help
    // internet suggests that String.format() is slower than string concat
    // maybe can reuse a StringBuilder object? Ideally, would do direct manipulation of a char[] with reuse.
    // Or, if we move to randomized feature hashing, there are far faster methods
    // e.g. http://www.hpl.hp.com/techreports/2008/HPL-2008-91R1.pdf
}