All downloads are free. Search and download functionality uses the official Maven repository.

com.aliasi.features.ZScoreFeatureExtractor Maven / Gradle / Ivy

Go to download

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.features;

import com.aliasi.classify.Classified;
import com.aliasi.classify.Classification;

import com.aliasi.corpus.Corpus;
import com.aliasi.corpus.ObjectHandler;

import com.aliasi.stats.OnlineNormalEstimator;

import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.FeatureExtractor;


import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

/**
 * A {@code ZScoreFeatureExtractor} converts features to their
 * z-scores, where means and deviations are determined by a
 * corpus supplied at compile time.

 * 

Means and standard deviations are computed for each feature * in the training section of the corpus supplied to the * constructor. * *

At run time, feature values are converted to z-scores, by: * *

 * z(feat,val) = (val - mean(feat))/stdDev(feat)
* * where {@code feat} is the feature, {@code val} is the value * to be converted to a z-score, {@code mean(feat)} is the mean * (average) of the feature in the training corpus, and * {@code stdDev(feat)} is the standard deviation of the feature * in the training course. * *

Z-score normalization ensures that the collection of each * feature's values has zero mean and unit standard deviation * over the training section of the training corpus. This does * not guarantee zero means and unit standard deviation over * the test section of the corpus. * *

Constant (Zero Deviation) Features

* *

If a feature is unseen or has zero standard deviation in the * training corpus, it is removed from all output. A feature only has * zero standard deviation if it has the same value every time it * occurs. For instance, all features seen only once will have zero * variance. Effectively, features which always have the same value * in the training set will be eliminated from future consideration. * *

Sparseness

* * Applying a z-score transform to features destroys sparseness. * Undefined features implicitly have value zero, but the z-score * of 0 is non-zero if the mean of the feature values is non-zero. * *

Serialization

* *

A length-norm feature extractor is serializable if its * base feature extractor is serializable. * * @author Mike Ross * @author Bob Carpenter * @version 4.0.0 * @since Lingpipe3.8 * @param The type of object whose features are extracted. */ public class ZScoreFeatureExtractor extends FeatureExtractorFilter implements Serializable { static final long serialVersionUID = -5628628145432035433L; final Map mFeatureToMeanDev; ZScoreFeatureExtractor(FeatureExtractor extractor, Map featureToMeanDev) { super(extractor); mFeatureToMeanDev = new LinkedHashMap(featureToMeanDev); } /** * Construct a z-core feature extractor from the specified base * feature extractor and the training section of the supplied * corpus. * * @param extractor Base feature extractor. * @param corpus The corpus whose training section will be visited * @throws IOException If there is an I/O error visting the corpus. */ public ZScoreFeatureExtractor(Corpus>> corpus, FeatureExtractor extractor) throws IOException { this(extractor,meanDevs(corpus,extractor)); } /** * Return the feature map resulting from converting the feature * map produced by the underlying feature extractor to z-scores. * See the class documentation above for definition. * * @param in Input object. * @return Feature map for the input object. */ public Map features(E in) { Map featureMap = super.features(in); Map result = new HashMap(); for (Map.Entry featMeanDev : mFeatureToMeanDev.entrySet()) { String feature = featMeanDev.getKey(); MeanDev meanDev = featMeanDev.getValue(); Number n = featureMap.get(feature); double val = meanDev.zScore(n == null ? 0.0 : featureMap.get(feature).doubleValue()); result.put(feature,val); } return result; } /** * Return the z-score for the specified feature and value. * See the class documentation above for definitions. * * @param feature Feature name. * @param value Value of feature. * @return The z-score of the value for the specified feature. 
*/ public double zScore(String feature, double value) { MeanDev meanDev = mFeatureToMeanDev.get(feature); return meanDev == null ? null : meanDev.zScore(value); } /** * Returns the mean for the specified feature, or * {@code Double.NaN} if the feature is not known. * * @param feature Feature whose mean is returned. * @return Mean for the specified feature. */ public double mean(String feature) { MeanDev meanDev = mFeatureToMeanDev.get(feature); return meanDev == null ? Double.NaN : meanDev.mMean; } /** * Returns the standard deviation for the specified feature, or * {@code Double.NaN} if the feature is not known. * * @param feature Feature whose standard deviation is returned. * @return Standard deviation for the specified feature. */ public double standardDeviation(String feature) { MeanDev meanDev = mFeatureToMeanDev.get(feature); return meanDev == null ? Double.NaN : meanDev.mDev; } /** * Returns an unmodifiable view of the known features * for this z-score feature extractor. * * @return The set of known features for this extractor. */ public Set knownFeatures() { return Collections.unmodifiableSet(mFeatureToMeanDev.keySet()); } /** * Returns a string representation of this z-score feature * extractor, listing the mean and deviation for each * feature. * * @return String representation of this extractor. 
*/ @Override public String toString() { StringBuilder sb = new StringBuilder(); for (Map.Entry entry : mFeatureToMeanDev.entrySet()) { String feature = entry.getKey(); MeanDev meanDev = entry.getValue(); sb.append("|"); sb.append(feature); sb.append("| "); sb.append(meanDev); sb.append('\n'); } return sb.toString(); } Object writeReplace() { return new Serializer(this); } static Map meanDevs(Corpus>> corpus, final FeatureExtractor extractor) throws IOException { final Set collectedFeatures = new HashSet(); corpus.visitTrain(new ObjectHandler>() { public void handle(Classified classified) { collectedFeatures.addAll(extractor.features(classified.getObject()).keySet()); } }); final Map featToEstimator = new HashMap(); //For each entry ("in") of the corpus's training section... corpus.visitTrain(new ObjectHandler>() { public void handle(Classified classified) { F in = classified.getObject(); //For each feature ("feat") of "in"... for (String feature : collectedFeatures) { Number value = extractor.features(in).get(feature); double v = value==null ? 0.0 : value.doubleValue(); //Get or create an OnlineNormalEstimator for that feature OnlineNormalEstimator estimator = featToEstimator.get(feature); if (estimator == null) { estimator = new OnlineNormalEstimator(); featToEstimator.put(feature,estimator); } //Send the feature's value to the estimator... 
estimator.handle(v); } } }); Map result = new HashMap(); for (Map.Entry entry : featToEstimator.entrySet()) { String feat = entry.getKey(); OnlineNormalEstimator estimator = entry.getValue(); double mean = estimator.mean(); double dev = estimator.standardDeviation(); if (dev > 0.0) result.put(feat,new MeanDev(mean,dev)); } return result; } static final class MeanDev { final double mMean; final double mDev; MeanDev(double mean, double dev) { mMean = mean; mDev = dev; } double zScore(double x) { return (x - mMean)/mDev; } public String toString() { return "mean=" + mMean + " dev=" + mDev; } } static class Serializer extends AbstractExternalizable { static final long serialVersionUID = 6365515337527915147L; private final ZScoreFeatureExtractor mFilter; public Serializer() { this(null); } public Serializer(ZScoreFeatureExtractor filter) { mFilter = filter; } @Override public void writeExternal(ObjectOutput out) throws IOException { out.writeObject(mFilter.baseExtractor()); out.writeInt(mFilter.mFeatureToMeanDev.size()); for (Map.Entry entry : mFilter.mFeatureToMeanDev.entrySet()) { out.writeUTF(entry.getKey()); out.writeDouble(entry.getValue().mMean); out.writeDouble(entry.getValue().mDev); } } @Override public Object read(ObjectInput in) throws IOException, ClassNotFoundException { // required for deserialization @SuppressWarnings("unchecked") FeatureExtractor extractor = (FeatureExtractor) in.readObject(); int numFeats = in.readInt(); Map featureToMeanDev = new HashMap((3 * numFeats)/2); for (int i = 0; i < numFeats; ++i) { String feature = in.readUTF(); double mean = in.readDouble(); double dev = in.readDouble(); featureToMeanDev.put(feature,new MeanDev(mean,dev)); } return new ZScoreFeatureExtractor(extractor,featureToMeanDev); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy