![JAR search and dependency download from the Maven repository](/logo.png)
edu.berkeley.nlp.discPCFG.EncodedDatum Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation
Show all versions of berkeleyparser Show documentation
The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).
The newest version!
/**
*
*/
package edu.berkeley.nlp.discPCFG;
import edu.berkeley.nlp.util.Counter;
/**
* EncodedDatums are sparse representations of (labeled) feature count vectors for a given data point. Use
* getNumActiveFeatures() to see how many features have non-zero count in a datum. Then, use getFeatureIndex() and
* getFeatureCount() to retrieve the number and count of each non-zero feature. Use getLabelIndex() to get the
* label's number.
*/
public class EncodedDatum {

    /**
     * Encodes a datum that carries no label and no substate weights.
     *
     * Delegates to {@link #encodeLabeledDatum} with a null label and null weights, so the resulting
     * label index is whatever {@code encoding.getLabelIndex(null)} returns.
     *
     * @param <F>      feature type
     * @param <L>      label type
     * @param encoding maps features and labels to integer indexes
     * @param features feature counts of the data point
     * @return the sparse encoding of the features known to {@code encoding}
     */
    public static <F, L> EncodedDatum encodeDatum(Encoding<F, L> encoding, Counter<F> features) {
        return encodeLabeledDatum(encoding, features, null, null);
    }

    /**
     * Encodes a (possibly labeled) datum as parallel arrays of feature indexes and feature counts.
     *
     * Features for which {@code encoding.getFeatureIndex} returns a negative value are dropped; all
     * remaining features are stored together with the count reported by {@code features}.
     *
     * @param <F>      feature type
     * @param <L>      label type
     * @param encoding maps features and labels to integer indexes
     * @param features feature counts of the data point
     * @param label    label of the data point; passed unchanged to {@code encoding.getLabelIndex}
     * @param weights  stored as-is (not copied) in the returned datum; may be null
     * @return the sparse encoding of the features known to {@code encoding}
     */
    public static <F, L> EncodedDatum encodeLabeledDatum(Encoding<F, L> encoding, Counter<F> features, L label,
            double[] weights) {
        // First pass: keep only the features the encoding has an index for.
        Counter<F> knownFeatures = new Counter<F>();
        for (F feature : features.keySet()) {
            if (encoding.getFeatureIndex(feature) < 0) {
                continue;
            }
            knownFeatures.incrementCount(feature, features.getCount(feature));
        }

        // Second pass: lay the surviving features out as two parallel arrays. The iteration order of
        // knownFeatures determines the array order, so the filtered counter is deliberately kept.
        int numActiveFeatures = knownFeatures.keySet().size();
        int[] featureIndexes = new int[numActiveFeatures];
        double[] featureCounts = new double[numActiveFeatures];
        int i = 0;
        for (F feature : knownFeatures.keySet()) {
            featureIndexes[i] = encoding.getFeatureIndex(feature);
            featureCounts[i] = knownFeatures.getCount(feature);
            i++;
        }

        int labelIndex = encoding.getLabelIndex(label);
        return new EncodedDatum(labelIndex, featureIndexes, featureCounts, weights);
    }

    /** Index of this datum's label. */
    int labelIndex;

    /** Indexes of the active features; parallel to {@link #featureCounts}. */
    int[] featureIndexes;

    /** Counts of the active features; parallel to {@link #featureIndexes}. */
    double[] featureCounts;

    double[] weights; // the probability of each substate of the label (allows partial labeling)

    /**
     * @return the index of this datum's label
     */
    public int getLabelIndex() {
        return labelIndex;
    }

    /**
     * @return the weights array held by this datum (the internal array, not a copy); may be null
     */
    public double[] getWeights() {
        return weights;
    }

    /**
     * @return the number of features stored in this datum, i.e. the length of the count array
     */
    public int getNumActiveFeatures() {
        return featureCounts.length;
    }

    /**
     * @param num position of the feature within this datum, from 0 to getNumActiveFeatures() - 1
     * @return the feature index stored at that position
     * @throws ArrayIndexOutOfBoundsException if {@code num} is outside the stored range
     */
    public int getFeatureIndex(int num) {
        return featureIndexes[num];
    }

    /**
     * @param num position of the feature within this datum, from 0 to getNumActiveFeatures() - 1
     * @return the feature count stored at that position
     * @throws ArrayIndexOutOfBoundsException if {@code num} is outside the stored range
     */
    public double getFeatureCount(int num) {
        return featureCounts[num];
    }

    /**
     * Creates a datum from already-encoded values. The arrays are stored by reference, not copied.
     *
     * @param labelIndex     index of the label
     * @param featureIndexes indexes of the active features
     * @param featureCounts  counts of the active features, parallel to {@code featureIndexes}
     * @param weights        the probability of each substate of the label; may be null
     */
    public EncodedDatum(int labelIndex, int[] featureIndexes, double[] featureCounts, double[] weights) {
        this.labelIndex = labelIndex;
        this.featureIndexes = featureIndexes;
        this.featureCounts = featureCounts;
        this.weights = weights;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy