All Downloads are FREE. The search and download functionality uses the official Maven repository.

edu.berkeley.nlp.discPCFG.EncodedDatum Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
/**
 * 
 */
package edu.berkeley.nlp.discPCFG;

import edu.berkeley.nlp.util.Counter;

/**
 * EncodedDatums are sparse representations of (labeled) feature count vectors for a given data point.  Use
 * getNumActiveFeatures() to see how many features are stored in a datum.  Then, use getFeatureIndex() and
 * getFeatureCount() to retrieve the number and count of each stored feature.  Use getLabelIndex() to get the
 * label's number.
 *
 * The feature indexes and feature counts are held in two parallel arrays: position i of one array corresponds to
 * position i of the other.
 */
public class EncodedDatum {

  /**
   * Encodes an unlabeled datum by delegating to
   * {@link #encodeLabeledDatum(Encoding, Counter, Object, double[])} with a null label and null weights.
   *
   * @param encoding the encoding used to map features to indexes
   * @param features the feature counts of the data point
   * @return the encoded datum; its label index is whatever the encoding reports for a null label
   */
  public static <F, L> EncodedDatum encodeDatum(Encoding<F, L> encoding, Counter<F> features) {
    return encodeLabeledDatum(encoding, features, null, null);
  }

  /**
   * Encodes a labeled datum.  Features for which the encoding reports a negative index are dropped; every other
   * feature is stored with its index and its count.
   *
   * @param encoding the encoding used to map features and the label to indexes
   * @param features the feature counts of the data point
   * @param label    the label of the data point, passed to the encoding's getLabelIndex()
   * @param weights  the probability of each substate of the label; stored as given, without copying
   * @return the encoded datum
   */
  public static <F, L> EncodedDatum encodeLabeledDatum(Encoding<F, L> encoding, Counter<F> features, L label,
                                                       double[] weights) {
    // First pass: keep only the features the encoding knows about (index >= 0).
    Counter<F> knownFeatures = new Counter<F>();
    for (F feature : features.keySet()) {
      if (encoding.getFeatureIndex(feature) < 0) {
        continue;
      }
      knownFeatures.incrementCount(feature, features.getCount(feature));
    }

    // Second pass: flatten the surviving features into two parallel arrays.
    int numActiveFeatures = knownFeatures.keySet().size();
    int[] featureIndexes = new int[numActiveFeatures];
    double[] featureCounts = new double[numActiveFeatures];
    int i = 0;
    for (F feature : knownFeatures.keySet()) {
      featureIndexes[i] = encoding.getFeatureIndex(feature);
      featureCounts[i] = knownFeatures.getCount(feature);
      i++;
    }

    int labelIndex = encoding.getLabelIndex(label);
    return new EncodedDatum(labelIndex, featureIndexes, featureCounts, weights);
  }

  int labelIndex;
  int[] featureIndexes;
  double[] featureCounts;
  double[] weights;  // the probability of each substate of the label (allows partial labeling)

  /**
   * @return the index of this datum's label
   */
  public int getLabelIndex() {
    return labelIndex;
  }

  /**
   * @return the weights array this datum was constructed with (not a copy); may be null
   */
  public double[] getWeights() {
    return weights;
  }

  /**
   * @return the number of features stored in this datum (the length of the feature count array)
   */
  public int getNumActiveFeatures() {
    return featureCounts.length;
  }

  /**
   * @param num the position of the feature within this datum, from 0 to getNumActiveFeatures() - 1
   * @return the index of the feature stored at that position
   */
  public int getFeatureIndex(int num) {
    return featureIndexes[num];
  }

  /**
   * @param num the position of the feature within this datum, from 0 to getNumActiveFeatures() - 1
   * @return the count of the feature stored at that position
   */
  public double getFeatureCount(int num) {
    return featureCounts[num];
  }

  /**
   * Creates a datum from already encoded values.  The arrays are stored as given, without copying.
   *
   * @param labelIndex     the index of the label
   * @param featureIndexes the indexes of the stored features
   * @param featureCounts  the counts of the stored features, parallel to featureIndexes
   * @param weights        the probability of each substate of the label
   */
  public EncodedDatum(int labelIndex, int[] featureIndexes, double[] featureCounts, double[] weights) {
    this.labelIndex = labelIndex;
    this.featureIndexes = featureIndexes;
    this.featureCounts = featureCounts;
    this.weights = weights;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy