marytts.features.FeatureDefinition Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2006 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.features;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import marytts.util.io.StreamUtils;
import marytts.util.string.ByteStringTranslator;
import marytts.util.string.IntStringTranslator;
import marytts.util.string.ShortStringTranslator;
/**
* A feature definition object represents the "meaning" of feature vectors. It consists of a list of byte-valued, short-valued and
* continuous features by name and index position in the feature vector; the respective possible feature values (and corresponding
* byte and short codes); and, optionally, the weights and, for continuous features, weighting functions for each feature.
*
* @author Marc Schröder
* @author steiner
*/
public class FeatureDefinition {
/** Header line that opens the byte-valued feature section of a textual feature definition. */
public static final String BYTEFEATURES = "ByteValuedFeatureProcessors";
/** Header line that closes the byte-valued section and opens the short-valued section. */
public static final String SHORTFEATURES = "ShortValuedFeatureProcessors";
/** Header line that closes the short-valued section and opens the continuous section. */
public static final String CONTINUOUSFEATURES = "ContinuousFeatureProcessors";
/** Header line that ends the continuous section and announces similarity matrices. */
public static final String FEATURESIMILARITY = "FeatureSimilarity";
/** Separates the weight definition (left) from the feature definition (right) in a textual line. */
public static final char WEIGHT_SEPARATOR = '|';
// The following constants are not used in the visible part of this file;
// presumably they name the "edge" feature and its values -- confirm against callers.
public static final String EDGEFEATURE = "edge";
public static final String EDGEFEATURE_START = "start";
public static final String EDGEFEATURE_END = "end";
// Not used in the visible part of this file; presumably the string for an undefined value -- TODO confirm.
public static final String NULLVALUE = "0";
// Number of features of each kind; global feature indices run byte, then short, then continuous.
private int numByteFeatures;
private int numShortFeatures;
private int numContinuousFeatures;
// One weight per feature, indexed by global feature index; stays null when the textual
// constructor is called with readWeights == false.
private float[] featureWeights;
// Maps global feature index <-> feature name.
private IntStringTranslator featureNames;
// feature values: for byte and short features only
private ByteStringTranslator[] byteFeatureValues;
private ShortStringTranslator[] shortFeatureValues;
private String[] floatWeightFuncts; // for continuous features only
// Optional per-byte-feature symmetric similarity matrices, indexed [featureIndex][value][value];
// null when no FeatureSimilarity section was read.
private float[][][] similarityMatrices = null;
/**
 * Create a feature definition object, reading textual data from the given BufferedReader.
 * <p>
 * The expected layout is: optional leading comment ({@code #}) or blank lines, the {@link #BYTEFEATURES} header, one
 * line per byte-valued feature, the {@link #SHORTFEATURES} header, one line per short-valued feature, the
 * {@link #CONTINUOUSFEATURES} header and one line per continuous feature. The continuous section ends at the end of input,
 * at an empty line, or at the {@link #FEATURESIMILARITY} header, after which similarity matrices are read.
 * <p>
 * Byte and short feature lines have the form {@code [weight |] name value1 value2 ...}; continuous feature lines have the
 * form {@code [weight weightFunction |] name [float]}. The bracketed weight part is required exactly if
 * {@code readWeights} is true.
 *
 * @param input
 *            a BufferedReader from which a textual feature definition can be read.
 * @param readWeights
 *            a boolean indicating whether or not to read weights from input. If weights are read, they will be normalized
 *            so that they sum to one.
 * @throws IOException
 *             if a reading problem occurs, if the input ends before all three section headers were seen, if a weight
 *             separator is missing or a weight is negative, or if a byte/short feature line lists no values
 * @throws RuntimeException
 *             if a continuous feature's weight definition lacks the weighting function name
 */
public FeatureDefinition(BufferedReader input, boolean readWeights) throws IOException {
    // Section BYTEFEATURES: skip leading comment and blank lines. The null check must be part of the
    // loop condition, otherwise input consisting only of comments ends in a NullPointerException.
    String line = input.readLine();
    while (line != null && (line.matches("^\\s*#.*") || line.matches("\\s*"))) {
        line = input.readLine();
    }
    if (line == null)
        throw new IOException("Could not read from input");
    if (!line.trim().equals(BYTEFEATURES)) {
        throw new IOException("Unexpected input: expected '" + BYTEFEATURES + "', read '" + line + "'");
    }
    List<String> byteFeatureLines = new ArrayList<String>();
    while (true) {
        line = input.readLine();
        if (line == null)
            throw new IOException("Could not read from input");
        line = line.trim();
        if (line.equals(SHORTFEATURES))
            break; // Found end of section
        byteFeatureLines.add(line);
    }
    // Section SHORTFEATURES
    List<String> shortFeatureLines = new ArrayList<String>();
    while (true) {
        line = input.readLine();
        if (line == null)
            throw new IOException("Could not read from input");
        line = line.trim();
        if (line.equals(CONTINUOUSFEATURES))
            break; // Found end of section
        shortFeatureLines.add(line);
    }
    // Section CONTINUOUSFEATURES
    List<String> continuousFeatureLines = new ArrayList<String>();
    boolean readFeatureSimilarity = false;
    while ((line = input.readLine()) != null) { // it's OK if we hit the end of the file now
        line = line.trim();
        if (line.equals(FEATURESIMILARITY)) {
            // The matrices are read at the very end, once names and values are known.
            readFeatureSimilarity = true;
            break;
        } else if (line.equals("")) { // empty line: end of section
            break;
        }
        continuousFeatureLines.add(line);
    }
    numByteFeatures = byteFeatureLines.size();
    numShortFeatures = shortFeatureLines.size();
    numContinuousFeatures = continuousFeatureLines.size();
    int total = numByteFeatures + numShortFeatures + numContinuousFeatures;
    featureNames = new IntStringTranslator(total);
    byteFeatureValues = new ByteStringTranslator[numByteFeatures];
    shortFeatureValues = new ShortStringTranslator[numShortFeatures];
    float sumOfWeights = 0; // for normalisation of weights
    if (readWeights) {
        featureWeights = new float[total];
        floatWeightFuncts = new String[numContinuousFeatures];
    }
    for (int i = 0; i < numByteFeatures; i++) {
        line = byteFeatureLines.get(i);
        String featureDef;
        if (readWeights) {
            int seppos = line.indexOf(WEIGHT_SEPARATOR);
            if (seppos == -1)
                throw new IOException("Weight separator '" + WEIGHT_SEPARATOR + "' not found in line '" + line + "'");
            String weightDef = line.substring(0, seppos).trim();
            featureDef = line.substring(seppos + 1).trim();
            // The weight definition is simply the float number:
            featureWeights[i] = Float.parseFloat(weightDef);
            sumOfWeights += featureWeights[i];
            if (featureWeights[i] < 0)
                throw new IOException("Negative weight found in line '" + line + "'");
        } else {
            featureDef = line;
        }
        // Now featureDef is a String in which the feature name and all feature values
        // are separated by white space.
        String[] nameAndValues = featureDef.split("\\s+", 2);
        if (nameAndValues.length < 2)
            throw new IOException("No feature values found in line '" + line + "'");
        featureNames.set(i, nameAndValues[0]); // the feature name
        byteFeatureValues[i] = new ByteStringTranslator(nameAndValues[1].split("\\s+")); // the feature values
    }
    for (int i = 0; i < numShortFeatures; i++) {
        int index = numByteFeatures + i; // global feature index
        line = shortFeatureLines.get(i);
        String featureDef;
        if (readWeights) {
            int seppos = line.indexOf(WEIGHT_SEPARATOR);
            if (seppos == -1)
                throw new IOException("Weight separator '" + WEIGHT_SEPARATOR + "' not found in line '" + line + "'");
            String weightDef = line.substring(0, seppos).trim();
            featureDef = line.substring(seppos + 1).trim();
            // The weight definition is simply the float number:
            featureWeights[index] = Float.parseFloat(weightDef);
            sumOfWeights += featureWeights[index];
            if (featureWeights[index] < 0)
                throw new IOException("Negative weight found in line '" + line + "'");
        } else {
            featureDef = line;
        }
        // Now featureDef is a String in which the feature name and all feature values
        // are separated by white space.
        String[] nameAndValues = featureDef.split("\\s+", 2);
        if (nameAndValues.length < 2)
            throw new IOException("No feature values found in line '" + line + "'");
        featureNames.set(index, nameAndValues[0]); // the feature name
        shortFeatureValues[i] = new ShortStringTranslator(nameAndValues[1].split("\\s+")); // the feature values
    }
    for (int i = 0; i < numContinuousFeatures; i++) {
        int index = numByteFeatures + numShortFeatures + i; // global feature index
        line = continuousFeatureLines.get(i);
        String featureDef;
        if (readWeights) {
            int seppos = line.indexOf(WEIGHT_SEPARATOR);
            if (seppos == -1)
                throw new IOException("Weight separator '" + WEIGHT_SEPARATOR + "' not found in line '" + line + "'");
            String weightDef = line.substring(0, seppos).trim();
            featureDef = line.substring(seppos + 1).trim();
            // The weight definition is the float number plus a definition of a weight function:
            String[] weightAndFunction = weightDef.split("\\s+", 2);
            featureWeights[index] = Float.parseFloat(weightAndFunction[0]);
            sumOfWeights += featureWeights[index];
            if (featureWeights[index] < 0)
                throw new IOException("Negative weight found in line '" + line + "'");
            // Check the length explicitly instead of catching ArrayIndexOutOfBoundsException.
            if (weightAndFunction.length < 2) {
                throw new RuntimeException("The string [" + weightDef + "] appears to be a badly formed"
                        + " weight plus weighting function definition.");
            }
            floatWeightFuncts[i] = weightAndFunction[1];
        } else {
            featureDef = line;
        }
        // Now featureDef is the feature name
        // or the feature name followed by the word "float"
        if (featureDef.endsWith("float")) {
            String[] featureDefSplit = featureDef.split("\\s+", 2);
            featureNames.set(index, featureDefSplit[0]);
        } else {
            featureNames.set(index, featureDef);
        }
    }
    // Normalize weights to sum to one:
    if (readWeights) {
        for (int i = 0; i < total; i++) {
            featureWeights[i] /= sumOfWeights;
        }
    }
    // read feature similarities here, if any
    if (readFeatureSimilarity) {
        readFeatureSimilarityMatrices(input);
    }
}
/**
 * Read similarity matrices from the remainder of a textual feature definition.
 * <p>
 * Each matrix starts with a header line holding the name of a byte-valued feature followed by its values. It is followed
 * by one line per value, in the same order: the value itself, then its similarities to all preceding values (so the i-th
 * line has exactly i tokens). The matrix is filled symmetrically; entries that are not listed, including the diagonal,
 * stay 0. Reading stops at the end of input or at the first blank line.
 *
 * @param input
 *            reader positioned just after the {@link #FEATURESIMILARITY} header line
 * @throws IOException
 *             if a reading problem occurs
 * @throws RuntimeException
 *             if a matrix is given for a feature that is not byte-valued, or if the matrix lines do not have the expected
 *             layout
 */
private void readFeatureSimilarityMatrices(BufferedReader input) throws IOException {
    // A new array of arrays holds only null entries, so features without a matrix need no explicit reset.
    similarityMatrices = new float[this.getNumberOfByteFeatures()][][];
    String line;
    while ((line = input.readLine()) != null) {
        // A blank line ends the section; trim so that a whitespace-only line counts as blank too.
        if (line.trim().isEmpty()) {
            return;
        }
        String[] featureUniqueValues = line.trim().split("\\s+");
        String featureName = featureUniqueValues[0];
        if (!isByteFeature(featureName)) {
            throw new RuntimeException(
                    "Similarity matrix support is for bytefeatures only, but not for other feature types...");
        }
        int featureIndex = this.getFeatureIndex(featureName);
        int noUniqValues = featureUniqueValues.length - 1;
        // Java zero-initialises float arrays, so unlisted entries are 0 without an explicit fill.
        float[][] matrix = new float[noUniqValues][noUniqValues];
        similarityMatrices[featureIndex] = matrix;
        for (int i = 1; i <= noUniqValues; i++) {
            String featureValue = featureUniqueValues[i];
            String matLine = input.readLine();
            if (matLine == null) {
                throw new RuntimeException("Feature definition file is having unexpected format...");
            }
            String[] lines = matLine.trim().split("\\s+");
            if (!featureValue.equals(lines[0])) {
                throw new RuntimeException("Feature definition file is having unexpected format...");
            }
            if (lines.length != i) {
                throw new RuntimeException("Feature definition file is having unexpected format...");
            }
            for (int j = 1; j < i; j++) {
                // parseFloat avoids the deprecated boxing constructor Float(String).
                float similarity = Float.parseFloat(lines[j]);
                matrix[i - 1][j - 1] = similarity;
                matrix[j - 1][i - 1] = similarity;
            }
        }
    }
}
/**
 * Build a feature definition from its binary representation, read sequentially from a DataInput.
 * <p>
 * The stream holds three sections in this order: byte-valued, short-valued and continuous features. Each section starts
 * with an int feature count. A byte or short feature record is: float weight, UTF name, number of values (one unsigned
 * byte for byte features, a short for short features), then that many UTF value strings. A continuous feature record is:
 * float weight, UTF weighting function name, UTF feature name. Weights are stored as read, without normalisation.
 *
 * @param input
 *            a DataInputStream or a RandomAccessFile from which a binary feature definition can be read.
 * @throws IOException
 *             if a reading problem occurs
 */
public FeatureDefinition(DataInput input) throws IOException {
    // ---- byte-valued features ----
    numByteFeatures = input.readInt();
    byteFeatureValues = new ByteStringTranslator[numByteFeatures];
    // The totals are not known yet: size everything for the byte section only.
    // featureWeights is re-allocated by hand below once further counts have been read.
    featureNames = new IntStringTranslator(numByteFeatures);
    featureWeights = new float[numByteFeatures];
    for (int feature = 0; feature < numByteFeatures; feature++) {
        featureWeights[feature] = input.readFloat();
        featureNames.set(feature, input.readUTF());
        // The value count is stored as an unsigned byte.
        int valueCount = input.readByte() & 0xFF;
        byteFeatureValues[feature] = new ByteStringTranslator(valueCount);
        for (int code = 0; code < valueCount; code++) {
            byteFeatureValues[feature].set((byte) code, input.readUTF());
        }
    }
    // ---- short-valued features ----
    numShortFeatures = input.readInt();
    if (numShortFeatures > 0) {
        shortFeatureValues = new ShortStringTranslator[numShortFeatures];
        // Grow the weight array, keeping the byte feature weights.
        float[] grown = new float[numByteFeatures + numShortFeatures];
        System.arraycopy(featureWeights, 0, grown, 0, numByteFeatures);
        featureWeights = grown;
        for (int k = 0; k < numShortFeatures; k++) {
            int feature = numByteFeatures + k;
            featureWeights[feature] = input.readFloat();
            featureNames.set(feature, input.readUTF());
            short valueCount = input.readShort();
            shortFeatureValues[k] = new ShortStringTranslator(valueCount);
            for (short code = 0; code < valueCount; code++) {
                shortFeatureValues[k].set(code, input.readUTF());
            }
        }
    }
    // ---- continuous features ----
    numContinuousFeatures = input.readInt();
    floatWeightFuncts = new String[numContinuousFeatures];
    if (numContinuousFeatures > 0) {
        // Grow the weight array once more, keeping byte and short feature weights.
        float[] grown = new float[numByteFeatures + numShortFeatures + numContinuousFeatures];
        System.arraycopy(featureWeights, 0, grown, 0, numByteFeatures + numShortFeatures);
        featureWeights = grown;
    }
    for (int k = 0; k < numContinuousFeatures; k++) {
        int feature = numByteFeatures + numShortFeatures + k;
        featureWeights[feature] = input.readFloat();
        floatWeightFuncts[k] = input.readUTF();
        featureNames.set(feature, input.readUTF());
    }
}
/**
 * Build a feature definition from its binary representation, read from the current position of a byte buffer.
 * <p>
 * The layout is the same as for the DataInput constructor: three sections (byte-valued, short-valued, continuous), each
 * starting with an int feature count. Strings are read with {@code StreamUtils.readUTF}. Weights are stored as read,
 * without normalisation.
 *
 * @param bb
 *            a byte buffer from which a binary feature definition can be read.
 * @throws IOException
 *             if a reading problem occurs
 */
public FeatureDefinition(ByteBuffer bb) throws IOException {
    // ---- byte-valued features ----
    numByteFeatures = bb.getInt();
    byteFeatureValues = new ByteStringTranslator[numByteFeatures];
    // Totals are unknown at this point, so allocate for the byte section only;
    // featureWeights is re-allocated by hand further down.
    featureNames = new IntStringTranslator(numByteFeatures);
    featureWeights = new float[numByteFeatures];
    for (int feature = 0; feature < numByteFeatures; feature++) {
        featureWeights[feature] = bb.getFloat();
        featureNames.set(feature, StreamUtils.readUTF(bb));
        // The value count is stored as an unsigned byte.
        int valueCount = bb.get() & 0xFF;
        byteFeatureValues[feature] = new ByteStringTranslator(valueCount);
        for (int code = 0; code < valueCount; code++) {
            byteFeatureValues[feature].set((byte) code, StreamUtils.readUTF(bb));
        }
    }
    // ---- short-valued features ----
    numShortFeatures = bb.getInt();
    if (numShortFeatures > 0) {
        shortFeatureValues = new ShortStringTranslator[numShortFeatures];
        // Enlarge the weight array while preserving the byte feature weights.
        float[] enlarged = new float[numByteFeatures + numShortFeatures];
        System.arraycopy(featureWeights, 0, enlarged, 0, numByteFeatures);
        featureWeights = enlarged;
        for (int k = 0; k < numShortFeatures; k++) {
            int feature = numByteFeatures + k;
            featureWeights[feature] = bb.getFloat();
            featureNames.set(feature, StreamUtils.readUTF(bb));
            short valueCount = bb.getShort();
            shortFeatureValues[k] = new ShortStringTranslator(valueCount);
            for (short code = 0; code < valueCount; code++) {
                shortFeatureValues[k].set(code, StreamUtils.readUTF(bb));
            }
        }
    }
    // ---- continuous features ----
    numContinuousFeatures = bb.getInt();
    floatWeightFuncts = new String[numContinuousFeatures];
    if (numContinuousFeatures > 0) {
        // Enlarge once more, preserving byte and short feature weights.
        float[] enlarged = new float[numByteFeatures + numShortFeatures + numContinuousFeatures];
        System.arraycopy(featureWeights, 0, enlarged, 0, numByteFeatures + numShortFeatures);
        featureWeights = enlarged;
    }
    for (int k = 0; k < numContinuousFeatures; k++) {
        int feature = numByteFeatures + numShortFeatures + k;
        featureWeights[feature] = bb.getFloat();
        floatWeightFuncts[k] = StreamUtils.readUTF(bb);
        featureNames.set(feature, StreamUtils.readUTF(bb));
    }
}
/**
 * Serialise this feature definition in binary form.
 * <p>
 * Three sections are written in order (byte-valued, short-valued, continuous), each preceded by its int feature count.
 * A byte or short feature record is: float weight, UTF name, number of values (one byte, respectively one short), then
 * the UTF value strings. A continuous feature record is: float weight, UTF weighting function name, UTF feature name.
 * If this definition holds no weights, weight 0 and an empty weighting function name are written instead.
 *
 * @param out
 *            a DataOutputStream or RandomAccessFile to which the FeatureDefinition should be written.
 * @throws IOException
 *             if a problem occurs while writing.
 */
public void writeBinaryTo(DataOutput out) throws IOException {
    // TODO to avoid duplicate code, replace this with writeBinaryTo(out, List()) or some such
    final boolean weighted = featureWeights != null;
    final int shortStart = numByteFeatures;
    final int continuousStart = numByteFeatures + numShortFeatures;
    // ---- byte-valued features ----
    out.writeInt(numByteFeatures);
    for (int feature = 0; feature < shortStart; feature++) {
        if (weighted) {
            out.writeFloat(featureWeights[feature]);
        } else {
            out.writeFloat(0);
        }
        out.writeUTF(getFeatureName(feature));
        int valueCount = getNumberOfValues(feature);
        out.writeByte((byte) valueCount); // stored as an unsigned byte
        for (int code = 0; code < valueCount; code++) {
            out.writeUTF(getFeatureValueAsString(feature, code));
        }
    }
    // ---- short-valued features ----
    out.writeInt(numShortFeatures);
    for (int feature = shortStart; feature < continuousStart; feature++) {
        if (weighted) {
            out.writeFloat(featureWeights[feature]);
        } else {
            out.writeFloat(0);
        }
        out.writeUTF(getFeatureName(feature));
        short valueCount = (short) getNumberOfValues(feature);
        out.writeShort(valueCount);
        for (short code = 0; code < valueCount; code++) {
            out.writeUTF(getFeatureValueAsString(feature, code));
        }
    }
    // ---- continuous features ----
    out.writeInt(numContinuousFeatures);
    for (int feature = continuousStart; feature < continuousStart + numContinuousFeatures; feature++) {
        if (weighted) {
            out.writeFloat(featureWeights[feature]);
            out.writeUTF(floatWeightFuncts[feature - numByteFeatures - numShortFeatures]);
        } else {
            out.writeFloat(0);
            out.writeUTF("");
        }
        out.writeUTF(getFeatureName(feature));
    }
}
/**
 * Write this feature definition in binary format to the given output, leaving out the features listed in
 * {@code featuresToDrop}.
 * <p>
 * The record layout is the same as for {@code writeBinaryTo(DataOutput)}; the section counts are reduced by the number
 * of distinct features dropped from each section. Indices that occur more than once are dropped once, and indices
 * outside {@code [0, getNumberOfFeatures())} are ignored, so the written counts always match the records that follow.
 *
 * @param out
 *            a DataOutputStream or RandomAccessFile to which the FeatureDefinition should be written.
 * @param featuresToDrop
 *            List of Integers containing the indices of features to leave out
 * @throws IOException
 *             if a problem occurs while writing.
 */
private void writeBinaryTo(DataOutput out, List<Integer> featuresToDrop) throws IOException {
    final int shortStart = numByteFeatures;
    final int continuousStart = numByteFeatures + numShortFeatures;
    final int total = continuousStart + numContinuousFeatures;
    // Mark dropped features in a mask: constant-time lookup in the loops below, and each
    // feature is counted at most once even if its index is listed repeatedly.
    boolean[] dropped = new boolean[total];
    int droppedByteFeatures = 0;
    int droppedShortFeatures = 0;
    int droppedContinuousFeatures = 0;
    for (int f : featuresToDrop) {
        if (f < 0 || f >= total || dropped[f]) {
            continue; // unknown or already counted
        }
        dropped[f] = true;
        if (f < shortStart) {
            droppedByteFeatures++;
        } else if (f < continuousStart) {
            droppedShortFeatures++;
        } else {
            droppedContinuousFeatures++;
        }
    }
    // Section BYTEFEATURES
    out.writeInt(numByteFeatures - droppedByteFeatures);
    for (int i = 0; i < shortStart; i++) {
        if (dropped[i]) {
            continue;
        }
        if (featureWeights != null) {
            out.writeFloat(featureWeights[i]);
        } else {
            out.writeFloat(0);
        }
        out.writeUTF(getFeatureName(i));
        int numValues = getNumberOfValues(i);
        byte numValuesEncoded = (byte) numValues; // an unsigned byte
        out.writeByte(numValuesEncoded);
        for (int b = 0; b < numValues; b++) {
            String value = getFeatureValueAsString(i, b);
            out.writeUTF(value);
        }
    }
    // Section SHORTFEATURES
    out.writeInt(numShortFeatures - droppedShortFeatures);
    for (int i = shortStart; i < continuousStart; i++) {
        if (dropped[i]) {
            continue;
        }
        if (featureWeights != null) {
            out.writeFloat(featureWeights[i]);
        } else {
            out.writeFloat(0);
        }
        out.writeUTF(getFeatureName(i));
        short numValues = (short) getNumberOfValues(i);
        out.writeShort(numValues);
        for (short b = 0; b < numValues; b++) {
            String value = getFeatureValueAsString(i, b);
            out.writeUTF(value);
        }
    }
    // Section CONTINUOUSFEATURES
    out.writeInt(numContinuousFeatures - droppedContinuousFeatures);
    for (int i = continuousStart; i < total; i++) {
        if (dropped[i]) {
            continue;
        }
        if (featureWeights != null) {
            out.writeFloat(featureWeights[i]);
            out.writeUTF(floatWeightFuncts[i - continuousStart]);
        } else {
            out.writeFloat(0);
            out.writeUTF("");
        }
        out.writeUTF(getFeatureName(i));
    }
}
/**
 * Report how many features this definition holds in total, i.e. byte-valued, short-valued and continuous features
 * together.
 *
 * @return the total number of features
 */
public int getNumberOfFeatures() {
    int discreteFeatures = numByteFeatures + numShortFeatures;
    return discreteFeatures + numContinuousFeatures;
}
/**
 * Report how many byte-valued features this definition holds.
 *
 * @return the number of byte-valued features
 */
public int getNumberOfByteFeatures() {
    return this.numByteFeatures;
}
/**
 * Report how many short-valued features this definition holds.
 *
 * @return the number of short-valued features
 */
public int getNumberOfShortFeatures() {
    return this.numShortFeatures;
}
/**
 * Report how many continuous features this definition holds.
 *
 * @return the number of continuous features
 */
public int getNumberOfContinuousFeatures() {
    return this.numContinuousFeatures;
}
/**
 * Look up the weight stored for one feature.
 * <p>
 * The weight array is indexed directly; no check is made that weights were loaded or that the index is in range.
 *
 * @param featureIndex
 *            global index of the feature (byte features first, then short, then continuous)
 * @return the weight stored for that feature; the textual constructor rejects negative weights
 */
public float getWeight(int featureIndex) {
    final float[] weights = this.featureWeights;
    return weights[featureIndex];
}
/**
 * Give access to the weights of all features.
 *
 * @return the internal weight array itself (not a copy), indexed by global feature index; null if this definition was
 *         read from text without weights
 */
public float[] getFeatureWeights() {
    return this.featureWeights;
}
/**
 * Get the name of any weighting function associated with the given feature index. For byte-valued and short-valued
 * features, this method will always return null; for continuous features, the method will return the name of a weighting
 * function, or null. Null is also returned when this definition holds no weighting functions at all (e.g. it was read
 * from text without weights).
 *
 * @param featureIndex
 *            global index of the feature
 * @return the name of a weighting function, or null
 * @throws ArrayIndexOutOfBoundsException
 *             if weighting functions are present and featureIndex is negative or not smaller than
 *             getNumberOfFeatures()
 */
public String getWeightFunctionName(int featureIndex) {
    if (floatWeightFuncts == null) {
        return null; // no weighting functions were loaded
    }
    int continuousIndex = featureIndex - numByteFeatures - numShortFeatures;
    if (featureIndex >= 0 && continuousIndex < 0) {
        // A valid byte-valued or short-valued feature: these never have a weighting function.
        return null;
    }
    return floatWeightFuncts[continuousIndex];
}
// //////////////////// META-INFORMATION METHODS ///////////////////////
/**
 * Look up the name of the feature stored at a given index.
 * <p>
 * The lookup is delegated to the name translator. How an out-of-range index is reported depends on that class, which is
 * not part of this file (presumably an IndexOutOfBoundsException).
 *
 * @param index
 *            a feature index, as could be used to access a feature value in a FeatureVector.
 * @return the name of the feature corresponding to the index
 */
public String getFeatureName(int index) {
    return this.featureNames.get(index);
}
/**
 * Look up the names of several features at once.
 *
 * @param index
 *            an array of feature indexes, as could be used to access a feature value in a FeatureVector.
 * @return a new array of the same length, holding at each position the name of the feature whose index is at that
 *         position in the argument
 */
public String[] getFeatureNameArray(int[] index) {
    String[] names = new String[index.length];
    int position = 0;
    for (int featureIndex : index) {
        names[position++] = getFeatureName(featureIndex);
    }
    return names;
}
/**
 * Collect the names of all features, in index order.
 *
 * @return a new array with one name per feature
 */
public String[] getFeatureNameArray() {
    final int count = getNumberOfFeatures();
    String[] allNames = new String[count];
    int featureIndex = 0;
    while (featureIndex < count) {
        allNames[featureIndex] = getFeatureName(featureIndex);
        featureIndex++;
    }
    return allNames;
}
/**
 * Collect the names of the byte-valued features, in index order.
 *
 * @return a new array with one name per byte-valued feature
 */
public String[] getByteFeatureNameArray() {
    String[] names = new String[numByteFeatures];
    // Byte-valued features occupy the first numByteFeatures global indices.
    for (int featureIndex = 0; featureIndex < names.length; featureIndex++) {
        assert isByteFeature(featureIndex);
        names[featureIndex] = getFeatureName(featureIndex);
    }
    return names;
}
/**
 * Collect the names of the short-valued features, in index order.
 *
 * @return a new array with one name per short-valued feature
 */
public String[] getShortFeatureNameArray() {
    String[] names = new String[numShortFeatures];
    // Short-valued features follow directly after the byte-valued ones.
    for (int k = 0; k < names.length; k++) {
        final int featureIndex = k + numByteFeatures;
        assert isShortFeature(featureIndex);
        names[k] = getFeatureName(featureIndex);
    }
    return names;
}
/**
 * Collect the names of the continuous features, in index order.
 *
 * @return a new array with one name per continuous feature
 */
public String[] getContinuousFeatureNameArray() {
    String[] names = new String[numContinuousFeatures];
    // Continuous features come last, after all byte-valued and short-valued features.
    for (int k = 0; k < names.length; k++) {
        final int featureIndex = numByteFeatures + numShortFeatures + k;
        assert isContinuousFeature(featureIndex);
        names[k] = getFeatureName(featureIndex);
    }
    return names;
}
/**
 * Build a single string listing all feature names in their order of definition, separated by single spaces.
 *
 * @return the space-separated feature names; the empty string if there are no features
 */
public String getFeatureNames() {
    final int count = getNumberOfFeatures();
    StringBuilder joined = new StringBuilder();
    for (int featureIndex = 0; featureIndex < count; featureIndex++) {
        // A separator is only needed once something has been written.
        if (joined.length() != 0) {
            joined.append(" ");
        }
        joined.append(featureNames.get(featureIndex));
    }
    return joined.toString();
}
/**
 * Tell whether a feature of the given name is part of this definition.
 *
 * @param name
 *            the feature name in question, e.g. "next_next_phone"
 * @return whatever the name translator's {@code contains(name)} reports
 */
public boolean hasFeature(String name) {
    return this.featureNames.contains(name);
}
/**
 * Tell whether the given value is a known value of the feature with the given name. The name is first resolved to a
 * feature index via getFeatureIndex(featureName); the answer is then that of hasFeatureValue(int, String).
 *
 * @param featureName
 *            name of the feature to query
 * @param featureValue
 *            the value string to look for
 * @return true if the feature is byte-valued or short-valued and lists the value, false otherwise
 */
public boolean hasFeatureValue(String featureName, String featureValue) {
    final int featureIndex = getFeatureIndex(featureName);
    return hasFeatureValue(featureIndex, featureValue);
}
/**
 * Tell whether the given value is a known value of the feature at the given index.
 * <p>
 * Only byte-valued and short-valued features have enumerated values; for any other index (negative, continuous, or
 * beyond the last feature) the answer is false.
 *
 * @param featureIndex
 *            global index of the feature to query
 * @param featureValue
 *            the value string to look for
 * @return true if the feature is byte-valued or short-valued and its value translator contains the value, false
 *         otherwise
 */
public boolean hasFeatureValue(int featureIndex, String featureValue) {
    if (featureIndex >= 0) {
        if (featureIndex < numByteFeatures) {
            return byteFeatureValues[featureIndex].contains(featureValue);
        } else if (featureIndex < numByteFeatures + numShortFeatures) {
            // Short features are stored in their own array, offset by the byte feature count.
            int shortIndex = featureIndex - numByteFeatures;
            return shortFeatureValues[shortIndex].contains(featureValue);
        }
    }
    return false;
}
/**
 * Tell whether the named feature is byte-valued.
 *
 * @param featureName
 *            name of the feature to query
 * @return true if the feature is a byte feature; false if it is not, or if resolving the name raised any exception
 *         (e.g. because the name is unknown)
 */
public boolean isByteFeature(String featureName) {
    boolean result;
    try {
        result = isByteFeature(getFeatureIndex(featureName));
    } catch (Exception e) {
        // Any lookup failure is treated as "not a byte feature".
        result = false;
    }
    return result;
}
/**
 * Tell whether the feature at the given index is byte-valued, i.e. whether the index lies in
 * {@code [0, numByteFeatures)}.
 *
 * @param index
 *            global feature index; any int is accepted
 * @return true if the index denotes a byte feature, false otherwise (including invalid indices)
 */
public boolean isByteFeature(int index) {
    if (index < 0) {
        return false;
    }
    return index < numByteFeatures;
}
/**
 * Tell whether the named feature is short-valued.
 *
 * @param featureName
 *            name of the feature to query
 * @return true if the feature is a short feature; false if it is not, or if resolving the name raised any exception
 *         (e.g. because the name is unknown)
 */
public boolean isShortFeature(String featureName) {
    boolean result;
    try {
        result = isShortFeature(getFeatureIndex(featureName));
    } catch (Exception e) {
        // Any lookup failure is treated as "not a short feature".
        result = false;
    }
    return result;
}
/**
 * Tell whether the feature at the given index is short-valued. Short features occupy the indices directly after the
 * byte features.
 *
 * @param index
 *            global feature index; any int is accepted
 * @return true if the index denotes a short feature, false otherwise (including invalid indices)
 */
public boolean isShortFeature(int index) {
    final int offsetInSection = index - numByteFeatures;
    if (offsetInSection < 0) {
        return false;
    }
    return offsetInSection < numShortFeatures;
}
/**
 * Tell whether the named feature is continuous.
 *
 * @param featureName
 *            name of the feature to query
 * @return true if the feature is a continuous feature; false if it is not, or if resolving the name raised any
 *         exception (e.g. because the name is unknown)
 */
public boolean isContinuousFeature(String featureName) {
    boolean result;
    try {
        result = isContinuousFeature(getFeatureIndex(featureName));
    } catch (Exception e) {
        // Any lookup failure is treated as "not a continuous feature".
        result = false;
    }
    return result;
}
/**
 * Tell whether the feature at the given index is continuous. Continuous features occupy the indices after all byte and
 * short features.
 *
 * @param index
 *            global feature index; any int is accepted
 * @return true if the index denotes a continuous feature, false otherwise (including invalid indices)
 */
public boolean isContinuousFeature(int index) {
    final int offsetInSection = index - numByteFeatures - numShortFeatures;
    if (offsetInSection < 0) {
        return false;
    }
    return offsetInSection < numContinuousFeatures;
}
/**
 * Tell whether a similarity matrix was read for the feature at the given index. Only byte-valued features can have one.
 *
 * @param featureIndex
 *            global feature index; any int is accepted
 * @return true if the index denotes a byte-valued feature for which a similarity matrix is stored; false otherwise,
 *         including for negative or otherwise invalid indices
 */
public boolean hasSimilarityMatrix(int featureIndex) {
    // Reject negative indices as well, so that the array access below cannot go out of bounds.
    if (featureIndex < 0 || featureIndex >= this.getNumberOfByteFeatures()) {
        return false;
    }
    return this.similarityMatrices != null && this.similarityMatrices[featureIndex] != null;
}
/**
 * Tell whether a similarity matrix was read for the named feature. The name is resolved with getFeatureIndex, so an
 * unknown name is reported however that lookup reports it.
 *
 * @param featureName
 *            name of the feature to query
 * @return the result of hasSimilarityMatrix(int) for the resolved feature index
 */
public boolean hasSimilarityMatrix(String featureName) {
    final int featureIndex = this.getFeatureIndex(featureName);
    return hasSimilarityMatrix(featureIndex);
}
/**
 * Get the similarity between two values of a byte-valued feature.
 * <p>
 * The value codes are interpreted as unsigned bytes (0..255), matching the way value counts are stored in the binary
 * format, so codes above 127 address the matrix correctly instead of producing a negative array index.
 *
 * @param featureIndex
 *            global index of a byte-valued feature that has a similarity matrix
 * @param i
 *            byte code of the first feature value
 * @param j
 *            byte code of the second feature value
 * @return the stored similarity between the two values
 * @throws IllegalArgumentException
 *             if no similarity matrix is available for the given feature index
 * @throws ArrayIndexOutOfBoundsException
 *             if a value code is not covered by the feature's similarity matrix
 */
public float getSimilarity(int featureIndex, byte i, byte j) {
    if (!hasSimilarityMatrix(featureIndex)) {
        throw new IllegalArgumentException("the given feature index " + featureIndex + " has no similarity matrix");
    }
    return this.similarityMatrices[featureIndex][i & 0xFF][j & 0xFF];
}
/**
 * Look up the index of the feature with the given name.
 * <p>
 * The lookup is delegated to the name translator. How an unknown name is reported depends on that class, which is not
 * part of this file (presumably an IllegalArgumentException).
 *
 * @param featureName
 *            a valid feature name
 * @return a feature index, as could be used to access a feature value in a FeatureVector.
 */
public int getFeatureIndex(String featureName) {
    return this.featureNames.get(featureName);
}
/**
 * Look up the indexes of several features at once.
 *
 * @param featureName
 *            an array of valid feature names
 * @return a new array of the same length, holding at each position the index of the feature named at that position in
 *         the argument
 */
public int[] getFeatureIndexArray(String[] featureName) {
    int[] indexes = new int[featureName.length];
    int position = 0;
    for (String name : featureName) {
        indexes[position++] = getFeatureIndex(name);
    }
    return indexes;
}
/**
 * Get the number of possible values for the feature with the given index number. This method must only be called for
 * byte-valued or short-valued features.
 *
 * @param featureIndex
 *            the index number of the feature.
 * @return for byte-valued and short-valued features, return the number of values.
 * @throws IndexOutOfBoundsException
 *             if featureIndex &lt; 0 or featureIndex &ge; getNumberOfByteFeatures() + getNumberOfShortFeatures().
 */
public int getNumberOfValues(int featureIndex) {
    if (featureIndex >= 0) {
        if (featureIndex < numByteFeatures)
            return byteFeatureValues[featureIndex].getNumberOfValues();
        int shortIndex = featureIndex - numByteFeatures;
        if (shortIndex < numShortFeatures)
            return shortFeatureValues[shortIndex].getNumberOfValues();
    }
    // Report the index the caller passed in, not the offset within the short feature section.
    throw new IndexOutOfBoundsException("Feature no. " + featureIndex + " is not a byte-valued or short-valued feature");
}
/**
 * Get the list of possible String values for the feature with the given index number. This method must only be called for
 * byte-valued or short-valued features. The position in the String array corresponds to the byte or short value of the
 * feature obtained from a FeatureVector.
 *
 * @param featureIndex
 *            the index number of the feature.
 * @return for byte-valued and short-valued features, the array of String values.
 * @throws IndexOutOfBoundsException
 *             if {@code featureIndex < 0} or
 *             {@code featureIndex >= getNumberOfByteFeatures() + getNumberOfShortFeatures()}.
 */
public String[] getPossibleValues(int featureIndex) {
    if (featureIndex < numByteFeatures) {
        return byteFeatureValues[featureIndex].getStringValues();
    }
    // Keep featureIndex untouched so the error message names the index the caller actually used.
    final int shortIndex = featureIndex - numByteFeatures;
    if (shortIndex < numShortFeatures) {
        return shortFeatureValues[shortIndex].getStringValues();
    }
    throw new IndexOutOfBoundsException("Feature no. " + featureIndex + " is not a byte-valued or short-valued feature");
}
/**
 * For the feature with the given index number, translate its byte or short value to its String value. This method must only
 * be called for byte-valued or short-valued features.
 *
 * @param featureIndex
 *            the index number of the feature.
 * @param value
 *            the feature value. This must be in the range of acceptable values for the given feature; it is narrowed to
 *            {@code byte} or {@code short} before the lookup.
 * @return for byte-valued and short-valued features, the String representation of the feature value.
 * @throws IndexOutOfBoundsException
 *             if {@code featureIndex < 0} or
 *             {@code featureIndex >= getNumberOfByteFeatures() + getNumberOfShortFeatures()}; presumably also if value
 *             is not a legal value for this feature (depends on the underlying value table -- TODO confirm)
 */
public String getFeatureValueAsString(int featureIndex, int value) {
    if (featureIndex < numByteFeatures) {
        return byteFeatureValues[featureIndex].get((byte) value);
    }
    // Keep featureIndex untouched so the error message names the index the caller actually used.
    final int shortIndex = featureIndex - numByteFeatures;
    if (shortIndex < numShortFeatures) {
        return shortFeatureValues[shortIndex].get((short) value);
    }
    throw new IndexOutOfBoundsException("Feature no. " + featureIndex + " is not a byte-valued or short-valued feature");
}
/**
 * Convenience access to the String value of a named feature in a feature vector.
 *
 * @param featureName
 *            name of the feature to look up
 * @param fv
 *            the feature vector to read the feature's value from
 * @return the String value, i.e. {@code getFeatureValueAsString(i, fv.getFeatureAsInt(i))} where {@code i} is the index of
 *         {@code featureName}
 */
public String getFeatureValueAsString(String featureName, FeatureVector fv) {
    final int index = getFeatureIndex(featureName);
    final int code = fv.getFeatureAsInt(index);
    return getFeatureValueAsString(index, code);
}
/**
 * For the feature with the given name, translate its String value to its byte value. This method must only be called for
 * byte-valued features. The name is resolved to an index and the work is done by
 * {@link #getFeatureValueAsByte(int, String)}.
 *
 * @param featureName
 *            the name of the feature.
 * @param value
 *            the feature value. This must be among the acceptable values for the given feature.
 * @return for byte-valued features, the byte representation of the feature value.
 * @throws IllegalArgumentException
 *             if featureName is not a valid feature name, or if value is not a legal value for this feature
 * @throws IndexOutOfBoundsException
 *             if featureName is not a byte-valued feature
 */
public byte getFeatureValueAsByte(String featureName, String value) {
    return getFeatureValueAsByte(getFeatureIndex(featureName), value);
}
/**
 * For the feature with the given index number, translate its String value to its byte value. This method must only be called
 * for byte-valued features.
 *
 * @param featureIndex
 *            the index number of the feature.
 * @param value
 *            the feature value. This must be among the acceptable values for the given feature.
 * @return for byte-valued features, the byte representation of the feature value.
 * @throws IndexOutOfBoundsException
 *             if featureIndex does not denote a byte-valued feature.
 * @throws IllegalArgumentException
 *             if value is not a legal value for this feature; the message lists the legal values and the original
 *             lookup failure is attached as the cause.
 */
public byte getFeatureValueAsByte(int featureIndex, String value) {
    if (featureIndex >= numByteFeatures) {
        throw new IndexOutOfBoundsException("Feature no. " + featureIndex + " is not a byte-valued feature");
    }
    try {
        return byteFeatureValues[featureIndex].get(value);
    } catch (IllegalArgumentException iae) {
        // Re-throw with a message that tells the user which values would have been accepted.
        StringBuilder message = new StringBuilder();
        message.append("Illegal value '").append(value).append("' for feature ").append(getFeatureName(featureIndex))
                .append("; Legal values are:\n");
        for (String legalValue : getPossibleValues(featureIndex)) {
            message.append(' ').append(legalValue);
        }
        // Preserve the original exception as the cause instead of discarding it.
        throw new IllegalArgumentException(message.toString(), iae);
    }
}
/**
 * For the feature with the given name, translate its String value to its short value. This method must only be called for
 * short-valued features.
 *
 * @param featureName
 *            the name of the feature.
 * @param value
 *            the feature value. This must be among the acceptable values for the given feature.
 * @return for short-valued features, the short representation of the feature value.
 * @throws IllegalArgumentException
 *             if featureName is not a valid feature name; presumably also if value is not a legal value for this
 *             feature (depends on the underlying value table -- TODO confirm)
 * @throws IndexOutOfBoundsException
 *             if featureName denotes a feature that is not short-valued (byte-valued or continuous).
 */
public short getFeatureValueAsShort(String featureName, String value) {
    // Short-valued features are stored after the byte-valued ones.
    final int shortIndex = getFeatureIndex(featureName) - numByteFeatures;
    // A negative position means the feature is byte-valued; reject it with the descriptive
    // message below instead of letting the array access fail.
    if (shortIndex >= 0 && shortIndex < numShortFeatures) {
        return shortFeatureValues[shortIndex].get(value);
    }
    throw new IndexOutOfBoundsException("Feature '" + featureName + "' is not a short-valued feature");
}
/**
 * For the feature with the given index number, translate its String value to its short value. This method must only be
 * called for short-valued features.
 *
 * @param featureIndex
 *            the index number of the feature.
 * @param value
 *            the feature value. This must be among the acceptable values for the given feature.
 * @return for short-valued features, the short representation of the feature value.
 * @throws IndexOutOfBoundsException
 *             if featureIndex does not denote a short-valued feature.
 * @throws IllegalArgumentException
 *             presumably, if value is not a legal value for this feature (depends on the underlying value table -- TODO
 *             confirm)
 */
public short getFeatureValueAsShort(int featureIndex, String value) {
    // Short-valued features are stored after the byte-valued ones. featureIndex itself is left
    // untouched so that the error message reports the index the caller passed in.
    final int shortIndex = featureIndex - numByteFeatures;
    // A negative position means the index denotes a byte-valued feature (or is itself negative).
    if (shortIndex >= 0 && shortIndex < numShortFeatures) {
        return shortFeatureValues[shortIndex].get(value);
    }
    throw new IndexOutOfBoundsException("Feature no. " + featureIndex + " is not a short-valued feature");
}
/**
 * Check whether another feature definition describes the same features as this one: the same number of byte-valued,
 * short-valued and continuous features, the same feature names in the same order, and, for byte-valued and short-valued
 * features, the same possible values in the same order. Weights are not compared.
 *
 * @param other
 *            the feature definition to compare to
 * @return true if all features and values are identical, false otherwise
 */
public boolean featureEquals(FeatureDefinition other) {
    final boolean sameCounts = numByteFeatures == other.numByteFeatures && numShortFeatures == other.numShortFeatures
            && numContinuousFeatures == other.numContinuousFeatures;
    if (!sameCounts) {
        return false;
    }
    final int discreteCount = numByteFeatures + numShortFeatures;
    final int totalCount = discreteCount + numContinuousFeatures;
    // Names must match for every feature, including the continuous ones.
    for (int f = 0; f < totalCount; f++) {
        final String ownName = getFeatureName(f);
        final String otherName = other.getFeatureName(f);
        if (!ownName.equals(otherName)) {
            return false;
        }
    }
    // Possible values exist only for the discrete (byte- and short-valued) features.
    for (int f = 0; f < discreteCount; f++) {
        if (getNumberOfValues(f) != other.getNumberOfValues(f)) {
            return false;
        }
        final int valueCount = getNumberOfValues(f);
        for (int code = 0; code < valueCount; code++) {
            final String ownValue = getFeatureValueAsString(f, code);
            final String otherValue = other.getFeatureValueAsString(f, code);
            if (!ownValue.equals(otherValue)) {
                return false;
            }
        }
    }
    return true;
}
/**
 * Diagnostic companion of {@link #featureEquals(FeatureDefinition)}: performs the same comparison, but instead of a boolean
 * returns a description of the first difference found.
 *
 * @param other
 *            the feature definition to compare to
 * @return a message describing the first difference (number of byte, short or continuous features, a feature name, the
 *         number of values of a feature, or a feature value), or the empty string if no difference was found
 */
public String featureEqualsAnalyse(FeatureDefinition other) {
    if (numByteFeatures != other.numByteFeatures) {
        return "The number of BYTE features differs: " + numByteFeatures + " versus " + other.numByteFeatures;
    }
    if (numShortFeatures != other.numShortFeatures) {
        return "The number of SHORT features differs: " + numShortFeatures + " versus " + other.numShortFeatures;
    }
    if (numContinuousFeatures != other.numContinuousFeatures) {
        return "The number of CONTINUOUS features differs: " + numContinuousFeatures + " versus "
                + other.numContinuousFeatures;
    }
    final int discreteCount = numByteFeatures + numShortFeatures;
    final int totalCount = discreteCount + numContinuousFeatures;
    // Names are compared for every feature, including the continuous ones.
    for (int f = 0; f < totalCount; f++) {
        if (getFeatureName(f).equals(other.getFeatureName(f))) {
            continue;
        }
        return "The feature name differs at position [" + f + "]: " + getFeatureName(f) + " versus "
                + other.getFeatureName(f);
    }
    // Possible values exist only for the discrete (byte- and short-valued) features.
    for (int f = 0; f < discreteCount; f++) {
        if (getNumberOfValues(f) != other.getNumberOfValues(f)) {
            return "The number of values differs at position [" + f + "]: " + getNumberOfValues(f) + " versus "
                    + other.getNumberOfValues(f);
        }
        final int valueCount = getNumberOfValues(f);
        for (int code = 0; code < valueCount; code++) {
            if (getFeatureValueAsString(f, code).equals(other.getFeatureValueAsString(f, code))) {
                continue;
            }
            return "The feature value differs at position [" + f + "] for feature value [" + code + "]: "
                    + getFeatureValueAsString(f, code) + " versus " + other.getFeatureValueAsString(f, code);
        }
    }
    return "";
}
/**
 * Full equality check: two feature definitions are equal if their weights agree and
 * {@link #featureEquals(FeatureDefinition)} holds. The weights agree if neither side has weights, or if both sides have
 * the same feature weights and the same weighting functions, position by position.
 *
 * NOTE(review): no matching {@code hashCode()} override is visible in this part of the file -- confirm one exists.
 *
 * @param obj
 *            the feature definition to compare to
 * @return true if all features, values and weights are identical, false otherwise
 * @see #featureEquals(FeatureDefinition)
 */
@Override
public boolean equals(Object obj) {
    if (!(obj instanceof FeatureDefinition)) {
        return false;
    }
    final FeatureDefinition that = (FeatureDefinition) obj;
    // Weights must be absent on both sides or present on both sides.
    if ((featureWeights == null) != (that.featureWeights == null)) {
        return false;
    }
    if (featureWeights != null) {
        final int weightCount = featureWeights.length;
        if (weightCount != that.featureWeights.length) {
            return false;
        }
        for (int w = 0; w < weightCount; w++) {
            if (featureWeights[w] != that.featureWeights[w]) {
                return false;
            }
        }
        assert floatWeightFuncts != null;
        assert that.floatWeightFuncts != null;
        final int functCount = floatWeightFuncts.length;
        if (functCount != that.floatWeightFuncts.length) {
            return false;
        }
        for (int f = 0; f < functCount; f++) {
            if (floatWeightFuncts[f] == null) {
                // A missing weighting function only matches another missing one.
                if (that.floatWeightFuncts[f] != null) {
                    return false;
                }
            } else if (that.floatWeightFuncts[f] == null || !floatWeightFuncts[f].equals(that.floatWeightFuncts[f])) {
                return false;
            }
        }
    }
    // Weights agree; the result now depends only on the features themselves.
    return featureEquals(that);
}
/**
 * Determine whether this FeatureDefinition is a superset of, or equal to, another FeatureDefinition.
 * <p>
 * Specifically,
 * <ul>
 * <li>every byte-valued feature in {@code other} must be a byte-valued feature in {@code this}, likewise for short-valued
 * and continuous features;</li>
 * <li>for byte-valued and short-valued features, the possible feature values must be the same, in the same order, in
 * {@code this} and {@code other}.</li>
 * </ul>
 *
 * @param other
 *            the FeatureDefinition that is tested for being contained in this one
 * @return true if
 *         <ul>
 *         <li>all features in {@code other} are also in {@code this}, and every feature in {@code other} is of the same
 *         type in {@code this}; and</li>
 *         <li>every byte-valued or short-valued feature in {@code other} has the same possible values as the feature in
 *         {@code this};</li>
 *         </ul>
 *         false otherwise
 */
public boolean contains(FeatureDefinition other) {
    List<String> otherByteFeatures = Arrays.asList(other.getByteFeatureNameArray());
    if (!Arrays.asList(this.getByteFeatureNameArray()).containsAll(otherByteFeatures)) {
        return false;
    }
    if (!hasSamePossibleValues(other, otherByteFeatures)) {
        return false;
    }
    List<String> otherShortFeatures = Arrays.asList(other.getShortFeatureNameArray());
    if (!Arrays.asList(this.getShortFeatureNameArray()).containsAll(otherShortFeatures)) {
        return false;
    }
    if (!hasSamePossibleValues(other, otherShortFeatures)) {
        return false;
    }
    // Continuous features have no value list, so containment of the names is all there is to check.
    List<String> otherContinuousFeatures = Arrays.asList(other.getContinuousFeatureNameArray());
    return Arrays.asList(this.getContinuousFeatureNameArray()).containsAll(otherContinuousFeatures);
}

/**
 * Check that each of the given features, which must exist in both definitions, has the same possible values in the same
 * order in this definition and in {@code other}.
 */
private boolean hasSamePossibleValues(FeatureDefinition other, List<String> commonFeatureNames) {
    for (String commonFeature : commonFeatureNames) {
        String[] thisPossibleValues = this.getPossibleValues(this.getFeatureIndex(commonFeature));
        String[] otherPossibleValues = other.getPossibleValues(other.getFeatureIndex(commonFeature));
        if (!Arrays.equals(thisPossibleValues, otherPossibleValues)) {
            return false;
        }
    }
    return true;
}
/**
 * Create a new FeatureDefinition that contains a subset of the features in this. The new definition is produced by writing
 * this definition, minus the dropped features, to an in-memory byte array and reading it back.
 *
 * @param featureNamesToDrop
 *            array of Strings containing the names of the features to drop from the new FeatureDefinition; names that are
 *            not found in this definition are reported on System.err and otherwise ignored
 * @return new FeatureDefinition
 * @throws IllegalStateException
 *             if the in-memory round trip fails with an IOException
 */
public FeatureDefinition subset(String[] featureNamesToDrop) {
    // construct a list of indices for the features to be dropped:
    List<Integer> featureIndicesToDrop = new ArrayList<Integer>();
    for (String featureName : featureNamesToDrop) {
        try {
            featureIndicesToDrop.add(getFeatureIndex(featureName));
        } catch (IllegalArgumentException e) {
            System.err.println("WARNING: feature " + featureName + " not found in FeatureDefinition; ignoring.");
        }
    }
    // create a new FeatureDefinition by way of a byte array:
    final FeatureDefinition subDefinition;
    try {
        ByteArrayOutputStream toMemory = new ByteArrayOutputStream();
        DataOutput output = new DataOutputStream(toMemory);
        writeBinaryTo(output, featureIndicesToDrop);
        byte[] memory = toMemory.toByteArray();
        DataInput input = new DataInputStream(new ByteArrayInputStream(memory));
        subDefinition = new FeatureDefinition(input);
    } catch (IOException e) {
        // Fail loudly with the cause attached rather than printing a stack trace and returning null.
        throw new IllegalStateException("Could not create a subset of this FeatureDefinition", e);
    }
    // make sure that subDefinition really is a subset of this
    assert this.contains(subDefinition);
    return subDefinition;
}
/**
 * Create a feature vector consistent with this feature definition by reading the data from a String representation. In that
 * String, the values for each feature must be separated by white space; leading and trailing white space is ignored.
 * <p>
 * The values are parsed as numbers: byte codes for byte-valued features, short codes for short-valued features, and floats
 * for continuous features, in that order. NOTE(review): {@link #toFeatureString(FeatureVector)} writes the String values
 * of byte-valued and short-valued features rather than their numeric codes, so its output is only accepted here where
 * those String values happen to be the numeric codes -- confirm which format callers rely on.
 *
 * @param unitIndex
 *            an index number to assign to the feature vector
 * @param featureString
 *            the string representation of a feature vector.
 * @return the feature vector created from the String.
 * @throws IllegalArgumentException
 *             if the number of values does not match the number of features in this feature definition, or (as
 *             NumberFormatException) if a value cannot be parsed as a number of the expected type.
 * @see #toFeatureString(FeatureVector)
 */
public FeatureVector toFeatureVector(int unitIndex, String featureString) {
    final int expectedCount = numByteFeatures + numShortFeatures + numContinuousFeatures;
    // Trim first: split() would otherwise produce an empty leading token for input that starts
    // with white space, which would be miscounted as a feature value.
    final String trimmed = featureString.trim();
    final String[] featureValues = trimmed.isEmpty() ? new String[0] : trimmed.split("\\s+");
    if (featureValues.length != expectedCount) {
        throw new IllegalArgumentException("Expected " + expectedCount + " features, got " + featureValues.length);
    }
    byte[] bytes = new byte[numByteFeatures];
    short[] shorts = new short[numShortFeatures];
    float[] floats = new float[numContinuousFeatures];
    for (int i = 0; i < numByteFeatures; i++) {
        bytes[i] = Byte.parseByte(featureValues[i]);
    }
    for (int i = 0; i < numShortFeatures; i++) {
        shorts[i] = Short.parseShort(featureValues[numByteFeatures + i]);
    }
    for (int i = 0; i < numContinuousFeatures; i++) {
        floats[i] = Float.parseFloat(featureValues[numByteFeatures + numShortFeatures + i]);
    }
    return new FeatureVector(bytes, shorts, floats, unitIndex);
}
/**
 * Create a feature vector from the given value arrays, after checking that their lengths match this feature definition.
 * A {@code null} array is treated as having length 0, i.e. it is accepted only if this definition has no features of that
 * kind. The arrays are passed on to the new FeatureVector as given.
 *
 * @param unitIndex
 *            an index number to assign to the feature vector
 * @param bytes
 *            the values of the byte-valued features, may be null if there are none
 * @param shorts
 *            the values of the short-valued features, may be null if there are none
 * @param floats
 *            the values of the continuous features, may be null if there are none
 * @return the feature vector created from the arrays.
 * @throws IllegalArgumentException
 *             if the length of any array does not match the corresponding number of features.
 */
public FeatureVector toFeatureVector(int unitIndex, byte[] bytes, short[] shorts, float[] floats) {
    // Null-safe lengths: a null array with a non-zero expected count must end up in the
    // IllegalArgumentException below, not in a NullPointerException.
    final int byteCount = bytes == null ? 0 : bytes.length;
    final int shortCount = shorts == null ? 0 : shorts.length;
    final int floatCount = floats == null ? 0 : floats.length;
    if (numByteFeatures != byteCount || numShortFeatures != shortCount || numContinuousFeatures != floatCount) {
        throw new IllegalArgumentException("Expected " + numByteFeatures + " bytes (got " + byteCount + "), "
                + numShortFeatures + " shorts (got " + shortCount + "), " + numContinuousFeatures + " floats (got "
                + floatCount + ")");
    }
    return new FeatureVector(bytes, shorts, floats, unitIndex);
}
/**
 * Read one feature vector matching this feature definition from the given input: first all byte-valued features, then all
 * short-valued features, then all continuous features as floats.
 *
 * @param currentUnitIndex
 *            the unit index to assign to the feature vector
 * @param input
 *            a DataInputStream or RandomAccessFile to read the feature values from.
 * @throws IOException
 *             if reading from the input fails
 * @return a FeatureVector.
 */
public FeatureVector readFeatureVector(int currentUnitIndex, DataInput input) throws IOException {
    final byte[] byteValues = new byte[numByteFeatures];
    input.readFully(byteValues);

    final short[] shortValues = new short[numShortFeatures];
    int s = 0;
    while (s < shortValues.length) {
        shortValues[s] = input.readShort();
        s++;
    }

    final float[] floatValues = new float[numContinuousFeatures];
    int c = 0;
    while (c < floatValues.length) {
        floatValues[c] = input.readFloat();
        c++;
    }
    return new FeatureVector(byteValues, shortValues, floatValues, currentUnitIndex);
}
/**
 * Read one feature vector matching this feature definition from the byte buffer, starting at its current position: first
 * all byte-valued features, then all short-valued features, then all continuous features as floats.
 *
 * @param currentUnitIndex
 *            the unit index to assign to the feature vector
 * @param bb
 *            a byte buffer to read the feature values from.
 * @throws IOException
 *             declared by the signature
 * @return a FeatureVector.
 */
public FeatureVector readFeatureVector(int currentUnitIndex, ByteBuffer bb) throws IOException {
    final byte[] byteValues = new byte[numByteFeatures];
    bb.get(byteValues);

    final short[] shortValues = new short[numShortFeatures];
    int s = 0;
    while (s < shortValues.length) {
        shortValues[s] = bb.getShort();
        s++;
    }

    final float[] floatValues = new float[numContinuousFeatures];
    int c = 0;
    while (c < floatValues.length) {
        floatValues[c] = bb.getFloat();
        c++;
    }
    return new FeatureVector(byteValues, shortValues, floatValues, currentUnitIndex);
}
/**
 * Build the feature vector that marks the start or the end of a unit sequence. Every byte-valued and short-valued feature
 * is set to the code of NULLVALUE and every continuous feature to 0, except for EDGEFEATURE, which carries the code of
 * EDGEFEATURE_START or EDGEFEATURE_END.
 *
 * @param unitIndex
 *            index of the unit
 * @param start
 *            true creates a start vector, false creates an end vector.
 * @return a feature vector representing an edge.
 */
public FeatureVector createEdgeFeatureVector(int unitIndex, boolean start) {
    final int edgeIndex = getFeatureIndex(EDGEFEATURE);
    assert edgeIndex < numByteFeatures; // we can assume this is byte-valued
    final byte edgeCode = getFeatureValueAsByte(edgeIndex, start ? EDGEFEATURE_START : EDGEFEATURE_END);

    final byte[] byteValues = new byte[numByteFeatures];
    for (int b = 0; b < byteValues.length; b++) {
        byteValues[b] = getFeatureValueAsByte(b, NULLVALUE);
    }

    final short[] shortValues = new short[numShortFeatures];
    for (int s = 0; s < shortValues.length; s++) {
        shortValues[s] = getFeatureValueAsShort(numByteFeatures + s, NULLVALUE);
    }

    // A freshly allocated float array is already all zeros, so no explicit fill is needed.
    final float[] floatValues = new float[numContinuousFeatures];

    // Overwrite the neutral value of the edge feature with the start/end marker.
    byteValues[edgeIndex] = edgeCode;
    return new FeatureVector(byteValues, shortValues, floatValues, unitIndex);
}
/**
 * Render a feature vector as a single line of text. Byte-valued and short-valued features are written as their String
 * values, continuous features as floats; the entries appear in feature-index order, separated by single spaces.
 *
 * @param fv
 *            a feature vector which must be consistent with this feature definition.
 * @return a String containing the values of all features, separated by white space.
 * @throws IllegalArgumentException
 *             if the number of byte-valued, short-valued or continuous features in the vector differs from this feature
 *             definition
 * @throws IndexOutOfBoundsException
 *             if any value of the feature vector is not consistent with this feature definition
 */
public String toFeatureString(FeatureVector fv) {
    final boolean consistent = numByteFeatures == fv.getNumberOfByteFeatures()
            && numShortFeatures == fv.getNumberOfShortFeatures()
            && numContinuousFeatures == fv.getNumberOfContinuousFeatures();
    if (!consistent) {
        throw new IllegalArgumentException("Feature vector '" + fv + "' is inconsistent with feature definition");
    }
    final int shortStart = numByteFeatures;
    final int continuousStart = numByteFeatures + numShortFeatures;
    final int featureCount = continuousStart + numContinuousFeatures;
    final StringBuilder text = new StringBuilder();
    for (int f = 0; f < featureCount; f++) {
        // Separate entries with a blank, but only once something has been written.
        if (text.length() > 0) {
            text.append(" ");
        }
        if (f < shortStart) {
            text.append(getFeatureValueAsString(f, fv.getByteFeature(f)));
        } else if (f < continuousStart) {
            text.append(getFeatureValueAsString(f, fv.getShortFeature(f)));
        } else {
            text.append(fv.getContinuousFeature(f));
        }
    }
    return text.toString();
}
/**
 * Write this feature definition as text, in three sections (byte-valued, short-valued, continuous), each introduced by its
 * section header line. Every feature gets one line: its name, followed for byte-valued and short-valued features by all
 * of its possible values. This is the text format which can also be read by this class.
 *
 * @param out
 *            the destination of the data
 * @param writeWeights
 *            whether to write weights before every line; each weight (for continuous features, weight and weighting
 *            function) is followed by " | "
 */
public void writeTo(PrintWriter out, boolean writeWeights) {
    final int shortStart = numByteFeatures;
    final int continuousStart = numByteFeatures + numShortFeatures;

    out.println("ByteValuedFeatureProcessors");
    for (int f = 0; f < shortStart; f++) {
        if (writeWeights) {
            out.print(featureWeights[f] + " | ");
        }
        out.print(getFeatureName(f));
        final int valueCount = getNumberOfValues(f);
        for (int code = 0; code < valueCount; code++) {
            out.print(" ");
            out.print(getFeatureValueAsString(f, code));
        }
        out.println();
    }

    out.println("ShortValuedFeatureProcessors");
    for (int f = shortStart; f < continuousStart; f++) {
        if (writeWeights) {
            out.print(featureWeights[f] + " | ");
        }
        out.print(getFeatureName(f));
        final int valueCount = getNumberOfValues(f);
        for (int code = 0; code < valueCount; code++) {
            out.print(" ");
            out.print(getFeatureValueAsString(f, code));
        }
        out.println();
    }

    out.println("ContinuousFeatureProcessors");
    for (int c = 0; c < numContinuousFeatures; c++) {
        final int f = continuousStart + c;
        if (writeWeights) {
            // Weighting functions are indexed by position among the continuous features only.
            out.print(featureWeights[f]);
            out.print(" ");
            out.print(floatWeightFuncts[c]);
            out.print(" | ");
        }
        out.print(getFeatureName(f));
        out.println();
    }
}
/**
 * Export this feature definition in the "all.desc" format which can be read by wagon, without marking any feature as
 * ignored.
 *
 * @param out
 *            the destination of the data
 */
public void generateAllDotDescForWagon(PrintWriter out) {
    final Set<String> nothingToIgnore = null;
    generateAllDotDescForWagon(out, nothingToIgnore);
}
/**
 * Export this feature definition in the "all.desc" format which can be read by wagon. The output is a parenthesised list
 * that starts with the entry {@code (occurid cluster)} and then has one entry per feature: the feature name, the word
 * {@code ignore} if the feature is to be ignored, and then either the double-quoted possible values (byte-valued and
 * short-valued features) or the word {@code float} (continuous features).
 *
 * @param out
 *            the destination of the data
 * @param featuresToIgnore
 *            a set of Strings containing the names of features that wagon should ignore. Can be null.
 */
public void generateAllDotDescForWagon(PrintWriter out, Set<String> featuresToIgnore) {
    out.println("(");
    out.println("(occurid cluster)");
    final int discreteCount = numByteFeatures + numShortFeatures;
    for (int i = 0, n = getNumberOfFeatures(); i < n; i++) {
        out.print("( ");
        String featureName = getFeatureName(i);
        out.print(featureName);
        if (featuresToIgnore != null && featuresToIgnore.contains(featureName)) {
            out.print(" ignore");
        }
        if (i < discreteCount) { // list values
            for (int v = 0, vmax = getNumberOfValues(i); v < vmax; v++) {
                out.print(" ");
                // Print values surrounded by double quotes, and make sure any
                // double quotes in the value are preceded by a backslash --
                // otherwise, we get problems e.g. for sentence_punc
                String val = getFeatureValueAsString(i, v).replace("\"", "\\\"");
                out.print("\"" + val + "\"");
            }
            out.println(" )");
        } else { // float feature
            out.println(" float )");
        }
    }
    out.println(")");
}
/**
 * Print this feature definition plus default weights to a .txt file. The output starts with an explanatory comment header,
 * followed by the three sections for byte-valued, short-valued and continuous features, one feature per line in the form
 * {@code <weight definition> | <feature definition>}.
 * <p>
 * The weights written are fixed defaults, not the weights stored in this object:
 * <ul>
 * <li>byte-valued features: 10 for a fixed list of feature names, 5 for names ending in one of a fixed list of suffixes,
 * 0 otherwise;</li>
 * <li>short-valued features: 0;</li>
 * <li>continuous features: 1000 for {@code unit_duration}, 100 for {@code unit_logf0}, 0 otherwise, each with the
 * weighting function {@code linear}.</li>
 * </ul>
 * Note that this method flushes <b>and closes</b> the given writer when done.
 *
 * @param out
 *            the destination of the data; closed by this method
 */
public void generateFeatureWeightsFile(PrintWriter out) {
    out.println("# This file lists the features and their weights to be used for\n" + "# creating the MARY features file.\n"
            + "# The same file can also be used to override weights in a run-time system.\n"
            + "# Three sections are distinguished: Byte-valued, Short-valued, and\n" + "# Continuous features.\n" + "#\n"
            + "# Lines starting with '#' are ignored; they can be used for comments\n"
            + "# anywhere in the file. Empty lines are also ignored.\n" + "# Entries must have the following form:\n"
            + "# \n" + "# <weight definition> | <feature definition>\n" + "# \n"
            + "# For byte and short features, <weight definition> is simply the \n"
            + "# (float) number representing the weight.\n" + "# For continuous features, <weight definition> is the\n"
            + "# (float) number representing the weight, followed by an optional\n"
            + "# weighting function including arguments.\n" + "#\n"
            + "# The <feature definition> is the feature name, which in the case of\n"
            + "# byte and short features is followed by the full list of feature values.\n" + "#\n"
            + "# Note that the feature definitions must be identical between this file\n"
            + "# and all unit feature files for individual database utterances.\n"
            + "# THIS FILE WAS GENERATED AUTOMATICALLY");
    out.println();

    out.println("ByteValuedFeatureProcessors");
    // Features with exactly these names get weight 10.
    final List<String> getValuesOf10 = Arrays.asList("phone", "ph_vc", "prev_phone", "next_phone", "stressed",
            "syl_break", "prev_syl_break", "next_is_pause", "prev_is_pause");
    // Features whose names end in one of these suffixes get weight 5.
    final List<String> getValuesOf5 = Arrays.asList("cplace", "ctype", "cvox", "vfront", "vheight", "vlng", "vrnd", "vc");
    for (int i = 0; i < numByteFeatures; i++) {
        String featureName = getFeatureName(i);
        String weight = "0";
        if (getValuesOf10.contains(featureName)) {
            weight = "10";
        } else {
            for (String suffix : getValuesOf5) {
                // Plain suffix test; the suffixes contain no regex metacharacters, so no pattern matching is needed.
                if (featureName.endsWith(suffix)) {
                    weight = "5";
                    break;
                }
            }
        }
        out.print(weight + " | " + featureName);
        for (int v = 0, vmax = getNumberOfValues(i); v < vmax; v++) {
            out.print(" " + getFeatureValueAsString(i, v));
        }
        out.print("\n");
    }

    out.println("ShortValuedFeatureProcessors");
    for (int i = 0; i < numShortFeatures; i++) {
        int n = i + numByteFeatures;
        out.print("0 | " + getFeatureName(n));
        for (int v = 0, vmax = getNumberOfValues(n); v < vmax; v++) {
            out.print(" " + getFeatureValueAsString(n, v));
        }
        out.print("\n");
    }

    out.println("ContinuousFeatureProcessors");
    for (int i = 0; i < numContinuousFeatures; i++) {
        String featureName = getFeatureName(i + numByteFeatures + numShortFeatures);
        int featureValue;
        switch (featureName) {
        case "unit_duration":
            featureValue = 1000;
            break;
        case "unit_logf0":
            featureValue = 100;
            break;
        default:
            featureValue = 0;
            break;
        }
        out.printf("%d linear | %s\n", featureValue, featureName);
    }
    out.flush();
    out.close();
}
/**
 * Compares two feature vectors in terms of how many discrete features they have in common. Despite the method's name, the
 * result counts the positions at which the two vectors hold the SAME byte or short value, not the positions where they
 * differ. WARNING: this assumes that the feature vectors are issued from the same FeatureDefinition; only the number of
 * features is checked for compatibility. Continuous features are not compared.
 *
 * @param v1
 *            A feature vector.
 * @param v2
 *            Another feature vector to compare v1 with.
 * @return The number of common features.
 * @throws RuntimeException
 *             if the two vectors do not have the same number of byte-valued features or of short-valued features.
 */
public static int diff(FeatureVector v1, FeatureVector v2) {
    int ret = 0;
    /* Byte valued features */
    // Require equal lengths in both directions: a longer v1 would otherwise index past the end of v2 below.
    if (v1.byteValuedDiscreteFeatures.length != v2.byteValuedDiscreteFeatures.length) {
        throw new RuntimeException("v1 and v2 don't have the same number of byte-valued features: ["
                + v1.byteValuedDiscreteFeatures.length + "] versus [" + v2.byteValuedDiscreteFeatures.length + "].");
    }
    for (int i = 0; i < v1.byteValuedDiscreteFeatures.length; i++) {
        if (v1.byteValuedDiscreteFeatures[i] == v2.byteValuedDiscreteFeatures[i]) {
            ret++;
        }
    }
    /* Short valued features */
    if (v1.shortValuedDiscreteFeatures.length != v2.shortValuedDiscreteFeatures.length) {
        throw new RuntimeException("v1 and v2 don't have the same number of short-valued features: ["
                + v1.shortValuedDiscreteFeatures.length + "] versus [" + v2.shortValuedDiscreteFeatures.length + "].");
    }
    for (int i = 0; i < v1.shortValuedDiscreteFeatures.length; i++) {
        if (v1.shortValuedDiscreteFeatures[i] == v2.shortValuedDiscreteFeatures[i]) {
            ret++;
        }
    }
    /*
     * TODO: would checking float-valued features make sense? If so: require equal continuousFeatures lengths as above, and
     * count position i as common when the absolute difference of v1.continuousFeatures[i] and v2.continuousFeatures[i] is
     * below a small epsilon (e.g. 1.0e-6f).
     */
    return ret;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy