All Downloads are FREE. Search and download functionalities are using the official Maven repository.

marytts.tools.dbselection.CoverageDefinition Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.tools.dbselection;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;

import marytts.features.FeatureDefinition;

/**
 * Builds and manages the cover sets
 * 
 * @author Anna Hunecke
 *
 */
public class CoverageDefinition {

	/* cover sets for simple diphones */
	private CoverNode simpleCover;
	/* weights of the different levels of the cover set */
	private double phoneLevelWeight;
	private double diphoneLevelWeight;
	private double prosodyLevelWeight;
	/* use simple or clustered cover set for usefulness computation */
	private boolean simpleDiphones;
	/* consider frequency when computing usefulness */
	private boolean considerFrequency;
	/*
	 * the actual setting of the frequency (setting only considered when considerFrequency is true)
	 */
	private String frequencySetting;
	/* consider the length of a sentence when computing usefulness */
	private boolean considerSentenceLength;
	/*
	 * max/min sentence length a selected sentence is allowed to have (settings only considered when considerSentenceLength is
	 * true)
	 */
	private int maxSentLengthAllowed;
	private int minSentLengthAllowed;
	/*
	 * number by which the wanted weight of a node is divided each time a new feature vector is added to the node
	 */
	private double wantedWeightDecrease;
	/* the index of the four features in the feature vector */
	private int phoneFeatIndex;
	private int diphoneFeatIndex;
	// private int phoneClassesIndex; // CHECK IF THIS FEATURE WILL NOT BE USED ANY MORE???
	private int prosodyIndex;
	// number of target features used, (phone, next_phone, selection_prosody = 3 )
	private int numTargetFeaturesUsed;

	/* the number of possible prosody feature values */
	private int numProsodyValues;

	/* the number of possible phones */
	private int numPhoneValues;
	/* the number of possible phones minus the phones to ignore */
	private int numPhoneValuesMinusIgnored;
	/* the number of possible simple diphones */
	private int numPossibleSimpleDiphones;
	/* the number of feature vectors in the cover set */
	private int numSelectedFeatVects;
	/* the number of tokens in the corpus */
	private int numTokens;
	/* the number of simple diphones types in the corpus */
	private int numSimpleDiphoneTypes;
	/* the number of simple feature vector types in the corpus */
	private int numSimpleFeatVectTypes;
	/* average/max/min sentence length in the corpus */
	private double averageSentLength;
	private int maxSentLength;
	private int minSentLength;
	/* the number of sentences in the cover set */
	private int numSentencesInCover;
	/* max/min sentence length in the cover set */
	private int maxSentLengthInCover;
	private int minSentLengthInCover;
	/*
	 * maximum sizes of simple/clustered cover (=number of Leaves)
	 */
	private int numSimpleLeaves;
	/* the phone coverage of the corpus */
	private double possiblePhoneCoverage;
	/* the simple diphone coverage of the corpus */
	private double possibleSimpleDiphoneCoverage;
	/* the overall (=phone+simpleDiphone+prosody) coverage of the corpus */
	private double possibleOverallSimpleCoverage;
	/* the phone types in the corpus */
	private Set possiblePhoneTypes;
	/*
	 * keep track of the coverage development over time by adding the current coverage value each time the cover is updated
	 */
	private List phoneCoverageInTime;
	private List diphoneCoverageInTime;
	private List overallCoverageInTime;
	/* set of covered phones/simple diphones/clustered diphones */
	private Set phonesInCover;
	private Set simpleDiphonesInCover;
	/* number of simple prosodic variations in cover */
	private int numSimpleFeatVectsInCover;
	/* the featureDefinition for the feature vectors */
	private final FeatureDefinition featDef;
	/* the number of sentences in the corpus */
	private int numSentences;
	/* the phones that are not in the corpus and have to be ignored */
	private Set phonesToIgnore;
	/* the possible phone values */
	private String[] possiblePhoneArray;
	/* the possible next phone values */
	private String[] possibleNextPhoneArray;
	/* the possible prosody values */
	private String[] possibleProsodyArray;
	private String[][] possibleDiphones;
	private String[][][] possibleDiphonesProsody;
	/* For printing statistics, count the number of occurrences of each diphone in the text corpus */
	private int[][] diphoneFrequencies;
	private int[] phoneFrequencies;

	private CoverageFeatureProvider cfProvider;

	/**
	 * Build a new coverage definition and read in the config file.
	 * <p>
	 * Initialisation runs in three steps: the config file is parsed, the indexes and value lists of the used features are
	 * looked up in the feature definition, and the counters describing the selected cover are reset.
	 * 
	 * @param featDef
	 *            the feature definition for the vectors
	 * @param cfProvider
	 *            coverage feature provider
	 * @param configFile
	 *            optionally, the coverage config file name. if this is null, the settings are read from the
	 *            <code>covDef.config</code> resource next to this class.
	 * @throws Exception
	 *             if the config file cannot be read or does not contain all settings
	 */
	public CoverageDefinition(FeatureDefinition featDef, CoverageFeatureProvider cfProvider, String configFile) throws Exception {
		this.featDef = featDef;
		this.cfProvider = cfProvider;

		// Order matters: readConfigFile() creates phonesToIgnore, which setupFeatureIndexes() reads.
		readConfigFile(featDef, configFile);
		setupFeatureIndexes();
		initializeVariables();
	}

	/**
	 * Read the coverage settings from the config file.
	 * <p>
	 * Each non-empty line that does not start with <code>#</code> has the form <code>key value [value ...]</code>. The
	 * recognised keys are:
	 * <ul>
	 * <li><code>simpleDiphones</code> true|false</li>
	 * <li><code>frequency</code> none | name of the frequency setting</li>
	 * <li><code>sentenceLength</code> none | max min</li>
	 * <li><code>wantedWeight</code> phoneWeight diphoneWeight prosodyWeight</li>
	 * <li><code>wantedWeightDecrease</code> number</li>
	 * <li><code>missingPhones</code> first token, followed by the phones to ignore</li>
	 * </ul>
	 * Lines with any other key are ignored. At least six recognised settings must be present.
	 * 
	 * @param featDef
	 *            the feature definition, used to translate the missing phones into their feature values
	 * @param configFile
	 *            the config file name, or null to read the <code>covDef.config</code> resource of this class
	 * @throws Exception
	 *             if the file cannot be opened or parsed, or if fewer than six settings were found; the original problem is
	 *             attached as the cause
	 */
	private void readConfigFile(FeatureDefinition featDef, String configFile) throws Exception {
		BufferedReader configIn = null;
		try {
			InputStream inStream;
			if (configFile != null) {
				inStream = new FileInputStream(new File(configFile));
			} else {
				inStream = getClass().getResourceAsStream("covDef.config");
				if (inStream == null) {
					throw new IOException("Default coverage config resource covDef.config not found");
				}
			}
			configIn = new BufferedReader(new InputStreamReader(inStream, "UTF-8"));
			String line;
			int numparams = 0;
			// loop over the lines of the config file
			while ((line = configIn.readLine()) != null) {
				String trimmed = line.trim();
				// skip blank (also whitespace-only) lines and comments
				if (trimmed.length() == 0 || trimmed.startsWith("#")) {
					continue;
				}
				StringTokenizer tok = new StringTokenizer(trimmed);
				String key = tok.nextToken();
				if (!tok.hasMoreTokens()) {
					throw new Exception("Setting '" + key + "' has no value");
				}
				String value = tok.nextToken();
				if (key.equals("simpleDiphones")) {
					simpleDiphones = value.equals("true");
					numparams++;
				} else if (key.equals("frequency")) {
					if (value.equals("none")) {
						considerFrequency = false;
					} else {
						considerFrequency = true;
						frequencySetting = value;
					}
					numparams++;
				} else if (key.equals("sentenceLength")) {
					if (value.equals("none")) {
						considerSentenceLength = false;
					} else {
						considerSentenceLength = true;
						maxSentLengthAllowed = Integer.parseInt(value);
						minSentLengthAllowed = Integer.parseInt(tok.nextToken());
					}
					numparams++;
				} else if (key.equals("wantedWeight")) {
					phoneLevelWeight = Double.parseDouble(value);
					diphoneLevelWeight = Double.parseDouble(tok.nextToken());
					prosodyLevelWeight = Double.parseDouble(tok.nextToken());
					numparams++;
				} else if (key.equals("wantedWeightDecrease")) {
					wantedWeightDecrease = Double.parseDouble(value);
					numparams++;
				} else if (key.equals("missingPhones")) {
					phonesToIgnore = new HashSet<Integer>();
					phonesToIgnore.add(Integer.valueOf(0)); // The non-existing phone "0"
					// NOTE(review): as before, the first token after the key (held in 'value') is not added to the
					// set; only the tokens following it are. Confirm against the config file format.
					if (tok.hasMoreTokens()) {
						// The field phoneFeatIndex is not set yet at this point (setupFeatureIndexes() runs after
						// this method), so the index of the phone feature is looked up here.
						int phoneIndex = featDef.getFeatureIndex("phone");
						do {
							int phoneValue = featDef.getFeatureValueAsByte(phoneIndex, tok.nextToken());
							phonesToIgnore.add(Integer.valueOf(phoneValue));
						} while (tok.hasMoreTokens());
					}
					numparams++;
				}
			}
			if (numparams < 6) {
				throw new Exception("Error reading coverage Definition Config File: " + configFile + " there are only "
						+ numparams + " instead of 6 settings");
			}
		} catch (Exception e) {
			// keep the original problem as cause instead of only printing it
			throw new Exception("Could not read coverage Definition Config File: " + configFile, e);
		} finally {
			if (configIn != null) {
				try {
					configIn.close();
				} catch (IOException ignored) {
					// nothing left to read; a failure to close must not hide the real outcome
				}
			}
		}
	}

	/**
	 * Look up the indexes of the features used for coverage (phone, next_phone, selection_prosody) in the feature
	 * definition, report them on standard output, and precompute the value counts and the diphone name tables.
	 */
	private void setupFeatureIndexes() {
		System.out.println("TARGETFEATURES used:");
		numTargetFeaturesUsed = 0;
		for (int f = 0; f < featDef.getNumberOfFeatures(); f++) {
			String name = featDef.getFeatureName(f);
			boolean used = true;
			if (name.equals("phone")) {
				phoneFeatIndex = featDef.getFeatureIndex("phone");
			} else if (name.equals("next_phone")) {
				diphoneFeatIndex = featDef.getFeatureIndex("next_phone");
			} else if (name.equals("selection_prosody")) {
				prosodyIndex = featDef.getFeatureIndex("selection_prosody");
			} else {
				used = false;
			}
			if (used) {
				numTargetFeaturesUsed++;
				System.out.println("  feature(" + f + ")=" + name);
			} else {
				System.out.println("  NO implementation in CoverageDefinition for the feature =" + name);
			}
		}

		// sizes of the value spaces
		numPhoneValues = featDef.getNumberOfValues(phoneFeatIndex);
		numPhoneValuesMinusIgnored = numPhoneValues - phonesToIgnore.size() - 1;
		numPossibleSimpleDiphones = numPhoneValuesMinusIgnored * (numPhoneValuesMinusIgnored + 1);
		numProsodyValues = featDef.getNumberOfValues(prosodyIndex);

		// names of the values
		possiblePhoneArray = featDef.getPossibleValues(phoneFeatIndex);
		possibleNextPhoneArray = featDef.getPossibleValues(diphoneFeatIndex);
		possibleProsodyArray = featDef.getPossibleValues(prosodyIndex);

		// Build every "phone_next" and "phone_next_prosody" string once, so they need not be concatenated again later.
		final int phoneCount = possiblePhoneArray.length;
		final int nextCount = possibleNextPhoneArray.length;
		final int prosodyCount = possibleProsodyArray.length;
		possibleDiphones = new String[phoneCount][nextCount];
		possibleDiphonesProsody = new String[phoneCount][nextCount][prosodyCount];
		for (int p = 0; p < phoneCount; p++) {
			for (int n = 0; n < nextCount; n++) {
				String pair = possiblePhoneArray[p] + "_" + possibleNextPhoneArray[n];
				possibleDiphones[p][n] = pair;
				String[] withProsody = possibleDiphonesProsody[p][n];
				for (int s = 0; s < prosodyCount; s++) {
					withProsody[s] = pair + "_" + possibleProsodyArray[s];
				}
			}
		}
	}

	/**
	 * Reset everything that describes the selected cover: counters, sentence length extremes, the sets of covered units
	 * and the coverage history lists.
	 */
	private void initializeVariables() {
		// counters of the cover
		numSelectedFeatVects = 0;
		numSentencesInCover = 0;
		numSimpleFeatVectsInCover = 0;

		// sentence length extremes of the cover
		maxSentLengthInCover = 0;
		minSentLengthInCover = 20;

		// covered units
		phonesInCover = new HashSet();
		simpleDiphonesInCover = new HashSet();

		// coverage development over time
		phoneCoverageInTime = new ArrayList();
		diphoneCoverageInTime = new ArrayList();
		overallCoverageInTime = new ArrayList();
	}

	/**
	 * Compute the coverage of the corpus, build and fill the cover sets. This will iterate once through the entire corpus, to
	 * compute the maximally achievable coverage with this corpus.
	 * <p>
	 * Besides the coverage figures, this collects the phone and diphone frequencies, the set of phone types occurring in
	 * the corpus, and the average/maximum/minimum sentence length (counted in feature vectors). Sentences for which the
	 * provider returns no features are skipped with a warning. If no sentence could be analysed, the minimum sentence
	 * length is reported as 0.
	 * 
	 * @throws IOException
	 *             IOException
	 */
	public void initialiseCoverage() throws IOException {
		// stuff used for counting the phones and diphones
		Set<String> phoneTypes = new HashSet<String>();
		possiblePhoneTypes = phoneTypes;
		Set<String> simpleFeatVectTypes = new HashSet<String>();

		numSimpleDiphoneTypes = 0;
		numTokens = 0;
		averageSentLength = 0.0;
		maxSentLength = 0;
		// start above any real length so that the true minimum is found
		minSentLength = Integer.MAX_VALUE;

		phoneFrequencies = new int[possiblePhoneArray.length];
		diphoneFrequencies = new int[possiblePhoneArray.length][possibleNextPhoneArray.length];

		// build Cover
		buildCover();

		numSentences = cfProvider.getNumSentences();

		// loop over the feature vectors
		System.out.println("\nAnalysing feature vectors of " + numSentences + " sentences:");
		int lastDecile = 0;
		for (int index = 0; index < numSentences; index++) {

			// report progress once per completed tenth of the corpus (10% .. 90%)
			int decile = (int) (index * 10L / numSentences);
			if (decile > lastDecile) {
				System.out.print(" " + decile + "0% ");
				lastDecile = decile;
			}

			byte[] features = cfProvider.getCoverageFeatures(index); // the feature vectors of one sentence
			if (features == null) {
				System.err.println("WARNING: null features for sentence id " + cfProvider.getID(index) + " -- skipping");
				continue;
			}
			int numFeatVects = features.length / numTargetFeaturesUsed;

			// compute statistics of sentence length
			averageSentLength += numFeatVects;
			if (numFeatVects > maxSentLength)
				maxSentLength = numFeatVects;
			if (numFeatVects < minSentLength)
				minSentLength = numFeatVects;

			// Loop over all feature vectors in current sentence:
			for (int i = 0; i < numFeatVects; i++) {
				numTokens++;
				int offset = i * numTargetFeaturesUsed;

				// first deal with current phone: add 1 to its frequency
				byte nextPhonebyte = features[offset + phoneFeatIndex];
				phoneFrequencies[nextPhonebyte]++;

				// deal with current diphone: add 1 to its frequency
				byte nextnextPhonebyte = features[offset + diphoneFeatIndex];
				diphoneFrequencies[nextPhonebyte][nextnextPhonebyte]++;

				// deal with current prosody value
				byte prosodyValue = features[offset + prosodyIndex];
				simpleFeatVectTypes.add(possibleDiphonesProsody[nextPhonebyte][nextnextPhonebyte][prosodyValue]);

				// count the feature vector in its leaf of the simple diphone tree
				CoverLeaf leaf = (CoverLeaf) simpleCover.children[nextPhonebyte].children[nextnextPhonebyte].children[prosodyValue];
				leaf.maxNumFeatVects++;
			}
		}
		System.out.println(" 100% ");

		if (minSentLength == Integer.MAX_VALUE) {
			// no sentence was analysed
			minSentLength = 0;
		}

		// count phones and remember which phone types occur in the corpus
		int numPhoneTypes = getPhonesInCorpus().size();
		for (int i = 0; i < phoneFrequencies.length; i++) {
			if (phoneFrequencies[i] > 0) {
				phoneTypes.add(possiblePhoneArray[i]);
			}
		}

		// count diphones
		numSimpleDiphoneTypes = 0;
		for (int i = 0; i < possibleDiphones.length; i++) {
			for (int j = 0; j < possibleDiphones[i].length; j++) {
				if (diphoneFrequencies[i][j] > 0) {
					numSimpleDiphoneTypes++;
				}
			}
		}

		// compute average sentence length
		averageSentLength = averageSentLength / (double) numSentences;

		// calculate cover size
		numPossibleSimpleDiphones = numPhoneValuesMinusIgnored * (numPhoneValuesMinusIgnored + 1);
		numSimpleLeaves = numPossibleSimpleDiphones * numProsodyValues;

		// number of feature vector types
		numSimpleFeatVectTypes = simpleFeatVectTypes.size();
		// compute coverage of corpus
		possiblePhoneCoverage = (double) numPhoneTypes / (double) (numPhoneValuesMinusIgnored);
		possibleSimpleDiphoneCoverage = numSimpleDiphoneTypes / (double) numPossibleSimpleDiphones;
		possibleOverallSimpleCoverage = (double) numSimpleFeatVectTypes / (double) numSimpleLeaves;

		// calculate relative frequency for each node
		// rel. freq. = freq / all tokens
		if (simpleDiphones) {
			computeRelativeFrequency(simpleCover, numTokens);
		}

	}

	/**
	 * Build the tree that represents the simple cover set: root, then one node per phone that is not ignored, then one
	 * node per following phone, then one leaf per prosody value. Each level gets its configured wanted weight.
	 */
	private void buildCover() {
		simpleCover = new CoverNode((byte) numPhoneValues, wantedWeightDecrease);

		for (int phone = 0; phone < possiblePhoneArray.length; phone++) {
			if (!phonesToIgnore.contains(Integer.valueOf(phone))) {
				// phone level
				CoverNode phoneNode = new CoverNode((byte) numPhoneValues, wantedWeightDecrease);
				phoneNode.setWantedWeight(phoneLevelWeight);
				simpleCover.addChild(phoneNode, (byte) phone);

				// diphone level: one node below the phone node for each of its child slots
				byte diphoneCount = phoneNode.getNumChildren();
				for (byte next = 0; next < diphoneCount; next++) {
					CoverNode diphoneNode = new CoverNode((byte) numProsodyValues, wantedWeightDecrease);
					diphoneNode.setWantedWeight(diphoneLevelWeight);
					phoneNode.addChild(diphoneNode, next);

					// prosody level: the leaves
					for (byte prosody = 0; prosody < numProsodyValues; prosody++) {
						CoverLeaf leaf = new CoverLeaf(wantedWeightDecrease);
						leaf.setWantedWeight(prosodyLevelWeight);
						diphoneNode.addChild(leaf, prosody);
					}
				}
			}
		}

	}

	/**
	 * Collect the descriptive statistics of the full corpus in one object. None of the values depends on which sentences
	 * have been selected.
	 * 
	 * @return a newly created statistics object filled from the corpus counters
	 */
	public CoverageStatistics getCorpusStatistics() {
		CoverageStatistics stats = new CoverageStatistics();

		// plain counters
		stats.numTokens = numTokens;
		stats.numPossibleDiphones = numPossibleSimpleDiphones;
		stats.numPossibleDiphonesWithProsody = numSimpleLeaves;
		stats.numCoveredDiphonesWithProsody = numSimpleFeatVectTypes;

		// unit sets
		stats.coveredPhones = getPhonesInCorpus();
		stats.allPhones = getAllPhones();
		stats.coveredDiphones = getDiphonesInCorpus();

		return stats;
	}

	/**
	 * Get the phones that have a frequency above zero in the corpus.
	 * 
	 * @return a new sorted set with the names of the phones
	 */
	public Set getPhonesInCorpus() {
		Set<String> seen = new TreeSet<String>();
		final int phoneFeature = featDef.getFeatureIndex("phone");
		int value = 0;
		for (int count : phoneFrequencies) {
			if (count > 0) {
				seen.add(featDef.getFeatureValueAsString(phoneFeature, value));
			}
			value++;
		}
		return seen;
	}

	/**
	 * Get all possible values of the phone feature except those whose index is in the set of phones to ignore.
	 * 
	 * @return a new sorted set with the names of the phones
	 */
	public Set getAllPhones() {
		Set<String> phones = new TreeSet<String>();
		String[] values = featDef.getPossibleValues(featDef.getFeatureIndex("phone"));
		for (int idx = 0; idx < values.length; idx++) {
			if (phonesToIgnore.contains(idx)) {
				continue;
			}
			phones.add(values[idx]);
		}
		return phones;
	}

	/**
	 * Get the diphones that have a frequency above zero in the corpus.
	 * <p>
	 * The loops are bounded by the dimensions of the frequency table itself; its second dimension is sized by the values of
	 * the next_phone feature, not by the values of the phone feature.
	 * 
	 * @return a new set with the names ("phone_nextphone") of the diphones
	 */
	public Set getDiphonesInCorpus() {
		Set<String> diphones = new HashSet<String>();
		for (int i = 0; i < diphoneFrequencies.length; i++) {
			for (int j = 0; j < diphoneFrequencies[i].length; j++) {
				if (diphoneFrequencies[i][j] > 0) {
					diphones.add(possibleDiphones[i][j]);
				}
			}
		}
		return diphones;
	}

	/**
	 * Go down the simple cover tree according to the values in the feature vector: first by the phone value, then by the
	 * next phone value, then by the prosody value.
	 * 
	 * @param vectors
	 *            the feature vectors
	 * @param index
	 *            the index of the current feature vector
	 * @param addNewFeatureVector
	 *            if true, decrease wantedWeights of the nodes you pass
	 * @return the leaf that you arrived at
	 */
	private CoverLeaf goDownTree(byte[] vectors, int index, boolean addNewFeatureVector) {
		final int offset = index * numTargetFeaturesUsed;
		// the feature that selects the child on each level, from the root downwards
		final int[] levelFeatures = { phoneFeatIndex, diphoneFeatIndex, prosodyIndex };

		CoverNode current = simpleCover;
		for (int feature : levelFeatures) {
			byte childIndex = vectors[offset + feature];
			current = current.children[childIndex];
			if (addNewFeatureVector) {
				current.decreaseWantedWeight();
			}
		}

		if (current instanceof CoverLeaf) {
			return (CoverLeaf) current;
		}
		// something went wrong
		throw new Error("Went down cover tree for feature vector" + " and did not end up on leaf!");

	}

	/**
	 * Compute the relative frequency of a node in the corpus. For a leaf this is its number of possible feature vectors
	 * divided by the number of tokens; for any other node it is the sum over its non-null children. If frequency is
	 * considered, the node's frequency weight is set from this value according to the frequency setting.
	 * 
	 * @param node
	 *            the node to compute the frequency for
	 * @param allTokens
	 *            total number of tokens in the corpus
	 * @return the frequency for the given node
	 */
	private double computeRelativeFrequency(CoverNode node, double allTokens) {
		double freq;
		if (node instanceof CoverLeaf) {
			freq = (double) ((CoverLeaf) node).maxNumFeatVects() / allTokens;
		} else {
			freq = 0;
			byte childCount = node.getNumChildren();
			for (byte c = 0; c < childCount; c++) {
				CoverNode child = node.getChild(c);
				if (child != null) {
					freq += computeRelativeFrequency(child, allTokens);
				}
			}
		}

		if (considerFrequency) {
			double weight;
			if (frequencySetting.equals("1minus")) {
				weight = 1 - freq;
			} else if (frequencySetting.equals("inverse")) {
				weight = 1 / freq;
			} else {
				weight = freq;
			}
			node.setFrequencyWeight(weight);
		}
		return freq;

	}

	/**
	 * Print a statistic of the unit distribution in the corpus: the sentence length figures, the coverage statistics, the
	 * phones that do not occur in the corpus (only if the phone coverage is below 1), and all diphones with their
	 * frequencies.
	 * <p>
	 * The writer is flushed and closed at the end of this method.
	 * 
	 * @param out
	 *            the print writer to print to
	 * @throws Exception
	 *             Exception
	 */
	public void printTextCorpusStatistics(PrintWriter out) throws Exception {
		out.println("*********************" + "\n* Unit distribution *" + "\n*********************\n\n");

		/* print out the sentence length statistics */
		out.println("Number of sentences : " + numSentences);
		out.println("Average sentence length : " + averageSentLength);
		out.println("Maximum sentence length : " + maxSentLength);
		out.println("Minimum sentence length : " + minSentLength);

		/* print out coverage statistics */
		CoverageStatistics stats = getCorpusStatistics();
		out.println(stats);

		if (possiblePhoneCoverage < 1) {
			out.println("The following phones are missing: ");
			// index 0 is not considered (the loop starts at 1)
			for (int k = 1; k < possiblePhoneArray.length; k++) {
				if (phonesToIgnore.contains(Integer.valueOf(k)))
					continue;
				// a phone is missing if it was never counted in the corpus
				if (phoneFrequencies[k] == 0) {
					out.print(possiblePhoneArray[k] + " ");
				}
			}
			out.print("\n");
		}

		out.println("\n\nDiphones and their frequencies :\n");
		out.println("Simple diphones:\n");
		printDiphones(out);

		out.flush();
		out.close();
	}

	/**
	 * Print the settings that were read from the config file. The allowed maximum and minimum sentence length are only
	 * printed when sentence length is considered.
	 * 
	 * @param out
	 *            the PrintWriter to print to
	 */
	public void printSettings(PrintWriter out) {
		String frequency = considerFrequency ? frequencySetting : "none";

		out.println("\nSettings of Coverage Definition:");
		out.println("simpleDiphones " + simpleDiphones);
		out.println("frequency " + frequency);
		out.println("considerSentenceLength " + considerSentenceLength);

		out.println("phoneLevelWeight " + phoneLevelWeight);
		out.println("diphoneLevelWeight " + diphoneLevelWeight);
		out.println("prosodyLevelWeight " + prosodyLevelWeight);
		out.println("divideWantedWeightBy " + wantedWeightDecrease);
		if (!considerSentenceLength) {
			return;
		}
		out.println("maxSentenceLength " + maxSentLengthAllowed);
		out.println("minSentenceLength " + minSentLengthAllowed);

	}

	/**
	 * Print the diphone distribution of the corpus
	 * 
	 * @param out
	 *            the PrintWriter to print to
	 * @param ph2Frequency
	 *            maps from diphones to their frequency
	 */
	private void printDiphones(PrintWriter out) {
		DecimalFormat df = new DecimalFormat("0.00000");
		// Sort phones according to their frequencies
		TreeMap> freq2Diphones = new TreeMap>(Collections.reverseOrder());
		Map freq2Prob = new HashMap();
		for (int i = 0; i < possibleDiphones.length; i++) {
			for (int j = 0; j < possibleDiphones[i].length; j++) {
				String diphone = possibleDiphones[i][j];
				int freq = diphoneFrequencies[i][j];
				if (!freq2Diphones.containsKey(freq)) {
					List phoneList = new ArrayList();
					phoneList.add(diphone);
					freq2Diphones.put(freq, phoneList);
					double prob = (double) freq * 100.0 / (double) numTokens;
					freq2Prob.put(freq, prob);
				} else {
					List phoneList = freq2Diphones.get(freq);
					phoneList.add(diphone);
				}
			}
		}

		// output phones and their frequencies
		Set frequencies = freq2Diphones.keySet();
		for (Integer nextFreq : frequencies) {
			Double nextProb = freq2Prob.get(nextFreq);
			List nextPhoneList = freq2Diphones.get(nextFreq);
			for (int i = 0; i < nextPhoneList.size(); i++) {
				out.print(nextPhoneList.get(i));
				out.print(" : ");
				out.print(nextFreq);
				out.print(", ");
				out.println(df.format(nextProb));
			}
		}
	}

	/**
	 * Print statistics of the selected sentences and a table of coverage development over time.
	 * <p>
	 * Both files are closed even if writing fails part-way.
	 * 
	 * @param distributionFile
	 *            the file to print the statistics to
	 * @param developmentFile
	 *            the file to print the coverage development to
	 * @param logDevelopment
	 *            if true, print development file
	 * @throws Exception
	 *             if one of the files cannot be opened for writing
	 */
	public void printSelectionDistribution(String distributionFile, String developmentFile, boolean logDevelopment)
			throws Exception {
		DecimalFormat df = new DecimalFormat("0.00000");
		PrintWriter disOut = new PrintWriter(new FileWriter(new File(distributionFile)));
		try {
			/* print settings */
			disOut.println("\nSettings of Coverage Definition:");
			disOut.println("simpleDiphones " + Boolean.toString(simpleDiphones));
			if (considerFrequency) {
				disOut.println("frequency " + frequencySetting);
			} else {
				disOut.println("frequency none");
			}
			disOut.println("considerSentenceLength " + Boolean.toString(considerSentenceLength));
			disOut.println("phoneLevelWeight " + phoneLevelWeight);
			disOut.println("diphoneLevelWeight " + diphoneLevelWeight);
			disOut.println("prosodyLevelWeight " + prosodyLevelWeight);
			disOut.println("maxSentenceLength " + maxSentLengthAllowed);
			disOut.println("minSentenceLength " + minSentLengthAllowed);
			disOut.println("divideWantedWeightBy " + wantedWeightDecrease);

			/* print results */
			disOut.println("\nResults:");
			disOut.println("Num sent in cover : " + numSentencesInCover);
			double avSentLength = (double) numSelectedFeatVects / (double) numSentencesInCover;
			disOut.println("Avg sent length : " + df.format(avSentLength));
			disOut.println("Max sent length : " + maxSentLengthInCover);
			disOut.println("Min sent length : " + minSentLengthInCover);

			/* print distribution info: achieved coverage, with the coverage possible with this corpus in brackets */
			double phoneCov = (double) phonesInCover.size() / (double) numPhoneValuesMinusIgnored;
			double simpleDiphoneCov = (double) simpleDiphonesInCover.size() / (double) numPossibleSimpleDiphones;
			double overallSimpleCov = (double) numSimpleFeatVectsInCover / (double) numSimpleLeaves;

			disOut.println("phones: " + df.format(phoneCov) + " (" + df.format(possiblePhoneCoverage) + ")");

			disOut.println("Simple Coverage:");
			disOut.println("phones: " + df.format(phoneCov) + " (" + df.format(possiblePhoneCoverage) + ")");
			disOut.println("diphones: " + df.format(simpleDiphoneCov) + " (" + df.format(possibleSimpleDiphoneCoverage) + ")");
			disOut.println("overall: " + df.format(overallSimpleCov) + " (" + df.format(possibleOverallSimpleCoverage) + ")");

			disOut.flush();
		} finally {
			disOut.close();
		}

		/* print coverage development over time */
		if (logDevelopment) {
			PrintWriter devOut = new PrintWriter(new FileWriter(new File(developmentFile)));
			try {
				devOut.println("\toverall coverage\tdiphone coverage\tphone coverage");
				for (int i = 0; i < overallCoverageInTime.size(); i++) {
					devOut.print(i + "\t" + df.format(overallCoverageInTime.get(i)) + "\t"
							+ df.format(diphoneCoverageInTime.get(i)) + "\t" + df.format(phoneCoverageInTime.get(i)) + "\n");
				}
				devOut.flush();
			} finally {
				devOut.close();
			}
		}
	}

	/**
	 * Print the settings and the statistics of the selected sentences to a log. The coverage lines show the achieved
	 * coverage, followed in brackets by the coverage that is possible with this corpus.
	 * 
	 * @param logOut
	 *            the PrintWriter to print to; it is neither flushed nor closed here
	 */
	public void printResultToLog(PrintWriter logOut) {
		DecimalFormat df = new DecimalFormat("0.00000");

		/* figures of the selection */
		double avSentLength = (double) numSelectedFeatVects / (double) numSentencesInCover;
		double phoneCov = (double) phonesInCover.size() / (double) numPhoneValuesMinusIgnored;
		double simpleDiphoneCov = (double) simpleDiphonesInCover.size() / (double) numPossibleSimpleDiphones;
		double overallSimpleCov = (double) numSimpleFeatVectsInCover / (double) numSimpleLeaves;

		/* print settings */
		String frequency = considerFrequency ? frequencySetting : "none";
		logOut.println("simpleDiphones " + simpleDiphones);
		logOut.println("frequency " + frequency);
		logOut.println("considerSentenceLength " + considerSentenceLength);
		logOut.println("phoneLevelWeight " + phoneLevelWeight);
		logOut.println("diphoneLevelWeight " + diphoneLevelWeight);
		logOut.println("prosodyLevelWeight " + prosodyLevelWeight);
		logOut.println("maxSentenceLength " + maxSentLengthAllowed);
		logOut.println("minSentenceLength " + minSentLengthAllowed);
		logOut.println("divideWantedWeightBy " + wantedWeightDecrease);

		/* print sentence statistics */
		logOut.println("\nNum sent in cover : " + numSentencesInCover);
		logOut.println("Avg sent length : " + df.format(avSentLength));
		logOut.println("Max sent length : " + maxSentLengthInCover);
		logOut.println("Min sent length : " + minSentLengthInCover);

		/* print distribution info */
		String possiblePhones = df.format(possiblePhoneCoverage);
		logOut.println("phones: " + df.format(phoneCov) + " (" + possiblePhones + ")");

		logOut.println("Simple Coverage:");
		String possibleDiphoneCov = df.format(possibleSimpleDiphoneCoverage);
		logOut.println("diphones: " + df.format(simpleDiphoneCov) + " (" + possibleDiphoneCov + ")");
		String possibleOverall = df.format(possibleOverallSimpleCoverage);
		logOut.println("overall: " + df.format(overallSimpleCov) + " (" + possibleOverall + ")");

	}

	/**
	 * Add the feature vectors for one sentence to the cover.
	 * <p>
	 * Every feature vector is counted in the leaf of the simple cover tree it belongs to, and its phone and diphone are
	 * recorded as covered. Afterwards the coverage development lists and the sentence length statistics of the cover are
	 * updated. The diphone and overall development lists are only extended when simple diphones are used.
	 * 
	 * @param features
	 *            the feature vectors to add, stored back to back with numTargetFeaturesUsed bytes per vector
	 */
	public void updateCover(byte[] features) {
		int numFeatVects = features.length / numTargetFeaturesUsed;
		// loop through the feature vectors
		for (int i = 0; i < numFeatVects; i++) {

			/* update simpleCover */
			CoverLeaf leaf = goDownTree(features, i, true);
			// if this is the first feature vector in this leaf,
			// one more leaf of the tree is now covered
			if (leaf.getNumFeatureVectors() == 0) {
				numSimpleFeatVectsInCover++;
			}
			leaf.addFeatureVector();

			// update coverage statistics
			String phone = possiblePhoneArray[getVectorValue(features, i, phoneFeatIndex)];
			String diphone = phone + "_" + possibleNextPhoneArray[getVectorValue(features, i, diphoneFeatIndex)];
			phonesInCover.add(phone);
			simpleDiphonesInCover.add(diphone);
		}

		// update phone coverage statistics
		double phoneCoverage = (double) phonesInCover.size() / (double) numPhoneValuesMinusIgnored;
		phoneCoverageInTime.add(Double.valueOf(phoneCoverage));

		// update diphone and overall coverage statistics
		if (simpleDiphones) {
			double diphoneCoverage = (double) simpleDiphonesInCover.size() / (double) numPossibleSimpleDiphones;
			double overallCoverage = (double) numSimpleFeatVectsInCover / (double) numSimpleLeaves;
			diphoneCoverageInTime.add(Double.valueOf(diphoneCoverage));
			overallCoverageInTime.add(Double.valueOf(overallCoverage));
		}

		// compute statistics of sentence length
		numSentencesInCover++;
		if (numFeatVects > maxSentLengthInCover) {
			maxSentLengthInCover = numFeatVects;
		}
		if (numFeatVects < minSentLengthInCover) {
			minSentLengthInCover = numFeatVects;
		}
		numSelectedFeatVects += numFeatVects;
	}

	/**
	 * Tell whether the cover already contains as many distinct simple diphones as there are simple diphone types.
	 * 
	 * @return true if the number of simple diphones in the cover is not below numSimpleDiphoneTypes
	 */
	public boolean reachedMaxSimpleDiphones() {
		final int coveredDiphones = simpleDiphonesInCover.size();
		return !(coveredDiphones < numSimpleDiphoneTypes);
	}

	/**
	 * Tell whether the cover has reached the maximum simple prosody coverage.
	 * 
	 * @return true if the number of covered simple feature vector types equals numSimpleFeatVectTypes
	 */
	public boolean reachedMaxSimpleProsody() {
		final boolean everyTypeCovered = (numSimpleFeatVectsInCover == numSimpleFeatVectTypes);
		return everyTypeCovered;
	}

	/**
	 * Get the usefulness of the given feature vectors.
	 * <p>
	 * The usefulness of a single feature vector is the sum of the usefulness values of the three nodes it passes on its way
	 * down the simple cover tree (phone level, diphone level, prosody level). The usefulness of a node is the product of its
	 * frequency weight (frequency / inverted frequency of the associated value in the corpus) and its wanted weight (how much
	 * an instance with the associated value is wanted in the cover). The result of this method is the average of these sums
	 * over all feature vectors of the sentence.
	 * 
	 * @param featureVectors
	 *            the feature vectors of one sentence, stored back to back with numTargetFeaturesUsed bytes per vector;
	 *            trailing bytes that do not form a complete vector are ignored
	 * @return the average usefulness per feature vector, or -1.0 if the input is useless: it contains no complete feature
	 *         vector, or sentence length is considered and the number of feature vectors is outside the allowed range
	 */
	public double usefulnessOfFVs(byte[] featureVectors) {
		int numFeatureVectors = featureVectors.length / numTargetFeaturesUsed;
		// Without a single complete feature vector the average below would be 0.0 / 0 = NaN;
		// report such input as useless instead.
		if (numFeatureVectors == 0) {
			return -1.0;
		}
		if (considerSentenceLength) {
			// too long sentences are useless
			if (numFeatureVectors > maxSentLengthAllowed) {
				return -1.0;
			}
			// too short sentences are useless as well
			if (numFeatureVectors < minSentLengthAllowed) {
				return -1.0;
			}
		}

		double usefulness = 0.0;
		// we cannot trust that all bytes in the feature vector are meaningful -- therefore,
		// it is not guaranteed that numFeatureVectors * numTargetFeaturesUsed == featureVectors.length!!
		for (int pos = 0, max = numFeatureVectors * numTargetFeaturesUsed; pos < max; pos += numTargetFeaturesUsed) {
			// go down to phone level
			CoverNode phoneNode = simpleCover.children[featureVectors[pos + phoneFeatIndex]];
			// go down to diphone level
			CoverNode diphoneNode = phoneNode.children[featureVectors[pos + diphoneFeatIndex]];
			// go down to prosody level
			CoverNode prosodyNode = diphoneNode.children[featureVectors[pos + prosodyIndex]];

			usefulness += phoneNode.usefulness + diphoneNode.usefulness + prosodyNode.usefulness;
		}
		return usefulness / (double) numFeatureVectors;
	}

	/**
	 * Get the coverage feature provider held by this coverage definition.
	 * 
	 * @return the value of the cfProvider field
	 */
	public CoverageFeatureProvider getCoverageFeatureProvider() {
		return this.cfProvider;
	}

	/**
	 * Read one feature value out of an array of feature vectors stored back to back.
	 * 
	 * @param vectors
	 *            the feature vectors, numTargetFeaturesUsed bytes per vector
	 * @param vectorIndex
	 *            the index of the feature vector
	 * @param valueIndex
	 *            the index of the value within the feature vector
	 * @return the byte at position vectorIndex * numTargetFeaturesUsed + valueIndex
	 */
	public byte getVectorValue(byte[] vectors, int vectorIndex, int valueIndex) {
		final int offset = vectorIndex * numTargetFeaturesUsed + valueIndex;
		return vectors[offset];
	}

	/**
	 * Write the corpus statistics and the simple cover tree to the given file in binary form.
	 * <p>
	 * The fields are written in exactly the order in which readCoverageBin reads them back, followed by the tree as written
	 * by writeTreeBin.
	 * 
	 * @param filename
	 *            the file to write to
	 * @throws Exception
	 *             if the file cannot be opened or written
	 */
	public void writeCoverageBin(String filename) throws Exception {
		DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(filename)));
		try {
			/* write all the relevant information */
			out.writeInt(numTokens);
			out.writeInt(numSimpleDiphoneTypes);
			out.writeInt(numSimpleFeatVectTypes);
			out.writeDouble(averageSentLength);
			out.writeInt(maxSentLength);
			out.writeInt(minSentLength);
			out.writeInt(numSimpleLeaves);
			out.writeDouble(possiblePhoneCoverage);
			out.writeDouble(possibleSimpleDiphoneCoverage);
			out.writeDouble(possibleOverallSimpleCoverage);
			out.writeInt(numSentences);
			/* write the coverage tree */
			writeTreeBin(out, simpleCover);
			out.flush();
		} finally {
			// release the file handle even if one of the writes above failed
			out.close();
		}
	}

	/**
	 * Write the structure of a cover tree in binary form.
	 * <p>
	 * Layout: the number of children of the root (one byte); then, for every phone child that is not in phonesToIgnore, its
	 * number of children (one byte); for each of those diphone children its number of children (one byte), followed by one
	 * int per leaf holding the leaf's maximum number of feature vectors. Weights are not written.
	 * 
	 * @param out
	 *            the output stream to write to
	 * @param cover
	 *            the root of the tree to write
	 * @throws IOException
	 *             if writing to the stream fails
	 */
	private void writeTreeBin(DataOutputStream out, CoverNode cover) throws IOException {
		// go down to phone level
		byte numChildren = cover.getNumChildren();
		out.writeByte(numChildren);
		for (byte i = 0; i < numChildren; i++) {
			// ignored phones are left out of the file
			if (phonesToIgnore.contains(Integer.valueOf(i))) {
				continue;
			}
			CoverNode phoneNode = cover.getChild(i);
			byte numNextChildren = phoneNode.getNumChildren();
			out.writeByte(numNextChildren);
			// go down to diphone level
			for (byte j = 0; j < numNextChildren; j++) {
				CoverNode diphoneNode = phoneNode.getChild(j);
				// go down to prosody level
				byte numNextNextChildren = diphoneNode.getNumChildren();
				out.writeByte(numNextNextChildren);
				for (byte k = 0; k < numNextNextChildren; k++) {
					CoverLeaf nextLeaf = (CoverLeaf) diphoneNode.getChild(k);
					out.writeInt(nextLeaf.maxNumFeatVects());
				}
			}
		}
	}

	/**
	 * Read the corpus statistics and the simple cover tree from the given binary file.
	 * <p>
	 * The fields are read in the order in which writeCoverageBin writes them, followed by the tree as read by readTreeBin.
	 * After the file has been read and closed, the number of tokens is printed to standard output.
	 * 
	 * @param filename
	 *            the file containing the cover sets
	 * @param idSentenceList
	 *            the id of the sentence list (not used by this method)
	 * @throws Exception
	 *             if the file cannot be opened or read
	 */
	public void readCoverageBin(String filename, int[] idSentenceList) throws Exception {
		DataInputStream in = new DataInputStream(new FileInputStream(new File(filename)));
		try {
			/* read all the relevant information */
			numTokens = in.readInt();
			numSimpleDiphoneTypes = in.readInt();
			numSimpleFeatVectTypes = in.readInt();
			averageSentLength = in.readDouble();
			maxSentLength = in.readInt();
			minSentLength = in.readInt();
			numSimpleLeaves = in.readInt();
			possiblePhoneCoverage = in.readDouble();
			possibleSimpleDiphoneCoverage = in.readDouble();
			possibleOverallSimpleCoverage = in.readDouble();
			numSentences = in.readInt();
			/* read the coverage tree */
			readTreeBin(in);
		} finally {
			// release the file handle even if one of the reads above failed
			in.close();
		}
		System.out.print("Num Tokens: " + numTokens + "\n");
	}

	/**
	 * Read a cover tree in the layout produced by writeTreeBin and install it as the simple cover.
	 * <p>
	 * The root gets a wanted weight of 0; phone, diphone and leaf nodes get phoneLevelWeight, diphoneLevelWeight and
	 * prosodyLevelWeight respectively. Positions of phones contained in phonesToIgnore are not present in the stream and stay
	 * null in the root's children array. After the tree has been built, computeRelativeFrequency is called on it with the
	 * current number of tokens.
	 * 
	 * @param in
	 *            the input stream to read from
	 * @throws Exception
	 *             if reading from the stream fails
	 */
	private void readTreeBin(DataInputStream in) throws Exception {
		byte numChildren = in.readByte();
		CoverNode cover = new CoverNode(numChildren, wantedWeightDecrease, 0.0);
		for (byte i = 0; i < numChildren; i++) {
			// ignored phones were left out when the tree was written
			if (phonesToIgnore.contains(Integer.valueOf(i))) {
				continue;
			}
			// phone level
			byte numDiphones = in.readByte();
			CoverNode phoneNode = new CoverNode(numDiphones, wantedWeightDecrease, phoneLevelWeight);
			cover.addChild(phoneNode, i);

			for (byte j = 0; j < numDiphones; j++) {
				// diphone level
				byte numLeaves = in.readByte();
				CoverNode diphoneNode = new CoverNode(numLeaves, wantedWeightDecrease, diphoneLevelWeight);
				phoneNode.addChild(diphoneNode, j);

				for (byte k = 0; k < numLeaves; k++) {
					// prosody level: each leaf stores its maximum number of feature vectors
					int numVectors = in.readInt();
					CoverLeaf leafNode = new CoverLeaf(wantedWeightDecrease, prosodyLevelWeight, numVectors);
					diphoneNode.addChild(leafNode, k);
				}
			}
		}
		computeRelativeFrequency(cover, numTokens);
		simpleCover = cover;
	}

	/**
	 * A node in the cover tree. A node represents a feature; it has one child slot per possible value of that feature.
	 * <p>
	 * Besides its children a node carries a wanted weight, a frequency weight and the divisor by which the wanted weight is
	 * decreased. The cached product of both weights is kept in {@code usefulness} and refreshed by every setter and by
	 * {@link #decreaseWantedWeight()}.
	 * 
	 * @author Anna Hunecke
	 *
	 */
	class CoverNode {

		/* children of this node, indexed by feature value */
		protected CoverNode[] children;
		/* number of children of this node */
		private byte numChildren;
		/* how much this node and its children are wanted in the cover */
		protected double wantedWeight;
		/* frequency/inverted frequency of the node in the corpus */
		protected double frequencyWeight;
		/* number by which the wantedWeight is divided */
		protected double wantedWeightDecrease;
		/* product of wantedWeight and frequencyWeight, cached purely for efficiency reasons */
		protected double usefulness;

		/**
		 * Build a new CoverNode without a children array. The frequency weight is set to 1 and the usefulness to 0.
		 */
		public CoverNode() {
			this.frequencyWeight = 1;
			this.usefulness = 0;
		}

		/**
		 * Build a new CoverNode with a frequency weight of 1, so that its usefulness starts out equal to the wanted weight.
		 * 
		 * @param numChildren
		 *            the number of children
		 * @param wantedWeightDecrease
		 *            the value by which the wanted weight is divided
		 * @param wantedWeight
		 *            the wanted weight
		 */
		public CoverNode(byte numChildren, double wantedWeightDecrease, double wantedWeight) {
			this.numChildren = numChildren;
			this.children = new CoverNode[numChildren];
			this.wantedWeightDecrease = wantedWeightDecrease;
			this.wantedWeight = wantedWeight;
			this.frequencyWeight = 1;
			this.usefulness = this.frequencyWeight * this.wantedWeight;
		}

		/**
		 * Build a new CoverNode with a wanted weight of 0, a frequency weight of 1 and therefore a usefulness of 0.
		 * 
		 * @param values
		 *            the number of values, i.e. the number of children
		 * @param wantedWeightDecrease
		 *            the wantedWeightDecrease
		 */
		public CoverNode(byte values, double wantedWeightDecrease) {
			this(values, wantedWeightDecrease, 0.0);
		}

		/**
		 * Store a child at the given position.
		 * 
		 * @param child
		 *            the child
		 * @param value
		 *            the position of the child in the children array
		 */
		public void addChild(CoverNode child, byte value) {
			this.children[value] = child;
		}

		/**
		 * Look up the child at the given position.
		 * 
		 * @param value
		 *            the position of the child in the children array
		 * @return the child (null, if there is no child at this position)
		 */
		public CoverNode getChild(byte value) {
			return this.children[value];
		}

		/**
		 * Get the number of children.
		 * 
		 * @return the number of children
		 */
		public byte getNumChildren() {
			return this.numChildren;
		}

		/**
		 * Get the wantedWeight.
		 * 
		 * @return the wantedWeight
		 */
		public double getWantedWeight() {
			return this.wantedWeight;
		}

		/**
		 * Set the wantedWeight and refresh the cached usefulness.
		 * 
		 * @param wantedWeight
		 *            the new wantedWeight
		 */
		public void setWantedWeight(double wantedWeight) {
			this.wantedWeight = wantedWeight;
			this.usefulness = this.wantedWeight * this.frequencyWeight;
		}

		/**
		 * Get the wantedWeightDecrease.
		 * 
		 * @return the wantedWeightDecrease
		 */
		public double getWantedWeightDecrease() {
			return this.wantedWeightDecrease;
		}

		/**
		 * Decrease the wantedWeight by dividing it by wantedWeightDecrease, then refresh the cached usefulness.
		 */
		public void decreaseWantedWeight() {
			this.wantedWeight /= this.wantedWeightDecrease;
			this.usefulness = this.frequencyWeight * this.wantedWeight;
		}

		/**
		 * Get the frequencyWeight.
		 * 
		 * @return the frequencyWeight
		 */
		public double getFrequencyWeight() {
			return this.frequencyWeight;
		}

		/**
		 * Set the frequencyWeight and refresh the cached usefulness.
		 * 
		 * @param frequencyWeight
		 *            the new frequencyWeight
		 */
		public void setFrequencyWeight(double frequencyWeight) {
			this.frequencyWeight = frequencyWeight;
			this.usefulness = this.frequencyWeight * this.wantedWeight;
		}

	}

	/**
	 * A leaf in the cover tree. Counts the feature vectors that belong to the path that leads to the leaf.
	 * <p>
	 * Both constructors go through the no-argument CoverNode constructor, so a new leaf has no children array and a cached
	 * usefulness of 0 until one of the weight setters recomputes it.
	 * 
	 * @author Anna Hunecke
	 *
	 */
	class CoverLeaf extends CoverNode {

		/* the number of feature vectors counted in this leaf so far */
		private int numFeatVects;
		/* the maximum number of feature vectors that could be in this leaf (according to the corpus) */
		private int maxNumFeatVects;

		/**
		 * Build a new cover leaf with a wanted weight of 0 and both counters at 0.
		 * 
		 * @param wantedWeightDecrease
		 *            the wantedWeightDecrease
		 */
		public CoverLeaf(double wantedWeightDecrease) {
			this(wantedWeightDecrease, 0.0, 0);
		}

		/**
		 * Build a new CoverLeaf with a frequency weight of 1 and no feature vectors counted yet.
		 * 
		 * @param wantedWeightDecrease
		 *            the wantedWeightDecrease
		 * @param wantedWeight
		 *            the wanted weight
		 * @param maxNumFeatVects
		 *            maximum number of feature vectors that can be collected in this leaf
		 */
		public CoverLeaf(double wantedWeightDecrease, double wantedWeight, int maxNumFeatVects) {
			super();
			this.numFeatVects = 0;
			this.maxNumFeatVects = maxNumFeatVects;
			this.wantedWeightDecrease = wantedWeightDecrease;
			this.wantedWeight = wantedWeight;
			this.frequencyWeight = 1;
		}

		/**
		 * Count one more feature vector in this leaf.
		 */
		public void addFeatureVector() {
			this.numFeatVects += 1;
		}

		/**
		 * Increase the maximum number of feature vectors by one (because we have seen a feature vector for this node in the
		 * corpus).
		 */
		public void addPossibleInstance() {
			this.maxNumFeatVects += 1;
		}

		/**
		 * Get the number of feature vectors counted in this leaf.
		 * 
		 * @return the number of feature vectors
		 */
		public int getNumFeatureVectors() {
			return this.numFeatVects;
		}

		/**
		 * Get the maximum number of feature vectors of this leaf.
		 * 
		 * @return the maximum number of feature vectors
		 */
		public int maxNumFeatVects() {
			return this.maxNumFeatVects;
		}
	}

	public static class CoverageStatistics {
		public Set coveredPhones;
		public Set allPhones;
		public Set coveredDiphones;
		public int numPossibleDiphones;
		public int numCoveredDiphonesWithProsody;
		public int numPossibleDiphonesWithProsody;
		public int numTokens;

		@Override
		public String toString() {
			StringWriter sw = new StringWriter();
			PrintWriter out = new PrintWriter(sw);
			out.println("\nSimple Coverage:");
			out.printf(Locale.US, "phones: %.5f\n", coveredPhones.size() / (double) allPhones.size());
			out.printf(Locale.US, "diphones: %.5f\n", coveredDiphones.size() / (double) numPossibleDiphones);
			out.printf(Locale.US, "overall: %.5f\n", numCoveredDiphonesWithProsody / (double) numPossibleDiphonesWithProsody);

			if (coveredPhones.size() < allPhones.size()) {
				out.println("The following phones are missing: ");
				for (String ph : allPhones) {
					if (!coveredPhones.contains(ph)) {
						out.print(ph + " ");
					}
				}
				out.println();
				out.println("The following phones are present: ");
				for (String ph : coveredPhones) {
					out.print(ph + " ");
				}
				out.println();
			}

			/* print out the diphone statistics */
			out.println("\n");
			out.println("Number of diphones seen      : " + numTokens);
			out.println("Number of different diphones : " + coveredDiphones.size() + " out of a theoretical "
					+ numPossibleDiphones);

			out.close();
			return sw.toString();

		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy