edu.stanford.nlp.parser.lexparser.HTKLatticeReader
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.trees.Tree;
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
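/**
 * Reads a word lattice produced by a speech recognizer from a simple text
 * format (see readInput for the expected layout) and cleans it up for
 * parsing: silences are removed, duplicate and redundant word entries are
 * merged, and empty nodes are dropped.
 *
 * A minimal usage sketch (the file name is only an example):
 *
 *   HTKLatticeReader lr = new HTKLatticeReader("input.lattice", HTKLatticeReader.USESUM);
 *   for (HTKLatticeReader.LatticeWord lw : lr.getLatticeWords()) {
 *     System.out.println(lw);
 *   }
 */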
public class HTKLatticeReader {
public final boolean DEBUG;
public final boolean PRETTYPRINT;
public static final boolean USESUM = true;
public static final boolean USEMAX = false;
private final boolean mergeType;
public static final String SILENCE = "<sil>"; // silence marker in the input lattice
private int numStates;
private List<LatticeWord> latticeWords;
private int[] nodeTimes;
private ArrayList<LatticeWord>[] wordsAtTime;
private ArrayList<LatticeWord>[] wordsStartAt;
private ArrayList<LatticeWord>[] wordsEndAt;
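/**
 * Reads the lattice. Leading lines starting with '#' are skipped as
 * comments. Each word line is expected to look like
 * "startNode endNode lm=LMSCORE,am=AMSCORE word(pronunciation)"
 * (the pronunciation index is optional); node numbers are converted to
 * be 0-based. The word lines are followed by a line giving the number of
 * nodes and then one "node t=time" line per node.
 */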
private void readInput(BufferedReader in) throws Exception {
// GET RID OF COMMENT LINES
String line = in.readLine();
while (line.trim().startsWith("#")) {
line = in.readLine();
}
// READ LATTICE
latticeWords = new ArrayList<>();
Pattern wordLinePattern = Pattern.compile("(\\d+)\\s+(\\d+)\\s+lm=(-?\\d+\\.\\d+),am=(-?\\d+\\.\\d+)\\s+([^( ]+)(?:\\((\\d+)\\))?.*");
Matcher wordLineMatcher = wordLinePattern.matcher(line);
while (wordLineMatcher.matches()) {
int startNode = Integer.parseInt(wordLineMatcher.group(1)) - 1;
int endNode = Integer.parseInt(wordLineMatcher.group(2)) - 1;
double lm = Double.parseDouble(wordLineMatcher.group(3));
double am = Double.parseDouble(wordLineMatcher.group(4));
String word = wordLineMatcher.group(5).toLowerCase();
String pronun = wordLineMatcher.group(6);
if (word.equalsIgnoreCase("<s>")) { // skip the sentence-start marker
line = in.readLine();
wordLineMatcher = wordLinePattern.matcher(line);
continue;
}
if (word.equalsIgnoreCase("</s>")) { // map the sentence-end marker to the parser's boundary symbol
word = Lexicon.BOUNDARY;
}
int pronunciation;
if (pronun == null) {
pronunciation = 0;
} else {
pronunciation = Integer.parseInt(pronun);
}
LatticeWord lw = new LatticeWord(word, startNode, endNode, lm, am, pronunciation, mergeType);
if (DEBUG) {
System.err.println(lw);
}
latticeWords.add(lw);
line = in.readLine();
wordLineMatcher = wordLinePattern.matcher(line);
}
// GET NUMBER OF NODES
numStates = Integer.parseInt(line.trim());
if (DEBUG) {
System.err.println(numStates);
}
// READ NODE TIMES
nodeTimes = new int[numStates];
Pattern nodeTimePattern = Pattern.compile("(\\d+)\\s+t=(\\d+)\\s*");
Matcher nodeTimeMatcher;
for (int i = 0; i < numStates; i++) {
nodeTimeMatcher = nodeTimePattern.matcher(in.readLine());
if (!nodeTimeMatcher.matches()) {
System.err.println("Input File Error");
System.exit(1);
}
// assert ((Integer.parseInt(nodeTimeMatcher.group(1))-1) == i) ;
nodeTimes[i] = Integer.parseInt(nodeTimeMatcher.group(2));
if (DEBUG) {
System.err.println(i + "\tt=" + nodeTimes[i]);
}
}
}
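/**
 * Collapses nodes that share a time stamp: every node in a run of equal
 * time stamps is remapped to the first node of the run, and the start and
 * end nodes of all lattice words are updated accordingly.
 */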
private void mergeSimultaneousNodes() {
int[] indexMap = new int[nodeTimes.length];
indexMap[0] = 0;
int prevNode = 0;
int prevTime = nodeTimes[0];
if (DEBUG) {
System.err.println(0 + " (" + nodeTimes[0] + ")" + "-->" + 0 + " (" + nodeTimes[0] + ") ++");
}
for (int i = 1; i < nodeTimes.length; i++) {
if (prevTime == nodeTimes[i]) {
indexMap[i] = prevNode;
if (DEBUG) {
System.err.println(i + " (" + nodeTimes[i] + ")" + "-->" + prevNode + " (" + nodeTimes[prevNode] + ") **");
}
} else {
indexMap[i] = prevNode = i;
prevTime = nodeTimes[i];
if (DEBUG) {
System.err.println(i + " (" + nodeTimes[i] + ")" + "-->" + prevNode + " (" + nodeTimes[prevNode] + ") ++");
}
}
}
for (LatticeWord lw : latticeWords) {
lw.startNode = indexMap[lw.startNode];
lw.endNode = indexMap[lw.endNode];
if (DEBUG) {
System.err.println(lw);
}
}
}
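/**
 * Drops nodes at which no word starts or ends, renumbering the remaining
 * nodes consecutively and updating both the lattice words and the
 * per-node index arrays to use the new numbering.
 */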
private void removeEmptyNodes() {
int[] indexMap = new int[numStates];
int j = 0;
for (int i = 0; i < numStates; i++) {
indexMap[i] = j;
if (wordsStartAt[i].size() != 0 || wordsEndAt[i].size() != 0) {
j++;
}
}
for (HTKLatticeReader.LatticeWord lw : latticeWords) {
wordsStartAt[lw.startNode].remove(lw);
wordsEndAt[lw.endNode].remove(lw);
for (int i = lw.startNode; i < lw.endNode; i++) {
wordsAtTime[i].remove(lw);
}
lw.startNode = indexMap[lw.startNode];
lw.endNode = indexMap[lw.endNode];
wordsStartAt[lw.startNode].add(lw);
wordsEndAt[lw.endNode].add(lw);
for (int i = lw.startNode; i < lw.endNode; i++) {
wordsAtTime[i].add(lw);
}
}
numStates = j;
ArrayList<LatticeWord>[] tmp = wordsAtTime;
wordsAtTime = new ArrayList[numStates];
System.arraycopy(tmp, 0, wordsAtTime, 0, numStates);
tmp = wordsStartAt;
wordsStartAt = new ArrayList[numStates];
System.arraycopy(tmp, 0, wordsStartAt, 0, numStates);
tmp = wordsEndAt;
wordsEndAt = new ArrayList[numStates];
System.arraycopy(tmp, 0, wordsEndAt, 0, numStates);
}
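/**
 * Builds the per-node indices: for every node, the words whose span
 * covers it, the words starting at it, and the words ending at it.
 */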
private void buildWordTimeArrays() {
buildWordsAtTime();
buildWordsStartAt();
buildWordsEndAt();
}
private void buildWordsAtTime() {
wordsAtTime = new ArrayList[numStates];
for (int i = 0; i < wordsAtTime.length; i++) {
wordsAtTime[i] = new ArrayList<>();
}
for (LatticeWord lw : latticeWords) {
for (int j = lw.startNode; j <= lw.endNode; j++) {
wordsAtTime[j].add(lw);
}
}
}
private void buildWordsStartAt() {
wordsStartAt = new ArrayList[numStates];
for (int i = 0; i < wordsStartAt.length; i++) {
wordsStartAt[i] = new ArrayList<>();
}
for (LatticeWord lw : latticeWords) {
wordsStartAt[lw.startNode].add(lw);
}
}
private void buildWordsEndAt() {
wordsEndAt = new ArrayList[numStates];
for (int i = 0; i < wordsEndAt.length; i++) {
wordsEndAt[i] = new ArrayList<>();
}
for (LatticeWord lw : latticeWords) {
wordsEndAt[lw.endNode].add(lw);
}
}
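/**
 * Repeatedly scans each node for pairs of distinct entries of the same
 * word whose spans overlap there and tries to merge them with
 * removeRedundentPair; keeps going until a full pass makes no change.
 */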
private void removeRedundency() {
boolean changed = true;
while (changed) {
changed = false;
for (ArrayList<LatticeWord> aWordsAtTime : wordsAtTime) {
if (aWordsAtTime.size() < 2) {
continue;
}
INNER:
for (int j = 0; j < aWordsAtTime.size() - 1; j++) {
LatticeWord w1 = aWordsAtTime.get(j);
for (int k = j + 1; k < aWordsAtTime.size(); k++) {
LatticeWord w2 = aWordsAtTime.get(k);
if (w1.word.equalsIgnoreCase(w2.word)) {
if (removeRedundentPair(w1, w2)) {
//int numMerged = mergeDuplicates();
//if (DEBUG) { System.err.println("merged " + numMerged + " identical entries."); }
changed = true;
//printWords();
//j--;
continue INNER;
//return;
}
}
}
}
}
}
}
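/**
 * Tries to merge two entries of the same word by moving the boundaries of
 * the entries involved onto the intersection of the two spans. Returns
 * false (and changes nothing) if an adjacent entry would be left with an
 * impossible span; otherwise adjusts the start and end times of the
 * affected entries (duplicates created along the way are merged inside
 * changeStartTimes/changeEndTimes) and returns true.
 */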
private boolean removeRedundentPair(LatticeWord w1, LatticeWord w2) {
if (DEBUG) {
System.err.println("trying to remove:");
System.err.println(w1);
System.err.println(w2);
}
int w1Start = w1.startNode;
int w2Start = w2.startNode;
int w1End = w1.endNode;
int w2End = w2.endNode;
// we must pick new start and end times that are legal
int newStart, oldStart;
if (w1Start < w2Start) {
newStart = w2Start;
oldStart = w1Start;
} else {
newStart = w1Start;
oldStart = w2Start;
}
int newEnd, oldEnd;
if (w1End < w2End) {
newEnd = w1End;
oldEnd = w2End;
} else {
newEnd = w2End;
oldEnd = w1End;
}
// check legality (illegality not guaranteed)
for (LatticeWord lw : wordsStartAt[oldStart]) {
if (lw.endNode < newStart || ((lw.endNode == newStart) && (lw.endNode != lw.startNode))) {
if (DEBUG) {
System.err.println("failed");
}
return false;
}
}
for (LatticeWord lw : wordsEndAt[oldEnd]) {
if (lw.startNode > newEnd || ((lw.startNode == newEnd) && (lw.endNode != lw.startNode))) {
if (DEBUG) {
System.err.println("failed");
}
return false;
}
}
// change start/end times of adjacent entries
changeStartTimes(wordsStartAt[oldEnd], newEnd);
changeEndTimes(wordsEndAt[oldStart], newStart);
// change start/end times of words adjacent to adjacent entries
changeStartTimes(wordsStartAt[oldStart], newStart);
changeEndTimes(wordsEndAt[oldEnd], newEnd);
if (DEBUG) {
System.err.println("succeeded");
}
return true;
}
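/**
 * Moves the start node of every word in the given list to newStartTime,
 * keeping latticeWords and the per-node indices in sync. If the move
 * makes an entry identical to an existing one, the two are merged (see
 * LatticeWord.merge) and the moved entry is discarded.
 */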
private void changeStartTimes(List<LatticeWord> words, int newStartTime) {
ArrayList<LatticeWord> toRemove = new ArrayList<>();
for (LatticeWord lw : words) {
latticeWords.remove(lw);
int oldStartTime = lw.startNode;
lw.startNode = newStartTime;
if (latticeWords.contains(lw)) {
if (DEBUG) {
System.err.println("duplicate found");
}
LatticeWord twin = latticeWords.get(latticeWords.indexOf(lw));
// assert (twin != lw) ;
lw.startNode = oldStartTime;
twin.merge(lw);
//wordsStartAt[lw.startNode].remove(lw);
toRemove.add(lw);
wordsEndAt[lw.endNode].remove(lw);
for (int i = lw.startNode; i <= lw.endNode; i++) {
wordsAtTime[i].remove(lw);
}
} else {
if (oldStartTime < newStartTime) {
for (int i = oldStartTime; i < newStartTime; i++) {
wordsAtTime[i].remove(lw);
}
} else {
for (int i = newStartTime; i < oldStartTime; i++) {
wordsAtTime[i].add(lw);
}
}
latticeWords.add(lw);
if (oldStartTime != newStartTime) {
//wordsStartAt[oldStartTime].remove(lw);
toRemove.add(lw);
wordsStartAt[newStartTime].add(lw);
}
}
}
words.removeAll(toRemove);
}
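/**
 * Mirror image of changeStartTimes: moves the end node of every word in
 * the given list to newEndTime, merging any duplicates this creates.
 */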
private void changeEndTimes(List<LatticeWord> words, int newEndTime) {
ArrayList<LatticeWord> toRemove = new ArrayList<>();
for (LatticeWord lw : words) {
latticeWords.remove(lw);
int oldEndTime = lw.endNode;
lw.endNode = newEndTime;
if (latticeWords.contains(lw)) {
if (DEBUG) {
System.err.println("duplicate found");
}
LatticeWord twin = latticeWords.get(latticeWords.indexOf(lw));
// assert (twin != lw) ;
lw.endNode = oldEndTime;
twin.merge(lw);
wordsStartAt[lw.startNode].remove(lw);
//wordsEndAt[lw.endNode].remove(lw);
toRemove.add(lw);
for (int i = lw.startNode; i <= lw.endNode; i++) {
wordsAtTime[i].remove(lw);
}
} else {
if (oldEndTime > newEndTime) {
for (int i = newEndTime + 1; i <= oldEndTime; i++) {
wordsAtTime[i].remove(lw);
}
} else {
for (int i = oldEndTime + 1; i <= newEndTime; i++) {
wordsAtTime[i].add(lw);
}
}
latticeWords.add(lw);
if (oldEndTime != newEndTime) {
//wordsEndAt[oldEndTime].remove(lw);
toRemove.add(lw);
wordsEndAt[newEndTime].add(lw);
}
}
}
words.removeAll(toRemove);
}
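/**
 * Removes silence entries: first the words ending where a silence starts
 * are stretched to the silence's end node so no gap is left, then the
 * silence entries themselves are deleted from all indices.
 */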
private void removeSilence() {
ArrayList<LatticeWord> silences = new ArrayList<>();
for (LatticeWord lw : latticeWords) {
if (lw.word.equalsIgnoreCase(SILENCE)) {
silences.add(lw);
}
}
for (LatticeWord lw : silences) {
//if (lw.endNode == numStates) {
changeEndTimes(wordsEndAt[lw.startNode], lw.endNode);
//} else {
//changeStartTimes(wordsStartAt[lw.endNode], lw.startNode);
//}
}
silences.clear();
for (HTKLatticeReader.LatticeWord lw : latticeWords) {
if (lw.word.equalsIgnoreCase(SILENCE)) {
silences.add(lw);
}
}
for (LatticeWord lw : silences) {
if (lw.word.equalsIgnoreCase(SILENCE)) {
latticeWords.remove(lw);
wordsStartAt[lw.startNode].remove(lw);
wordsEndAt[lw.endNode].remove(lw);
for (int j = lw.startNode; j <= lw.endNode; j++) {
wordsAtTime[j].remove(lw);
}
}
}
}
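/**
 * Merges entries that compare equal (same word, start node, and end node),
 * combining their acoustic scores according to mergeType and deleting the
 * extra copies; returns the number of entries removed.
 */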
private int mergeDuplicates() {
int numMerged = 0;
for (int i = 0; i < latticeWords.size() - 1; i++) {
LatticeWord first = latticeWords.get(i);
for (int j = i + 1; j < latticeWords.size(); j++) {
LatticeWord second = latticeWords.get(j);
if (first.equals(second)) {
if (DEBUG) {
System.err.println("removed duplicate");
}
first.merge(second);
latticeWords.remove(j);
wordsStartAt[second.startNode].remove(second);
wordsEndAt[second.endNode].remove(second);
for (int k = second.startNode; k <= second.endNode; k++) {
wordsAtTime[k].remove(second);
}
numMerged++;
j--;
}
}
}
return numMerged;
}
public void printWords() {
Collections.sort(latticeWords);
System.out.println("Words: ");
for (LatticeWord lw : latticeWords) {
System.out.println(lw);
}
}
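/**
 * Combined score of a lattice word: the acoustic score scaled by 100 plus
 * the language-model score. Not currently called in this class.
 */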
private double getProb(LatticeWord lw) {
return lw.am * 100.0 + lw.lm;
}
// private LatticeWord[][] nBest(int n) {
// }
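/**
 * Post-processes the raw lattice: builds the per-node indices, removes
 * silences, merges duplicate entries, removes redundant overlapping
 * entries, and finally drops empty nodes.
 */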
public void processLattice() {
// System.err.println(1);
buildWordTimeArrays();
//System.err.println(2);
removeSilence();
//System.err.println(3);
mergeDuplicates();
//System.err.println(4);
removeRedundency();
//System.err.println(5);
removeEmptyNodes();
//System.err.println(6);
if (PRETTYPRINT) {
printWords();
}
}
public HTKLatticeReader(String filename) throws Exception {
this(filename, USESUM, false, false);
}
public HTKLatticeReader(String filename, boolean mergeType) throws Exception {
this(filename, mergeType, false, false);
}
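/**
 * Reads and processes the lattice in the given file. mergeType chooses how
 * duplicate entries combine their acoustic scores: USESUM adds them,
 * USEMAX keeps the larger one. debug and prettyPrint control diagnostic
 * output.
 */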
public HTKLatticeReader(String filename, boolean mergeType, boolean debug, boolean prettyPrint) throws Exception {
this.DEBUG = debug;
this.PRETTYPRINT = prettyPrint;
this.mergeType = mergeType;
BufferedReader in = IOUtils.readerFromString(filename);
//System.err.println(-1);
readInput(in);
//System.err.println(0);
if (PRETTYPRINT) {
printWords();
}
processLattice();
}
public List<LatticeWord> getLatticeWords() {
return latticeWords;
}
public int getNumStates() {
return numStates;
}
public List<LatticeWord> getWordsOverSpan(int a, int b) {
ArrayList<LatticeWord> words = new ArrayList<>();
for (LatticeWord lw : wordsStartAt[a]) {
if (lw.endNode == b) {
words.add(lw);
}
}
return words;
}
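/**
 * Command-line entry point: reads the lattice file named by the first
 * argument and prints it (unless -noPrettyPrint is given). With
 * "-parser parserFile" the lattice is also parsed with the given
 * LexicalizedParser model and the best parse is printed.
 */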
public static void main(String[] args) throws Exception {
boolean mergeType = USESUM;
boolean prettyPrint = true;
boolean debug = false;
String parseGram = null;
String filename = args[0];
for (int i = 1; i < args.length; i++) {
if (args[i].equalsIgnoreCase("-debug")) {
debug = true;
} else if (args[i].equalsIgnoreCase("-useMax")) {
mergeType = USEMAX;
} else if (args[i].equalsIgnoreCase("-useSum")) {
mergeType = USESUM;
} else if (args[i].equalsIgnoreCase("-noPrettyPrint")) {
prettyPrint = false;
} else if (args[i].equalsIgnoreCase("-parser")) {
parseGram = args[++i];
} else {
System.err.println("unrecognized flag: " + args[i]);
System.err.println("usage: java LatticeReader [ -debug ] [ -useMax ] [ -useSum ] [ -noPrettyPrint ] [ -parser parserFile ]");
System.exit(0);
}
}
HTKLatticeReader lr = new HTKLatticeReader(filename, mergeType, debug, prettyPrint);
if (parseGram != null) {
Options op = new Options();
// TODO: these options all get clobbered by the Options object
// stored in the LexicalizedParser (unless it's a text file?)
op.doDep = false;
op.testOptions.maxLength = 80;
op.testOptions.maxSpanForTags = 80;
LexicalizedParser lp = LexicalizedParser.loadModel(parseGram, op);
// TODO: somehow merge this into ParserQuery instead of being
// LexicalizedParserQuery specific
LexicalizedParserQuery pq = lp.lexicalizedParserQuery();
pq.parse(lr);
Tree t = pq.getBestParse();
t.pennPrint();
}
//lr.processLattice();
}
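/**
 * A single word hypothesis in the lattice: the word, its start and end
 * nodes, its language-model and acoustic scores, and a pronunciation
 * index.
 */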
public static class LatticeWord implements Comparable<LatticeWord> {
public String word;
public int startNode, endNode;
public double lm, am;
public int pronunciation;
public final boolean mergeType;
public LatticeWord(String word, int startNode, int endNode, double lm, double am, int pronunciation, boolean mergeType) {
this.word = word;
this.startNode = startNode;
this.endNode = endNode;
this.lm = lm;
this.am = am;
this.pronunciation = pronunciation;
this.mergeType = mergeType;
}
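/**
 * Combines the acoustic scores of this entry and lw: under USEMAX both
 * keep the larger score, under USESUM both end up with the sum of the two
 * scores.
 */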
public void merge(LatticeWord lw) {
if (mergeType == USEMAX) {
am = Math.max(am, lw.am);
lw.am = am;
} else if (mergeType == USESUM) {
double tmp = lw.am;
lw.am += am;
am += tmp;
}
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(startNode).append("\t");
sb.append(endNode).append("\t");
sb.append("lm=").append(lm).append(",");
sb.append("am=").append(am).append("\t");
sb.append(word);//.append("(").append(pronunciation).append(")");
return sb.toString();
}
@Override
public boolean equals(Object o) {
if (!(o instanceof LatticeWord)) {
return false;
}
LatticeWord other = (LatticeWord) o;
if (!word.equalsIgnoreCase(other.word)) {
return false;
}
if (startNode != other.startNode) {
return false;
}
if (endNode != other.endNode) {
return false;
}
//if (pronunciation != other.pronunciation) { return false; }
return true;
}
@Override
public int compareTo(LatticeWord other) {
if (startNode < other.startNode) {
return -1;
} else if (startNode > other.startNode) {
return 1;
}
if (endNode < other.endNode) {
return -1;
} else if (endNode > other.endNode) {
return 1;
}
return 0;
}
}
}