// edu.stanford.nlp.ling.tokensregex.PhraseTable (Maven / Gradle / Ivy)

// Go to download
package edu.stanford.nlp.ling.tokensregex;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.*;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Table used to look up multi-word phrases.
 * This class provides functions for looking up all instances of known phrases in a document in an efficient manner.
 *
 * <p>Phrases can be added to the phrase table using
 * <ul>
 *   <li>readPhrases</li>
 *   <li>readPhrasesWithTagScores</li>
 *   <li>addPhrase</li>
 * </ul>
 *
 * <p>You can look up phrases in the table using
 * <ul>
 *   <li>get</li>
 *   <li>lookup</li>
 * </ul>
 *
 * <p>You can find phrases occurring in a piece of text using
 * <ul>
 *   <li>findAllMatches</li>
 *   <li>findNonOverlappingPhrases</li>
 * </ul>
 * @author Angel Chang
 */
public class PhraseTable implements Serializable
{
  /** Key under which a phrase is stored once all of its words have been consumed while walking the tree. */
  private static final String PHRASE_END = "";
  private static final long serialVersionUID = 1L;
  /**
   * Root of the lookup tree, keyed by word. Values are Phrase, Map (next level) or List nodes.
   * Stays null until a phrase is added (unless pre-sized by the constructor) and is reset to null by clear().
   */
  Map<String, Object> rootTree;

  /** If true, words are passed through StringUtils.normalize before being used as keys. */
  public boolean normalize = true;
  /** If true, words are lower-cased before being used as keys. */
  public boolean caseInsensitive = false;
  /** If true, punctuation characters (and whitespace around them) are stripped from every word. */
  public boolean ignorePunctuation = false;
  /** If true (and ignorePunctuation is false), words consisting of a single punctuation character are dropped. */
  public boolean ignorePunctuationTokens = true;
  public Annotator tokenizer;  // tokenizing annotator; when null, text is split with regular expressions instead

  /** Number of distinct phrases added to the table. */
  int nPhrases = 0;
  /** Number of phrase strings offered to the table (incremented on every insertion attempt). */
  int nStrings = 0;

  /**
   * Cache from raw word to its normalized form.
   * NOTE(review): the field is transient, so it is not written by Java serialization;
   * unless a readObject hook elsewhere restores it, it is null on a deserialized instance.
   */
  transient CacheMap<String, String> normalizedCache = new CacheMap<>(5000);

  /**
   * Creates an empty phrase table with the default settings.
   * The root tree is not allocated here.
   */
  public PhraseTable() {
  }

  /**
   * Creates an empty phrase table whose root map is allocated up front.
   *
   * @param initSize initial capacity handed to the root HashMap
   */
  public PhraseTable(int initSize) {
    this.rootTree = new HashMap<>(initSize);
  }

  /**
   * Creates an empty phrase table with explicit normalization settings.
   *
   * @param normalize whether words are run through StringUtils.normalize
   * @param caseInsensitive whether words are lower-cased
   * @param ignorePunctuation whether punctuation is stripped from words
   */
  public PhraseTable(boolean normalize, boolean caseInsensitive, boolean ignorePunctuation) {
    this.ignorePunctuation = ignorePunctuation;
    this.caseInsensitive = caseInsensitive;
    this.normalize = normalize;
  }

  /**
   * Tells whether the table holds no phrases.
   *
   * @return true when the phrase counter is zero
   */
  public boolean isEmpty() {
    return nPhrases == 0;
  }

  /**
   * Tells whether {@link #get(Object)} finds a phrase for the given key.
   *
   * @param key a String or WordList; any other type is never found
   * @return true if a phrase is found
   */
  public boolean containsKey(Object key) {
    Phrase found = get(key);
    return found != null;
  }

  /**
   * Looks up a phrase by key, dispatching on the runtime type of the key.
   *
   * @param key a String (looked up as phrase text) or a WordList
   * @return the matching phrase, or null if there is none or the key has another type
   */
  public Phrase get(Object key) {
    if (key instanceof String) {
      String text = (String) key;
      return lookup(text);
    }
    if (key instanceof WordList) {
      WordList words = (WordList) key;
      return lookup(words);
    }
    return null;
  }

  /**
   * Empties this table: drops the lookup tree and resets both counters to zero.
   */
  public void clear()
  {
    nStrings = 0;
    nPhrases = 0;
    rootTree = null;
  }

  /**
   * Replaces the normalization cache with a new one of the given size,
   * copying over the entries of the current cache.
   *
   * The cache field is transient, so it may be null on an instance restored through
   * Java serialization; in that case there is nothing to copy and a fresh cache is installed
   * (previously this threw a NullPointerException from putAll).
   *
   * @param cacheSize size passed to the new CacheMap
   */
  public void setNormalizationCacheSize(int cacheSize)
  {
    CacheMap<String, String> newNormalizedCache = new CacheMap<>(cacheSize);
    CacheMap<String, String> oldCache = normalizedCache;
    if (oldCache != null) {
      newNormalizedCache.putAll(oldCache);
    }
    normalizedCache = newNormalizedCache;
  }

  /*
   * Input functions to read in phrases to the table
   */

  /** Matches a single tab character; used as the column delimiter by the tab-delimited readers below. */
  private static final Pattern tabPattern = Pattern.compile("\t");

  /**
   * Reads phrases from a tab-delimited file and adds them to this table.
   * @param filename - Name of file
   * @param checkTag - Indicates if there is a tag column (assumed to be 2nd column)
   *                   If false, treats entire line as the phrase
   * @throws IOException if the file cannot be opened or read
   */
  public void readPhrases(String filename, boolean checkTag) throws IOException
  {
    final Pattern columnDelimiter = tabPattern;
    readPhrases(filename, checkTag, columnDelimiter);
  }

  /**
   * Reads phrases from a file whose column delimiter is given as a regular expression.
   * The regex is compiled once and the work is delegated to the Pattern-based overload.
   * @param filename - Name of file
   * @param checkTag - Indicates if there is a tag column (assumed to be 2nd column)
   *                   If false, treats entire line as the phrase
   * @param delimiterRegex - Regex for identifying column delimiter
   * @throws IOException if the file cannot be opened or read
   */
  public void readPhrases(String filename, boolean checkTag, String delimiterRegex) throws IOException
  {
    Pattern delimiterPattern = Pattern.compile(delimiterRegex);
    readPhrases(filename, checkTag, delimiterPattern);
  }

  /**
   * Reads phrases from a file, one phrase per line, and adds each to this table.
   *
   * When {@code checkTag} is true, each line is split into at most two columns on
   * {@code delimiterPattern}: the first column is the phrase and the remainder is its tag.
   * A line without a delimiter is added as an untagged phrase.
   * When {@code checkTag} is false, the entire line is the phrase.
   *
   * @param filename - Name of file
   * @param checkTag - Indicates if there is a tag column (assumed to be 2nd column)
   * @param delimiterPattern - Pattern identifying the column delimiter
   * @throws IOException if the file cannot be opened or read
   */
  public void readPhrases(String filename, boolean checkTag, Pattern delimiterPattern) throws IOException
  {
    Timing timer = new Timing();
    timer.doing("Reading phrases: " + filename);
    // try-with-resources: the reader is closed even if readLine or addPhrase throws
    try (BufferedReader br = IOUtils.getBufferedFileReader(filename)) {
      String line;
      while ((line = br.readLine()) != null) {
        if (checkTag) {
          // limit of 2 keeps any further delimiters inside the tag column
          String[] columns = delimiterPattern.split(line, 2);
          if (columns.length == 1) {
            addPhrase(columns[0]);
          } else {
            addPhrase(columns[0], columns[1]);
          }
        } else {
          addPhrase(line);
        }
      }
    }
    timer.done();
  }

  /**
   * Reads phrases where each phrase has a score of being associated with certain tags.
   * The file format is assumed to be
   *   phrase\ttag1 count\ttag2 count...
   * where the phrase and the tag fields are delimited by tabs, and each tag is separated
   * from its count by whitespace.
   * @param filename - Name of file
   * @throws IOException if the file cannot be opened or read
   */
  public void readPhrasesWithTagScores(String filename) throws IOException
  {
    final Pattern fieldDelimiter = tabPattern;
    final Pattern countDelimiter = whitespacePattern;
    readPhrasesWithTagScores(filename, fieldDelimiter, countDelimiter);
  }

  /**
   * Reads phrases with tag scores, taking both delimiters as regular expressions.
   * Both regexes are compiled and the work is delegated to the Pattern-based overload.
   *
   * @param filename - Name of file
   * @param fieldDelimiterRegex - Regex separating the phrase and the tag fields
   * @param countDelimiterRegex - Regex separating a tag from its count within a field
   * @throws IOException if the file cannot be opened or read
   */
  public void readPhrasesWithTagScores(String filename, String fieldDelimiterRegex,
                                    String countDelimiterRegex) throws IOException
  {
    Pattern fieldDelimiterPattern = Pattern.compile(fieldDelimiterRegex);
    Pattern countDelimiterPattern = Pattern.compile(countDelimiterRegex);
    readPhrasesWithTagScores(filename, fieldDelimiterPattern, countDelimiterPattern);
  }

  /**
   * Reads phrases with per-tag scores from a file and adds them to this table.
   *
   * Each line is split on {@code fieldDelimiterPattern}; the first field is the phrase and
   * every following field is split (into at most two parts) on {@code countDelimiterPattern}
   * into a tag and a numeric count. The counts are collected into a Counter that is stored
   * as the phrase data; the phrase itself is added without a tag.
   *
   * @param filename - Name of file
   * @param fieldDelimiterPattern - Pattern separating the phrase and the tag fields
   * @param countDelimiterPattern - Pattern separating a tag from its count within a field
   * @throws IOException if the file cannot be opened or read
   * @throws RuntimeException if a field does not consist of a tag and a parsable count;
   *         the message carries the file name and the 1-based line number
   */
  public void readPhrasesWithTagScores(String filename, Pattern fieldDelimiterPattern, Pattern countDelimiterPattern) throws IOException
  {
    Timing timer = new Timing();
    timer.doing("Reading phrases: " + filename);
    // try-with-resources: the reader is closed even when a malformed line aborts the read
    try (BufferedReader br = IOUtils.getBufferedFileReader(filename)) {
      String line;
      int lineno = 0;
      while ((line = br.readLine()) != null) {
        lineno++;  // 1-based, so error messages point at the line an editor would show
        String[] columns = fieldDelimiterPattern.split(line);
        String phrase = columns[0];
        // Pick map factory to use depending on number of tags we have
        MapFactory mapFactory = (columns.length < 20)?
                MapFactory.arrayMapFactory(): MapFactory.linkedHashMapFactory();
        Counter counts = new ClassicCounter<>(mapFactory);
        for (int i = 1; i < columns.length; i++) {
          String[] tagCount = countDelimiterPattern.split(columns[i], 2);
          if (tagCount.length == 2) {
            try {
              counts.setCount(tagCount[0], Double.parseDouble(tagCount[1]));
            } catch (NumberFormatException ex) {
              throw new RuntimeException("Error processing field " + i + ": '" + columns[i] +
                      "' from (" + filename + ":" + lineno + "): " + line, ex);
            }
          } else {
            throw new RuntimeException("Error processing field " + i + ": '" + columns[i] +
                    "' from (" + filename + ":" + lineno + "): " + line);
          }
        }
        addPhrase(phrase, null, counts);
      }
    }
    timer.done();
  }

  /**
   * Reads phrases from a tab-delimited file, taking the phrase and (optionally) the tag
   * from the given column positions.
   *
   * @param filename - Name of file
   * @param phraseColIndex - 0-based index of the column holding the phrase (must not be negative)
   * @param tagColIndex - 0-based index of the column holding the tag; a negative value means no tag
   * @throws IllegalArgumentException if {@code phraseColIndex} is negative
   * @throws IOException if the file cannot be opened or read
   */
  public void readPhrases(String filename, int phraseColIndex, int tagColIndex) throws IOException
  {
    if (phraseColIndex < 0) {
      throw new IllegalArgumentException("Invalid phraseColIndex " + phraseColIndex);
    }
    Timing timer = new Timing();
    timer.doing("Reading phrases: " + filename);
    // try-with-resources: the reader is closed even if a short line makes the column access throw
    try (BufferedReader br = IOUtils.getBufferedFileReader(filename)) {
      String line;
      while ((line = br.readLine()) != null) {
        String[] columns = tabPattern.split(line);
        String phrase = columns[phraseColIndex];
        String tag = (tagColIndex >= 0)? columns[tagColIndex]: null;
        addPhrase(phrase, tag);
      }
    }
    timer.done();
  }

  /**
   * Picks the longest phrase from a list, as judged by {@code Phrase.isLonger}.
   *
   * The element type is declared so that the typed for-each loop compiles; with a raw
   * {@code List} the loop variable could not be declared as {@code Phrase}.
   *
   * @param phrases phrases to choose from
   * @return the first phrase for which no later phrase reports itself longer,
   *         or null when the list is empty
   */
  public static Phrase getLongestPhrase(List<Phrase> phrases)
  {
    Phrase longest = null;
    for (Phrase phrase : phrases) {
      // a later phrase only replaces the current best if isLonger says so
      if (longest == null || phrase.isLonger(longest)) {
        longest = phrase;
      }
    }
    return longest;
  }

  /**
   * Splits phrase text into words.
   *
   * If a tokenizer annotator is configured, the text is annotated and the word of every
   * resulting token is returned. Otherwise a possessive {@code 's} (followed by whitespace or
   * end of text) is first separated from the preceding word by a space, and the text is then
   * split on runs of whitespace, underscores and hyphens.
   *
   * The token list is typed as {@code List<CoreLabel>} so that {@code word()} can be called
   * on its elements; with a raw list that call does not compile.
   *
   * @param phraseText text to split
   * @return the words of the text, in order
   */
  public String[] splitText(String phraseText)
  {
    String[] words;
    if (tokenizer != null) {
      Annotation annotation = new Annotation(phraseText);
      tokenizer.annotate(annotation);
      List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
      words = new String[tokens.size()];
      for (int i = 0; i < tokens.size(); i++) {
        words[i] = tokens.get(i).word();
      }
    } else {
      // "$1" re-inserts whatever followed the possessive (whitespace or nothing)
      phraseText = possPattern.matcher(phraseText).replaceAll(" 's$1");
      words = delimPattern.split(phraseText);
    }
    return words;
  }

  /**
   * Splits phrase text into words and wraps them in a word list, without normalizing them.
   *
   * @param phraseText text to split
   * @return word list over the words produced by {@link #splitText(String)}
   */
  public WordList toWordList(String phraseText)
  {
    return new StringList(splitText(phraseText));
  }

  /**
   * Splits phrase text into words, normalizes each word and wraps the result in a word list.
   * Words whose normalized form is empty are left out.
   *
   * @param phraseText text to split and normalize
   * @return word list of the non-empty normalized words, in order
   */
  public WordList toNormalizedWordList(String phraseText)
  {
    String[] rawWords = splitText(phraseText);
    List<String> kept = new ArrayList<>(rawWords.length);
    for (int k = 0; k < rawWords.length; k++) {
      String normalizedWord = getNormalizedForm(rawWords[k]);
      if (!normalizedWord.isEmpty()) {
        kept.add(normalizedWord);
      }
    }
    return new StringList(kept);
  }

  /**
   * Adds every phrase text of the collection to this table, without a tag.
   *
   * The element type is declared so that the {@code String} for-each loop compiles.
   *
   * @param phraseTexts phrase texts to add
   */
  public void addPhrases(Collection<String> phraseTexts)
  {
    for (String phraseText : phraseTexts) {
      addPhrase(phraseText, null);
    }
  }

  /**
   * Adds every entry of the map to this table: the key is the phrase text, the value its tag.
   *
   * The key and value types are declared so that the loop compiles and the call resolves to
   * {@code addPhrase(String, String)}; entries are iterated directly instead of looking up
   * each key again.
   *
   * @param taggedPhraseTexts map from phrase text to tag
   */
  public void addPhrases(Map<String, String> taggedPhraseTexts)
  {
    for (Map.Entry<String, String> entry : taggedPhraseTexts.entrySet()) {
      addPhrase(entry.getKey(), entry.getValue());
    }
  }

  /**
   * Adds a phrase without a tag.
   *
   * @param phraseText text of the phrase
   * @return the result of {@code addPhrase(phraseText, null)}
   */
  public boolean addPhrase(String phraseText)
  {
    final String noTag = null;
    return addPhrase(phraseText, noTag);
  }

  /**
   * Adds a tagged phrase without any extra phrase data.
   *
   * @param phraseText text of the phrase
   * @param tag tag for the phrase (may be null)
   * @return the result of {@code addPhrase(phraseText, tag, null)}
   */
  public boolean addPhrase(String phraseText, String tag)
  {
    final Object noData = null;
    return addPhrase(phraseText, tag, noData);
  }

  /**
   * Adds a phrase given as text: the text is split and normalized into a word list,
   * which is then inserted together with the original text, tag and data.
   *
   * @param phraseText text of the phrase
   * @param tag tag for the phrase (may be null)
   * @param phraseData arbitrary data to store with the phrase (may be null)
   * @return the result of the word-list based insertion
   */
  public boolean addPhrase(String phraseText, String tag, Object phraseData)
  {
    return addPhrase(phraseText, tag, toNormalizedWordList(phraseText), phraseData);
  }

  /**
   * Adds a phrase given as a list of tokens, without a tag.
   *
   * @param tokens tokens of the phrase
   * @return the result of {@code addPhrase(tokens, null)}
   */
  public boolean addPhrase(List tokens)
  {
    final String noTag = null;
    return addPhrase(tokens, noTag);
  }

  /**
   * Adds a tagged phrase given as a list of tokens, without any extra phrase data.
   *
   * @param tokens tokens of the phrase
   * @param tag tag for the phrase (may be null)
   * @return the result of {@code addPhrase(tokens, tag, null)}
   */
  public boolean addPhrase(List tokens, String tag)
  {
    final Object noData = null;
    return addPhrase(tokens, tag, noData);
  }

  /**
   * Adds a phrase given as a list of tokens. The tokens are used as the word list as they are
   * (no normalization is applied here), and the phrase text is the tokens joined by single spaces.
   *
   * @param tokens tokens of the phrase
   * @param tag tag for the phrase (may be null)
   * @param phraseData arbitrary data to store with the phrase (may be null)
   * @return the result of the word-list based insertion
   */
  public boolean addPhrase(List tokens, String tag, Object phraseData)
  {
    WordList words = new StringList(tokens);
    String joinedText = StringUtils.join(tokens, " ");
    return addPhrase(joinedText, tag, words, phraseData);
  }

  // Largest size a per-word list node may reach; when an insertion makes a list
  // bigger than this, the list is converted into a map (see the tree-walking addPhrase).
  private int MAX_LIST_SIZE = 20;
  /**
   * Inserts a phrase, starting at the root of the lookup tree.
   * The root map is created on first use.
   *
   * @param phraseText original text of the phrase
   * @param tag tag for the phrase (may be null)
   * @param wordList words under which the phrase is indexed
   * @param phraseData arbitrary data to store with the phrase (may be null)
   * @return the result of the tree insertion starting at word 0
   */
  private synchronized boolean addPhrase(String phraseText, String tag, WordList wordList, Object phraseData)
  {
    Map root = rootTree;
    if (root == null) {
      root = new HashMap<>();
      rootTree = root;
    }
    return addPhrase(root, phraseText, tag, wordList, phraseData, 0);
  }

  /**
   * Files an already-built phrase into the given map under its word at {@code wordIndex}
   * (or under the end-of-phrase key when the phrase has no word at that index).
   *
   * What happens depends on what the map already holds for that key:
   * nothing - the phrase is stored directly; a phrase - both are put in a new list;
   * a map - the insertion recurses one word further; a list - the phrase is appended.
   *
   * @param tree map to insert into
   * @param phrase phrase to insert
   * @param wordIndex index of the phrase word used as key in {@code tree}
   * @throws RuntimeException if the map holds a value of an unexpected type for the key
   */
  private synchronized void addPhrase(Map tree, Phrase phrase, int wordIndex)
  {
    final String key;
    if (wordIndex < phrase.wordList.size()) {
      key = phrase.wordList.getWord(wordIndex);
    } else {
      key = PHRASE_END;
    }
    Object existing = tree.get(key);
    if (existing == null) {
      tree.put(key, phrase);
      return;
    }
    if (existing instanceof Phrase) {
      // two phrases now share this key: keep both in a small list
      List pair = new ArrayList(2);
      pair.add(phrase);
      pair.add(existing);
      tree.put(key, pair);
      return;
    }
    if (existing instanceof Map) {
      addPhrase((Map) existing, phrase, wordIndex+1);
      return;
    }
    if (existing instanceof List) {
      ((List) existing).add(phrase);
      return;
    }
    throw new RuntimeException("Unexpected class " + existing.getClass() + " while adding word "
            + wordIndex + "(" + key + ") in phrase " + phrase.getText());
  }

  /**
   * Inserts a phrase into the lookup tree, walking the words of {@code wordList} from
   * {@code wordIndex} on and descending through the tree as long as map nodes are found.
   *
   * A value stored in a tree map is one of: a Phrase, a Map (the next level, keyed by the
   * next word), or a List of Phrases that may also contain a single Map.
   * If an existing phrase has exactly the same words, {@code phraseText} is offered to it as
   * an additional form instead of creating a new phrase.
   *
   * Side effects: {@code nStrings} is incremented on every call; {@code nPhrases} is
   * incremented only when a new Phrase object was created.
   *
   * @param tree map at which to start the walk
   * @param phraseText original text of the phrase
   * @param tag tag for the phrase (may be null)
   * @param wordList words under which the phrase is indexed
   * @param phraseData arbitrary data to store with the phrase (may be null)
   * @param wordIndex index of the first word of {@code wordList} to look up in {@code tree}
   * @return true if a new phrase was created, or if {@code Phrase.addForm} returned true
   *         for an existing phrase with the same words
   * @throws RuntimeException if a node of an unexpected type, or a list with more than one
   *         map, is encountered
   */
  private synchronized boolean addPhrase(Map tree,
                                         String phraseText, String tag, WordList wordList, Object phraseData, int wordIndex)
  {
    // Find place to insert this item
    boolean phraseAdded = false;  // True if this phrase was successfully added to the phrase table
    boolean newPhraseAdded = false;    // True if the phrase was a new phrase
    boolean oldPhraseNewFormAdded = false;      // True if the phrase already exists, and this was new form added to old phrase
    for (int i = wordIndex; i < wordList.size(); i++) {
      String word = Interner.globalIntern(wordList.getWord(i));
      Object node = tree.get(word);
      if (node == null) {
        // nothing stored under this word yet: insert here
        Phrase phrase = new Phrase(wordList, phraseText, tag, phraseData);
        tree.put(word, phrase);
        phraseAdded = true;
        newPhraseAdded = true;
      } else if (node instanceof Phrase) {
        // a single phrase lives here: check rest of the phrase matches
        Phrase oldphrase = (Phrase) node;
        int matchedTokenEnd = checkWordListMatch(
          oldphrase, wordList, 0, wordList.size(), i+1, true);
        if (matchedTokenEnd >= 0) {
          // same words: record the text as another form of the existing phrase
          oldPhraseNewFormAdded = oldphrase.addForm(phraseText);
        } else {
          // create list with this phrase and other and put it here
          Phrase newphrase = new Phrase(wordList, phraseText, tag, phraseData);
          List list = new ArrayList(2);
          list.add(oldphrase);
          list.add(newphrase);
          tree.put(word, list);
          newPhraseAdded = true;
        }
        phraseAdded = true;
      } else if (node instanceof Map) {
        // descend one level; the next loop iteration looks up the next word there
        tree = (Map) node;
      } else if (node instanceof List) {
        // Search through list for matches to word (at this point, the table is small, so no Map)
        List lookupList = (List) node;
        int nMaps = 0;
        for (Object obj:lookupList) {
          if (obj instanceof Phrase) {
            // check rest of the phrase matches
            Phrase oldphrase = (Phrase) obj;
            int matchedTokenEnd = checkWordListMatch(
              oldphrase, wordList, 0, wordList.size(), i, true);
            if (matchedTokenEnd >= 0) {
              oldPhraseNewFormAdded = oldphrase.addForm(phraseText);
              phraseAdded = true;
              break;
            }
          } else if (obj instanceof Map) {
            // a list may carry at most one map; if present, the walk continues in it
            if (nMaps == 1) {
              throw new RuntimeException("More than one map in list while adding word "
                      + i + "(" + word + ") in phrase " + phraseText);
            }
            tree = (Map) obj;
            nMaps++;
          } else  {
            throw new RuntimeException("Unexpected class in list " + obj.getClass() + " while adding word "
                    + i + "(" + word + ") in phrase " + phraseText);
          }
        }
        if (!phraseAdded && nMaps == 0) {
          // no equal phrase and no map to descend into: add to list
          Phrase newphrase = new Phrase(wordList, phraseText, tag, phraseData);
          lookupList.add(newphrase);
          newPhraseAdded = true;
          phraseAdded = true;
          if (lookupList.size() > MAX_LIST_SIZE) {
            // convert lookupList (should consist only of phrases) to map
            Map newMap = new HashMap(lookupList.size());
            for (Object obj:lookupList) {
              if (obj instanceof Phrase) {
                Phrase oldphrase = (Phrase) obj;
                // re-file each phrase under its next word (index i+1)
                addPhrase(newMap, oldphrase, i+1);
              } else  {
                throw new RuntimeException("Unexpected class in list " + obj.getClass() + " while converting list to map");
              }
            }
            tree.put(word,newMap);
          }
        }
      } else {
        throw new RuntimeException("Unexpected class in list " + node.getClass() + " while adding word "
                + i + "(" + word + ") in phrase " + phraseText);
      }
      if (phraseAdded) {
        break;
      }
    }
    if (!phraseAdded) {
      // All words were consumed by descending through maps (or there were no words at all)
      if (wordList.size() == 0) {
        System.err.println("WARNING: " + phraseText + " not added");
      } else {
        // the phrase ends at this level: it belongs under the end-of-phrase key
        Phrase oldphrase = (Phrase) tree.get(PHRASE_END);
        if (oldphrase != null) {
          int matchedTokenEnd = checkWordListMatch(
                  oldphrase, wordList, 0, wordList.size(), wordList.size(), true);
          if (matchedTokenEnd >= 0) {
            oldPhraseNewFormAdded = oldphrase.addForm(phraseText);
          } else {
            // create list with this phrase and other and put it here
            Phrase newphrase = new Phrase(wordList, phraseText, tag, phraseData);
            List list = new ArrayList(2);
            list.add(oldphrase);
            list.add(newphrase);
            tree.put(PHRASE_END, list);
            newPhraseAdded = true;
          }
        } else {
          Phrase newphrase = new Phrase(wordList, phraseText, tag, phraseData);
          tree.put(PHRASE_END, newphrase);
          newPhraseAdded = true;
        }
      }
    }
    // nStrings counts every call; nPhrases only calls that created a new Phrase
    if (newPhraseAdded) {
      nPhrases++;
      nStrings++;
    } else {
      nStrings++;
    }
    return (newPhraseAdded || oldPhraseNewFormAdded);
  }

  /**
   * Returns the normalized form of a word, using the normalization cache to avoid
   * recomputing forms that were seen before.
   *
   * The cache field is transient, so it may be null on an instance restored through Java
   * serialization; it is recreated lazily here (with the same size as the field initializer)
   * instead of failing with a NullPointerException.
   *
   * @param word word to normalize
   * @return normalized form of the word (may be the empty string)
   */
  public String getNormalizedForm(String word)
  {
    CacheMap<String, String> cache = normalizedCache;
    if (cache == null) {
      synchronized (this) {
        // re-check under the lock so only one thread recreates the cache
        if (normalizedCache == null) {
          normalizedCache = new CacheMap<>(5000);
        }
        cache = normalizedCache;
      }
    }
    String normalized = cache.get(word);
    if (normalized == null) {
      normalized = createNormalizedForm(word);
      synchronized (this) {
        normalizedCache.put(word, normalized);
      }
    }
    return normalized;
  }

  // One punctuation character, optionally surrounded by whitespace (used when normalizing words)
  private static final Pattern punctWhitespacePattern = Pattern.compile("\\s*(\\p{Punct})\\s*");
  // A run of whitespace (also the default tag/count delimiter in readPhrasesWithTagScores)
  private static final Pattern whitespacePattern = Pattern.compile("\\s+");
  // Word delimiter used by splitText when no tokenizer is set: whitespace, underscores and hyphens
  private static final Pattern delimPattern = Pattern.compile("[\\s_-]+");
  // Possessive 's followed by whitespace or end of text; group 1 captures what follows it
  private static final Pattern possPattern = Pattern.compile("'s(\\s+|$)");
  /**
   * Computes the normalized form of a word according to this table's settings.
   *
   * In order: applies StringUtils.normalize if {@code normalize} is set; lower-cases if
   * {@code caseInsensitive} is set; then either strips every punctuation character together
   * with surrounding whitespace ({@code ignorePunctuation}), or blanks out a word that consists
   * of a single punctuation character ({@code ignorePunctuationTokens}); finally removes all
   * remaining whitespace.
   *
   * Lower-casing uses {@code Locale.ROOT} so that the keys stored in the table do not depend
   * on the default locale of the JVM (e.g. the Turkish dotless i).
   *
   * @param word word to normalize
   * @return normalized form, possibly empty
   */
  private String createNormalizedForm(String word)
  {
    if (normalize) {
      word = StringUtils.normalize(word);
    }
    if (caseInsensitive) {
      word = word.toLowerCase(Locale.ROOT);
    }
    if (ignorePunctuation) {
      word = punctWhitespacePattern.matcher(word).replaceAll("");
    } else if (ignorePunctuationTokens) {
      // matches() must cover the whole word, so only pure punctuation tokens are dropped
      if (punctWhitespacePattern.matcher(word).matches()) {
        word = "";
      }
    }
    word = whitespacePattern.matcher(word).replaceAll("");
    return word;
  }

  /**
   * Looks up a phrase by its text. The text is split into words but not normalized.
   *
   * @param phrase text of the phrase
   * @return the matching phrase, or null if there is none
   */
  public Phrase lookup(String phrase)
  {
    WordList words = toWordList(phrase);
    return lookup(words);
  }

  /**
   * Looks up a phrase by its text after splitting and normalizing the text.
   *
   * @param phrase text of the phrase
   * @return the matching phrase, or null if there is none
   */
  public Phrase lookupNormalized(String phrase)
  {
    WordList normalizedWords = toNormalizedWordList(phrase);
    return lookup(normalizedWords);
  }

  /**
   * Looks up the phrase whose words are exactly those of the given word list.
   *
   * The tree is walked one word at a time. Map nodes are descended into; Phrase nodes and
   * Phrases inside List nodes are compared against the whole word list (the match must end
   * exactly at the end of the list).
   *
   * @param wordList words of the phrase to find
   * @return the matching phrase, or null if the word list is null, the table has no tree,
   *         or no phrase matches
   * @throws RuntimeException if a node of an unexpected type, or a list with more than one
   *         map, is encountered
   */
  public Phrase lookup(WordList wordList)
  {
    if (wordList == null || rootTree == null) return null;
    Map tree = rootTree;
    for (int i = 0; i < wordList.size(); i++) {
      String word = wordList.getWord(i);
      Object node = tree.get(word);
      if (node == null) {
        // no entry for this word at this level
        return null;
      } else if (node instanceof Phrase) {
        Phrase phrase = (Phrase) node;
        int matchedTokenEnd = checkWordListMatch(
          phrase, wordList, 0, wordList.size(), i, true);

        if (matchedTokenEnd >= 0) {
          return phrase;
        }
        // NOTE(review): on a mismatch the loop goes on to the next word with the same map
      } else if (node instanceof Map) {
        // descend one level; the next iteration looks up the next word there
        tree = (Map) node;
      } else if (node instanceof List) {
        // Search through list for matches to word (at this point, the table is small, so no Map)
        List lookupList = (List) node;
        int nMaps = 0;
        for (Object obj:lookupList) {
          if (obj instanceof Phrase) {
            // check rest of the phrase matches
            Phrase phrase = (Phrase) obj;
            int matchedTokenEnd = checkWordListMatch(
              phrase, wordList, 0, wordList.size(), i, true);

            if (matchedTokenEnd >= 0) {
              return phrase;
            }
          } else if (obj instanceof Map) {
            // a list may carry at most one map; if present, the walk continues in it
            if (nMaps == 1) {
              throw new RuntimeException("More than one map in list while looking up word "
                      + i + "(" + word + ") in phrase " + wordList.toString());
            }
            tree = (Map) obj;
            nMaps++;
          } else  {
            throw new RuntimeException("Unexpected class in list " + obj.getClass() + " while looking up word "
                    + i + "(" + word + ") in phrase " + wordList.toString());
          }
        }
        if (nMaps == 0) {
          // no phrase in the list matched and there is nowhere to descend
          return null;
        }
      } else {
        throw new RuntimeException("Unexpected class in list " + node.getClass() + " while looking up word "
                + i + "(" + word + ") in phrase " + wordList.toString());
      }
    }
    // every word was consumed: the phrase, if any, sits under the end-of-phrase key
    Phrase phrase = (Phrase) tree.get(PHRASE_END);
    if (phrase != null) {
      int matchedTokenEnd = checkWordListMatch(
        phrase, wordList, 0, wordList.size(), wordList.size(), true);
      return (matchedTokenEnd >= 0)? phrase:null;
    } else {
      return null;
    }
  }

  /**
   * Given a segment of text, returns the list of spans (PhraseMatch) that correspond
   *  to a phrase in the table. The text is split and normalized first, so the spans
   *  are positions in the normalized word list.
   * @param text Input text to search over
   * @return List of all matched spans
   */
  public List findAllMatches(String text)
  {
    WordList normalizedTokens = toNormalizedWordList(text);
    int end = normalizedTokens.size();
    // tokens are already normalized, so no further normalization is requested
    return findAllMatches(normalizedTokens, 0, end, false);
  }

  /**
   * Given a list of tokens, returns the list of spans (PhraseMatch) that correspond
   *  to a phrase in the table. The tokens are normalized before matching.
   * @param tokens List of tokens to search over
   * @return List of all matched spans
   */
  public List findAllMatches(WordList tokens)
  {
    final int tokenCount = tokens.size();
    return findAllMatches(tokens, 0, tokenCount, true);
  }

  /**
   * Given a segment of text, returns the list of spans (PhraseMatch) that correspond
   *  to a phrase in the table (filtered by the list of acceptable phrases).
   *  The text is split and normalized first.
   * @param acceptablePhrases - What phrases to look for (need to be subset of phrases already in table)
   * @param text Input text to search over
   * @return List of all matched spans
   */
  public List findAllMatches(List acceptablePhrases, String text)
  {
    WordList normalizedTokens = toNormalizedWordList(text);
    int end = normalizedTokens.size();
    // tokens are already normalized, so no further normalization is requested
    return findAllMatches(acceptablePhrases, normalizedTokens, 0, end, false);
  }

  /**
   * Given a list of tokens, returns the list of spans (PhraseMatch) that correspond
   *  to a phrase in the table (filtered by the list of acceptable phrases).
   *  The tokens are normalized before matching.
   * @param acceptablePhrases - What phrases to look for (need to be subset of phrases already in table)
   * @param tokens List of tokens to search over
   * @return List of all matched spans
   */
  public List findAllMatches(List acceptablePhrases, WordList tokens)
  {
    final int tokenCount = tokens.size();
    return findAllMatches(acceptablePhrases, tokens, 0, tokenCount, true);
  }

  /**
   * Finds all phrase matches within a range of tokens, without restricting the phrases.
   *
   * @param tokens tokens to search over
   * @param tokenStart index of the first token of the range
   * @param tokenEnd index just past the last token of the range
   * @param needNormalization whether the tokens still have to be normalized
   * @return List of all matched spans
   */
  public List findAllMatches(WordList tokens,
                                          int tokenStart, int tokenEnd,
                                          boolean needNormalization)
  {
    final boolean findAll = true;
    final boolean matchEnd = false;  // a match does not have to end exactly at tokenEnd
    return findMatches(null, tokens, tokenStart, tokenEnd, needNormalization, findAll, matchEnd);
  }

  /**
   * Finds all phrase matches within a range of tokens, restricted to the acceptable phrases.
   *
   * @param acceptablePhrases phrases to look for
   * @param tokens tokens to search over
   * @param tokenStart index of the first token of the range
   * @param tokenEnd index just past the last token of the range
   * @param needNormalization whether the tokens still have to be normalized
   * @return List of all matched spans
   */
  public List findAllMatches(List acceptablePhrases,
                                          WordList tokens,
                                          int tokenStart, int tokenEnd,
                                          boolean needNormalization)
  {
    final boolean findAll = true;
    final boolean matchEnd = false;  // a match does not have to end exactly at tokenEnd
    return findMatches(acceptablePhrases, tokens, tokenStart, tokenEnd, needNormalization, findAll, matchEnd);
  }

  /**
   * Finds phrase matches in a piece of text without asking for all matches
   * (the search is not restarted at later token positions).
   * The text is split and normalized first.
   *
   * @param text Input text to search over
   * @return List of matched spans
   */
  public List findMatches(String text)
  {
    WordList normalizedTokens = toNormalizedWordList(text);
    int end = normalizedTokens.size();
    // tokens are already normalized, so no further normalization is requested
    return findMatches(normalizedTokens, 0, end, false);
  }

  /**
   * Finds phrase matches in a list of tokens without asking for all matches.
   * The tokens are normalized before matching.
   *
   * @param tokens List of tokens to search over
   * @return List of matched spans
   */
  public List findMatches(WordList tokens)
  {
    final int tokenCount = tokens.size();
    return findMatches(tokens, 0, tokenCount, true);
  }

  /**
   * Finds phrase matches within a range of tokens without asking for all matches
   * and without restricting the phrases.
   *
   * @param tokens tokens to search over
   * @param tokenStart index of the first token of the range
   * @param tokenEnd index just past the last token of the range
   * @param needNormalization whether the tokens still have to be normalized
   * @return List of matched spans
   */
  public List findMatches(WordList tokens,
                                       int tokenStart, int tokenEnd,
                                       boolean needNormalization)
  {
    final boolean findAll = false;
    final boolean matchEnd = false;  // a match does not have to end exactly at tokenEnd
    return findMatches(null, tokens, tokenStart, tokenEnd, needNormalization, findAll, matchEnd);
  }

  /**
   * Finds phrase matches within a token range of a piece of text.
   * The text is split and normalized first, and the range refers to the normalized words.
   *
   * @param text Input text to search over
   * @param tokenStart index of the first token of the range
   * @param tokenEnd index just past the last token of the range
   * @param needNormalization not used: the words are always normalized here, and the
   *                          search is then run without further normalization
   * @return List of matched spans
   */
  public List findMatches(String text,
                                       int tokenStart, int tokenEnd,
                                       boolean needNormalization)
  {
    WordList normalizedTokens = toNormalizedWordList(text);
    return findMatches(normalizedTokens, tokenStart, tokenEnd, false);
  }

  /**
   * Checks whether the words of a phrase line up with the tokens beginning at
   * {@code tokenStart}, comparing only from {@code checkStart} on (earlier positions are
   * taken to have been matched already).
   *
   * @param phrase phrase whose words are compared
   * @param tokens tokens to compare against
   * @param tokenStart token index that corresponds to the first word of the phrase
   * @param tokenEnd index just past the last token that may be used
   * @param checkStart token index at which the comparison starts
   * @param matchEnd if true, the phrase must end exactly at {@code tokenEnd}
   * @return the token index just past the matched phrase, or -1 if there is no match
   */
  protected int checkWordListMatch(Phrase phrase, WordList tokens,
                                   int tokenStart, int tokenEnd,
                                   int checkStart,
                                   boolean matchEnd)
  {
    if (checkStart < tokenStart) {
      return -1;
    }
    final int phraseLength = phrase.wordList.size();
    int pos = checkStart;
    while (pos < tokenEnd && pos - tokenStart < phraseLength) {
      String tokenWord = tokens.getWord(pos);
      String expectedWord = phrase.wordList.getWord(pos - tokenStart);
      if (!expectedWord.equals(tokenWord)) {
        return -1;
      }
      pos++;
    }
    if (pos - tokenStart != phraseLength) {
      // ran out of tokens before the phrase was complete (or started past its end)
      return -1;
    }
    // All tokens in phrase have been matched!
    if (matchEnd && pos != tokenEnd) {
      return -1;
    }
    return pos;
  }

  /**
   * Reduces a list of phrase matches to matches that do not overlap, as chosen by
   * {@code IntervalTree.getNonOverlapping} with the length/endpoints comparator.
   * A list with fewer than two matches is returned as it is.
   *
   * @param phraseMatches matches to filter
   * @return the non-overlapping matches
   */
  public List findNonOverlappingPhrases(List phraseMatches)
  {
    if (phraseMatches.size() <= 1) {
      return phraseMatches;
    }
    return IntervalTree.getNonOverlapping(phraseMatches, PHRASEMATCH_LENGTH_ENDPOINTS_COMPARATOR);
  }

  /**
   * Finds phrase matches within a range of tokens, optionally normalizing the tokens first.
   *
   * When normalization is requested, every token of the range is normalized and tokens whose
   * normalized form is empty are dropped; matching then runs over the remaining words, and
   * the positions of the resulting matches are mapped back to indices into {@code tokens}.
   * Otherwise the tokens are matched as they are.
   *
   * The list element types are declared so that the loop over the matches compiles, and an
   * empty token range is accepted (it simply produces no span covering any token).
   *
   * @param acceptablePhrases phrases to look for, or null to accept every phrase
   * @param tokens tokens to search over
   * @param tokenStart index of the first token of the range
   * @param tokenEnd index just past the last token of the range
   * @param needNormalization whether the tokens still have to be normalized
   * @param findAll whether to restart the search at later token positions
   * @param matchEnd whether a match has to end exactly at the end of the range
   * @return List of matched spans
   */
  protected List<PhraseMatch> findMatches(Collection<Phrase> acceptablePhrases,
                                          WordList tokens, int tokenStart, int tokenEnd,
                                          boolean needNormalization, boolean findAll, boolean matchEnd)
  {
    if (needNormalization) {
      assert(tokenStart >= 0);
      assert(tokenEnd >= tokenStart);
      int n = tokenEnd - tokenStart;
      List<String> normalized = new ArrayList<>(n);
      // tokenIndexMap[k] = index in tokens of the k-th kept (non-empty) normalized word;
      // one extra slot holds the position just past the last kept token
      int[] tokenIndexMap = new int[n+1];
      int j = 0, last = 0;
      for (int i = tokenStart; i < tokenEnd; i++) {
        String word = tokens.getWord(i);
        word = getNormalizedForm(word);
        if (word.length() != 0) {
          normalized.add(word);
          tokenIndexMap[j] = i;
          last = i;
          j++;
        }
      }
      tokenIndexMap[j] = Math.min(last+1, tokenEnd);
      List<PhraseMatch> matched = findMatchesNormalized(acceptablePhrases, new StringList(normalized),
              0, normalized.size(), findAll, matchEnd);
      // translate match positions from normalized-word indices back to original token indices
      for (PhraseMatch pm:matched) {
        assert(pm.tokenBegin >= 0);
        assert(pm.tokenEnd >= pm.tokenBegin);
        assert(pm.tokenEnd <= normalized.size());
        if (pm.tokenEnd > 0 && pm.tokenEnd > pm.tokenBegin) {
          // end just past the last matched token, so dropped tokens after it are not included
          pm.tokenEnd = tokenIndexMap[pm.tokenEnd-1]+1;
        } else {
          pm.tokenEnd = tokenIndexMap[pm.tokenEnd];
        }
        pm.tokenBegin = tokenIndexMap[pm.tokenBegin];
        assert(pm.tokenBegin >= 0);
        assert(pm.tokenEnd >= pm.tokenBegin);
      }
      return matched;
    } else {
      return findMatchesNormalized(acceptablePhrases, tokens, tokenStart, tokenEnd, findAll, matchEnd);
    }
  }

  /**
   * Finds phrase matches within a range of already-normalized tokens by walking the lookup tree.
   *
   * The work is driven by a stack of entries, each holding a (sub)tree, the token position
   * where the candidate match starts, the next token to look up, the end of the range and
   * the position at which to restart the search (negative for no restart).
   * With {@code findAll}, the search is restarted from the root at each later token position.
   *
   * An empty table (no tree yet, or cleared) yields an empty result instead of a
   * NullPointerException. The list and stack element types are declared so that the typed
   * {@code pop()} compiles.
   *
   * @param acceptablePhrases phrases to look for, or null to accept every phrase
   * @param tokens normalized tokens to search over
   * @param tokenStart index of the first token of the range
   * @param tokenEnd index just past the last token of the range
   * @param findAll whether to restart the search at later token positions
   * @param matchEnd whether a match has to end exactly at the end of the range
   * @return List of matched spans
   * @throws RuntimeException if a node of an unexpected type is encountered
   */
  protected List<PhraseMatch> findMatchesNormalized(Collection<Phrase> acceptablePhrases,
                                                    WordList tokens, int tokenStart, int tokenEnd,
                                                    boolean findAll, boolean matchEnd)
  {
    List<PhraseMatch> matched = new ArrayList<>();
    if (rootTree == null) {
      // nothing has been added (or the table was cleared): no phrase can match
      return matched;
    }
    Stack<StackEntry> todoStack = new Stack<>();
    todoStack.push(new StackEntry(rootTree, tokenStart, tokenStart, tokenEnd, findAll? tokenStart+1:-1));
    while (!todoStack.isEmpty()) {
      StackEntry cur = todoStack.pop();
      Map tree = cur.tree;
      for (int i = cur.tokenNext; i <= cur.tokenEnd; i++) {
        // a phrase may end at this level of the tree
        if (tree.containsKey(PHRASE_END)) {
          Phrase phrase = (Phrase) tree.get(PHRASE_END);
          if (acceptablePhrases == null || acceptablePhrases.contains(phrase)) {
            int matchedTokenEnd = checkWordListMatch(
              phrase, tokens, cur.tokenStart, cur.tokenEnd, i, matchEnd);
            if (matchedTokenEnd >= 0) {
              matched.add(new PhraseMatch(phrase, cur.tokenStart, matchedTokenEnd));
            }
          }
        }
        // i == tokenEnd is visited only for the end-of-phrase check above
        if (i == cur.tokenEnd) break;
        String word = tokens.getWord(i);
        Object node = tree.get(word);
        if (node == null) {
          break;
        } else if (node instanceof Phrase) {
          // check rest of the phrase matches
          Phrase phrase = (Phrase) node;
          if (acceptablePhrases == null || acceptablePhrases.contains(phrase)) {
            int matchedTokenEnd = checkWordListMatch(
              phrase, tokens, cur.tokenStart, cur.tokenEnd, i+1, matchEnd);
            if (matchedTokenEnd >= 0) {
              matched.add(new PhraseMatch(phrase, cur.tokenStart, matchedTokenEnd));
            }
          }
          break;
        } else if (node instanceof Map) {
          // descend one level; the next iteration looks up the next token there
          tree = (Map) node;
        } else if (node instanceof List) {
          // Search through list for matches to word (at this point, the table is small, so no Map)
          List lookupList = (List) node;
          for (Object obj:lookupList) {
            if (obj instanceof Phrase) {
              // check rest of the phrase matches
              Phrase phrase = (Phrase) obj;
              if (acceptablePhrases == null || acceptablePhrases.contains(phrase)) {
                int matchedTokenEnd = checkWordListMatch(
                  phrase, tokens, cur.tokenStart, cur.tokenEnd, i+1, matchEnd);
                if (matchedTokenEnd >= 0) {
                  matched.add(new PhraseMatch(phrase, cur.tokenStart, matchedTokenEnd));
                }
              }
            } else if (obj instanceof Map) {
              // continue this candidate in the nested map later, with no restart of its own
              todoStack.push(new StackEntry((Map) obj, cur.tokenStart, i+1, cur.tokenEnd, -1));
            } else  {
              throw new RuntimeException("Unexpected class in list " + obj.getClass() + " while looking up " + word);
            }
          }
          break;
        } else {
          throw new RuntimeException("Unexpected class " + node.getClass() + " while looking up " + word);
        }
      }
      if (cur.continueAt >= 0) {
        // restart the search from the same tree at the next start position
        int newStart = (cur.continueAt > cur.tokenStart)? cur.continueAt: cur.tokenStart+1;
        if (newStart < cur.tokenEnd) {
          todoStack.push(new StackEntry(cur.tree, newStart, newStart, cur.tokenEnd, newStart+1));
        }
      }
    }
    return matched;
  }

  /**
   * Returns an iterator over this table, backed by a new PhraseTableIterator.
   *
   * @return iterator over the table
   */
  public Iterator iterator() {
    PhraseTableIterator tableIterator = new PhraseTableIterator(this);
    return tableIterator;
  }

  private static class PhraseTableIterator extends AbstractIterator {
    private PhraseTable phraseTable;
    private Stack> iteratorStack = new Stack<>();
    private Phrase next = null;

    public PhraseTableIterator(PhraseTable phraseTable) {
      this.phraseTable = phraseTable;
      this.iteratorStack.push(this.phraseTable.rootTree.values().iterator());
      this.next = getNext();
    }

    private Phrase getNext() {
      while (!iteratorStack.isEmpty()) {
        Iterator