All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.trees.CollocationFinder Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.trees; 
import edu.stanford.nlp.util.logging.Redwood;

import static java.lang.System.err;

import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.util.ArrayUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;

/**
 * Finds WordNet collocations in parse trees.  It can restructure
 * collocations as single words, where the original words are joined by
 * underscores.  You can test performance by using the "collocations" option
 * to the TreePrint class.
 *
 * @author Chris Cox
 * @author Eric Yeh
 */

public class CollocationFinder  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(CollocationFinder.class);

  private static boolean DEBUG = false;
  private final Tree qTree;
  private final HeadFinder hf;
  private final List collocationCollector;
  private final WordNetConnection wnConnect;

  /**
   * Construct a new {@code CollocationFinder} over the {@code Tree} t.
   * The default {@link HeadFinder} is a {@link CollinsHeadFinder}.
   * @param t parse tree
   * @param w wordnet connection
   */
  public CollocationFinder(Tree t, WordNetConnection w) {
    this(t, w, new CollinsHeadFinder());
  }


  /**
   * Construct a new {@code CollocationFinder} over the {@code Tree} t.
   * @param t parse tree
   * @param w wordnet connection
   * @param hf {@link HeadFinder} to use
   */
  public CollocationFinder(Tree t, WordNetConnection w, HeadFinder hf) {
    CoordinationTransformer transformer = new CoordinationTransformer(hf);
    this.wnConnect = w;
    this.qTree = transformer.transformTree(t);
    this.collocationCollector = Generics.newArrayList();
    this.hf = hf;
    this.getCollocationsList();
    if (DEBUG) {
      log.info("Collected collocations: " + collocationCollector);
    }
  }

  /**
   * Returns the "collocations included" parse tree.
   *
   * @return the mangled tree which applies collocations found in this object.
   */
  public Tree getMangledTree() {
    return getMangledTree(qTree);
  }

  private Tree getMangledTree(Tree t) {
    Collocation matchingColl = null;
    for (Tree child : t.children()) {
      child = getMangledTree(child);
    }
    //boolean additionalCollocationsExist = false;
    for (Collocation c : collocationCollector) {
      // if there are multiple collocations with the same parent node,
      // this will take the longer one
      if (t.equals(c.parentNode)) {
        if (matchingColl == null ||
            (c.span.first() <= matchingColl.span.first() &&
                c.span.second() >= matchingColl.span.second())) {
          matchingColl = c;
          if (DEBUG) {
            err.println("Found matching collocation for tree:");
            t.pennPrint();
            err.print("  head label: " + c.headLabel);
            err.println("; collocation string: " + c.collocationString);
            err.println("  Constituents: "+ c.indicesOfConstituentChildren);
          }
        }
      }
    }

    if (matchingColl == null) {
      return t;
    } else {
      if (DEBUG) {
        err.println("Collapsing " + matchingColl);
      }
      Tree[] allChildren = t.children();
      // get the earliest child in the collocation and store it as first child.
      // delete the rest.
      StringBuilder mutatedString = new StringBuilder(160);
      for (int i : matchingColl.indicesOfConstituentChildren) {
        String strToAppend = mergeLeavesIntoCollocatedString(allChildren[i]);
        mutatedString.append(strToAppend);
        mutatedString.append("_");
      }
      mutatedString = mutatedString.deleteCharAt(mutatedString.length() - 1);

      // Starting with the latest constituent, delete all the "pruned" children
      if (DEBUG) { err.println("allChildren is: " + Arrays.toString(allChildren)); }
      for (int index = matchingColl.indicesOfConstituentChildren.size() - 1; index > 0; index--) {
        int thisConstituent = matchingColl.indicesOfConstituentChildren.get(index);
        allChildren = (Tree[]) ArrayUtils.removeAt(allChildren, thisConstituent);
        if (DEBUG) { err.println(" deleted " + thisConstituent + "; allChildren is: " + Arrays.toString(allChildren)); }
      }
      //name for the leaf string of our new collocation
      String newNodeString = mutatedString.toString();

      int firstChildIndex = matchingColl.indicesOfConstituentChildren.get(0);
      //now we mutate the earliest constituent
      Tree newCollocationChild = allChildren[firstChildIndex];
      if (DEBUG) err.println("Manipulating: " + newCollocationChild);
      newCollocationChild.setValue(matchingColl.headLabel.value());
      Tree newCollocationLeaf = newCollocationChild.treeFactory().newLeaf(newNodeString);
      newCollocationChild.setChildren(Collections.singletonList(newCollocationLeaf));
      if (DEBUG) err.println("  changed to: " + newCollocationChild);

      allChildren[firstChildIndex] = newCollocationChild;
      t.setChildren(allChildren);

      if (DEBUG) {
        err.println("Restructured tree is:");
        t.pennPrint();
        err.println();
      }
      return t;
    }
  }

  /**
   * Traverses the parse tree to find WordNet collocations.
   */
  private void getCollocationsList() {
    getCollocationsList(qTree);
  }

  /**
   * Prints the collocations found in this Tree as strings.
   * Each is followed by its boundary constituent indices in the original tree.
   * 
Example: throw_up (2,3) *
came_up_with (7,9) */ public void printCollocationStrings(PrintWriter pw){ //ArrayList strs = new ArrayList(); for (Collocation c: collocationCollector) { String cs = c.collocationString; pw.println(cs+" ("+(c.span.first()+1)+","+(c.span.second()+1)+")"); } } /** * This method does the work of traversing the tree and writing collocations * to the CollocationCollector (an internal data structure). * * @param t Tree to get collocations from. */ private void getCollocationsList(Tree t) { int leftMostLeaf = Trees.leftEdge(t,qTree); if (t.isPreTerminal()) return; List children = t.getChildrenAsList(); if (children.isEmpty()) return; //TODO: fix determineHead // - in phrases like "World Trade Organization 's" the head of the parent NP is "POS". // - this is problematic for the collocationFinder which assigns this head // as the POS for the collocation "World_Trade_Organization"! Label headLabel = hf.determineHead(t).label(); int leftSistersBuffer = 0; //measures the length of sisters in words when reading for (int i = 0; i < children.size();i++){ ArrayList childConstituents = new ArrayList<>(); childConstituents.add(i); Tree subtree = children.get(i); int currWindowLength = 0; //measures the length in words of the current collocation. getCollocationsList(subtree); //recursive call to get colls in subtrees. StringBuilder testString = new StringBuilder(160); testString.append(treeAsStemmedCollocation(subtree)); testString.append('_'); Integer thisSubtreeLength = subtree.yield().size(); currWindowLength+=thisSubtreeLength; StringBuilder testStringNonStemmed = new StringBuilder(160); testStringNonStemmed.append(treeAsNonStemmedCollocation(subtree)); testStringNonStemmed.append('_'); //for each subtree i, we iteratively append word yields of succeeding sister //subtrees j and check their wordnet entries. if they exist we write them to //the global collocationCollector pair by the indices of the leftmost and //rightmost words in the collocation. for (int j = i+1; j < children.size(); j++) { Tree sisterNode = children.get(j); childConstituents.add(j); testString.append(treeAsStemmedCollocation(sisterNode)); testStringNonStemmed.append(treeAsNonStemmedCollocation(sisterNode)); currWindowLength+=sisterNode.yield().size(); if (DEBUG) { // err.println("Testing string w/ reported indices:" + testString.toString() // + " (" +(leftMostLeaf+leftSistersBuffer)+","+(leftMostLeaf+leftSistersBuffer+currWindowLength-1)+")"); } //ignore collocations beginning with "the" or "a" if (StringUtils.lookingAt(testString.toString(), "(?:[Tt]he|THE|[Aa][Nn]?)[ _]")) { if (false) { err.println("CollocationFinder: Not collapsing the/a word: " + testString); } } else if (wordNetContains(testString.toString())) { Pair c = new Pair<>(leftMostLeaf + leftSistersBuffer, leftMostLeaf + leftSistersBuffer + currWindowLength - 1); ArrayList childConstituentsClone = new ArrayList<>(childConstituents); Collocation col = new Collocation(c,t,childConstituentsClone,testString.toString(),headLabel); collocationCollector.add(col); if (DEBUG) { err.println("Found collocation in wordnet: "+ testString); err.println(" Span of collocation is: " + c + "; childConstituents is: " + c); } } testString.append('_'); if (StringUtils.lookingAt(testStringNonStemmed.toString(), "(?:[Tt]he|THE|[Aa][Nn]?)[ _]")) { if (false) { err.println("CollocationFinder: Not collapsing the/a word: " + testStringNonStemmed); } } else if (wordNetContains(testStringNonStemmed.toString())) { Pair c = new Pair<>(leftMostLeaf + leftSistersBuffer, leftMostLeaf + leftSistersBuffer + currWindowLength - 1); ArrayList childConstituentsClone = new ArrayList<>(childConstituents); Collocation col = new Collocation(c,t,childConstituentsClone,testStringNonStemmed.toString(),headLabel); collocationCollector.add(col); if (DEBUG) { err.println("Found collocation in wordnet: "+ testStringNonStemmed); err.println(" Span of collocation is: " + c + "; childConstituents is: " + c); } } testStringNonStemmed.append("_"); } leftSistersBuffer+=thisSubtreeLength; } } private static String treeAsStemmedCollocation(Tree t) { List list= getStemmedWordTagsFromTree(t); // err.println(list.size()); StringBuilder s = new StringBuilder(160); WordTag firstWord = list.remove(0); s.append(firstWord.word()); for(WordTag wt : list) { s.append("_"); s.append(wt.word()); } //err.println("Expressing this as:"+s.toString()); return s.toString(); } private static String treeAsNonStemmedCollocation(Tree t) { List list= getNonStemmedWordTagsFromTree(t); StringBuilder s = new StringBuilder(160); WordTag firstWord = list.remove(0); s.append(firstWord.word()); for(WordTag wt : list) { s.append('_'); s.append(wt.word()); } return s.toString(); } private static String mergeLeavesIntoCollocatedString(Tree t) { StringBuilder sb = new StringBuilder(160); ArrayList sent = t.taggedYield(); for (TaggedWord aSent : sent) { sb.append(aSent.word()).append('_'); } return sb.substring(0,sb.length() -1); } private static String mergeLeavesIntoCollocatedString(Tree[] trees) { StringBuilder sb = new StringBuilder(160); for (Tree t: trees) { ArrayList sent = t.taggedYield(); for (TaggedWord aSent : sent) { sb.append(aSent.word()).append('_'); } } return sb.substring(0,sb.length() -1); } /** * * @param t a tree * @return the WordTags corresponding to the leaves of the tree, * stemmed according to their POS tags in the tree. */ private static List getStemmedWordTagsFromTree(Tree t) { List stemmedWordTags = Generics.newArrayList(); ArrayList s = t.taggedYield(); for (TaggedWord w : s) { WordTag wt = Morphology.stemStatic(w.word(), w.tag()); stemmedWordTags.add(wt); } return stemmedWordTags; } private static List getNonStemmedWordTagsFromTree(Tree t) { List wordTags = Generics.newArrayList(); ArrayList s = t.taggedYield(); for (TaggedWord w : s) { WordTag wt = new WordTag(w.word(), w.tag()); wordTags.add(wt); } return wordTags; } /** * Checks to see if WordNet contains the given word in its lexicon. * @param s Token * @return If the given token is in WordNet. */ private boolean wordNetContains(String s) { return wnConnect.wordNetContains(s); } /** * Holds information for one collocation. */ private static class Collocation { final Pair span; final Tree parentNode; final Label headLabel; final List indicesOfConstituentChildren; final String collocationString; private Collocation(Pair span, Tree parentNode, ArrayList indicesOfConstituentChildren, String collocationString, Label headLabel) { this.span=span; this.parentNode = parentNode; this.collocationString=collocationString; this.indicesOfConstituentChildren=indicesOfConstituentChildren; this.headLabel=headLabel; } @Override public String toString() { return collocationString + indicesOfConstituentChildren + "/" + headLabel; } } // end static class Collocation } // end class CollocationFinder




© 2015 - 2024 Weber Informatics LLC | Privacy Policy