Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeNormalizer;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import edu.stanford.nlp.util.Pair;

/**
 * Provides routines for further "decompressing" the expanded trees
 * formed by multiword token splitting.
 *
 * Multiword token expansion leaves constituent words as siblings in a
 * "flat" tree structure. This often represents an incorrect parse of
 * the sentence. For example, the phrase "Ministerio de Finanzas" should
 * not be parsed as a flat structure like
 *
 *     (grup.nom (np00000 Ministerio) (sp000 de) (np00000 Finanzas))
 *
 * but rather as a "deep" structure like
 *
 *     (grup.nom (np00000 Ministerio)
 *               (sp (prep (sp000 de))
 *                   (sn (grup.nom (np00000 Finanzas)))))
 *
 * This class provides methods for detecting common linguistic patterns
 * that should be expanded in this way.
 */
public class MultiWordTreeExpander {

   * Regular expression to match groups inside which we want to expand things
  private static final String CANDIDATE_GROUPS = "(^grup\\.(adv|c[cs]|[iwz]|nom|prep|pron|verb)|\\.inter)";

  private static final String PREPOSITIONS =

  private final TregexPattern parentheticalExpression = TregexPattern.compile(
    "fpa=left > /^grup\\.nom$/ " + "$++ fpt=right");

  private final TsurgeonPattern groupParentheticalExpression
    = Tsurgeon.parseOperation("createSubtree grup.nom.inter4 left right");

   * Yes, some multiword tokens contain multiple clauses..
  private final TregexPattern multipleClauses
    = TregexPattern.compile(
      // Nested nominal group containing period punctuation
      "/^grup\\.nom/ > /^grup\\.nom/ < (fp !$-- fp $- /^[^g]/=right1 $+ __=left2)" +
      // Match boundaries for subtrees created
      " <, __=left1 <` __=right2");

  private final TsurgeonPattern expandMultipleClauses
    = Tsurgeon.parseOperation("[createSubtree grup.nom left1 right1]" +
      "[createSubtree grup.nom left2 right2]");

  private final TregexPattern prepositionalPhrase
    = TregexPattern.compile(// Match candidate preposition
                            "sp000=tag < /(?i)^" + PREPOSITIONS + "$/" +
                            // Headed by a group that was generated from
                            // multi-word token expansion and that we
                            // wish to expand further
                            " > (/" + CANDIDATE_GROUPS + "/ <- __=right)" +
                            // With an NP on the left (-> this is a
                            // prep. phrase) and not preceded by any
                            // other prepositions
                            " $+ /^([adnswz]|p[ipr])/=left !$-- sp000");

  private final TregexPattern leadingPrepositionalPhrase
    = TregexPattern.compile(// Match candidate preposition
                            "sp000=tag < /(?i)^" + PREPOSITIONS + "$/" +
                            // Which is the first child in a group that
                            // was generated from multi-word token
                            // expansion and that we wish to expand
                            // further
                            " >, (/" + CANDIDATE_GROUPS + "/ <- __=right)" +
                            // With an NP on the left (-> this is a
                            // prep. phrase) and not preceded by any
                            // other prepositions
                            " $+ /^([adnswz]|p[ipr])/=left !$-- sp000");

   * First step in expanding prepositional phrases: group NP to right of
   * preposition under a `grup.nom` subtree (specially labeled for now
   * so that we can target it in the next step)
  private final TsurgeonPattern expandPrepositionalPhrase1 =
    Tsurgeon.parseOperation("[createSubtree grup.nom.inter left right]");

   * Matches intermediate prepositional phrase structures as produced by
   * the first step of expansion.
  private final TregexPattern intermediatePrepositionalPhrase
    = TregexPattern.compile("sp000=preptag $+ /^grup\\.nom\\.inter$/=gn");

   * Second step: replace intermediate prepositional phrase structure
   * with final result.
  private final TsurgeonPattern expandPrepositionalPhrase2 =
    Tsurgeon.parseOperation("[adjoinF (sp (prep T=preptarget) (sn foot@)) gn]" +
                            "[relabel gn /.inter$//]" +
                            "[replace preptarget preptag]" +
                            "[delete preptag]");

  private final TregexPattern prepositionalVP =
    TregexPattern.compile("sp000=tag < /(?i)^(para|al?|del?)$/" +
                          " > (/" + CANDIDATE_GROUPS + "/ <- __=right)" +
                          " $+ vmn0000=left !$-- sp000");

  private final TsurgeonPattern expandPrepositionalVP1 =
    Tsurgeon.parseOperation("[createSubtree S.inter left right]" +
                            "[adjoinF (infinitiu foot@) left]");

  private final TregexPattern intermediatePrepositionalVP =
    TregexPattern.compile("sp000=preptag $+ /^S\\.inter$/=si");

  private final TsurgeonPattern expandPrepositionalVP2 =
    Tsurgeon.parseOperation("[adjoin (sp prep=target S@) si] [move preptag >0 target]");

  private final TregexPattern conjunctPhrase =
    TregexPattern.compile("cc=cc" +
                          // In one of our expanded phrases (match
                          // bounds of this expanded phrase; these form
                          // the left edge of first new subtree and the
                          // right edge of the second new subtree)
                          " > (/^grup\\.nom/ <, __=left1 <` __=right2)" +
                          // Fetch more bounds: node to immediate left
                          // of cc is the right edge of the first new
                          // subtree, and node to right of cc is the
                          // left edge of the second new subtree
                          // NB: left1 may the same as right1; likewise
                          // for the second tree
                          " $- /^[^g]/=right1 $+ /^[^g]/=left2");

  private final TsurgeonPattern expandConjunctPhrase =
    Tsurgeon.parseOperation("[adjoinF (conj foot@) cc]" +
                            "[createSubtree grup.nom.inter2 left1 right1]" +
                            "[createSubtree grup.nom.inter2 left2 right2]");

   * Simple intermediate conjunct: a constituent which heads a single
   * substantive
  private final TregexPattern intermediateSubstantiveConjunct =
    TregexPattern.compile("/grup\\.nom\\.inter2/=target <: /^[dnpw]/");

   * Rename simple intermediate conjunct as a `grup.nom`
  private final TsurgeonPattern expandIntermediateSubstantiveConjunct =
    Tsurgeon.parseOperation("[relabel target /grup.nom/]");

   * Simple intermediate conjunct: a constituent which heads a single
   * adjective
  private final TregexPattern intermediateAdjectiveConjunct =
    TregexPattern.compile("/^grup\\.nom\\.inter2$/=target <: /^a/");

   * Rename simple intermediate adjective conjunct as a `grup.a`
  private final TsurgeonPattern expandIntermediateAdjectiveConjunct =
    Tsurgeon.parseOperation("[relabel target /grup.a/]");

   * Match parts of an expanded conjunct which must be labeled as a noun
   * phrase given their children.
  private final TregexPattern intermediateNounPhraseConjunct =
    TregexPattern.compile("/^grup\\.nom\\.inter2$/=target < /^s[pn]$/");

  private final TsurgeonPattern expandIntermediateNounPhraseConjunct =
    Tsurgeon.parseOperation("[relabel target sn]");

   * Intermediate conjunct: verb
  private final TregexPattern intermediateVerbConjunct =
    TregexPattern.compile("/^grup\\.nom\\.inter2$/=gn <: /^vmi/");

  private final TsurgeonPattern expandIntermediateVerbConjunct =
    Tsurgeon.parseOperation("[adjoin (S (grup.verb@)) gn]");

   * Match parts of an expanded conjunct which should be labeled as
   * nominal groups.
  private final TregexPattern intermediateNominalGroupConjunct =
    TregexPattern.compile("/^grup\\.nom\\.inter2$/=target !< /^[^n]/");

  private final TsurgeonPattern expandIntermediateNominalGroupConjunct =
    Tsurgeon.parseOperation("[relabel target /grup.nom/]");

   * Match articles contained within nominal groups of substantives so
   * that they can be moved out
  private final TregexPattern articleLeadingNominalGroup =
    TregexPattern.compile("/^d[aip]/=art >, (/^grup\\.nom$/=ng > sn)");

  private final TsurgeonPattern expandArticleLeadingNominalGroup =
    Tsurgeon.parseOperation("[insert (spec=target) $+ ng] [move art >0 target]");

  private final TregexPattern articleInsideOrphanedNominalGroup =
    TregexPattern.compile("/^d[aip]/=d >, (/^grup\\.nom/=ng !> sn)");

  private final TsurgeonPattern expandArticleInsideOrphanedNominalGroup =
    Tsurgeon.parseOperation("[adjoinF (sn=sn spec=spec foot@) ng] [move d >0 spec]");

  private final TregexPattern determinerInsideNominalGroup =
    TregexPattern.compile("/^d[^n]/=det >, (/^grup\\.nom/=ng > sn) $ __");

  private final TsurgeonPattern expandDeterminerInsideNominalGroup =
    Tsurgeon.parseOperation("[insert (spec=target) $+ ng] [move det >0 target]");

  // "en opinion del X," "además del Y"
  private final TregexPattern contractionTrailingIdiomBeforeNominalGroup
    = TregexPattern.compile("sp000 >` (/^grup\\.prep$/ > (__ $+ /^grup\\.nom/=ng)) < /^(de|a)l$/=contraction");

  // -> "(en opinion de) (el X)," "(además de) (el Y)"
  private final TsurgeonPattern joinArticleWithNominalGroup
    = Tsurgeon.parseOperation("[relabel contraction /l//] [adjoinF (sn (spec (da0000 el)) foot@) ng]");

  private final TregexPattern contractionInSpecifier
    = TregexPattern.compile("sp000=parent < /(?i)^(a|de)l$/=contraction > spec");

  private final TregexPattern delTodo = TregexPattern.compile("del=contraction . todo > sp000=parent");

  // "del X al Y"
  private final TregexPattern contractionInRangePhrase
    = TregexPattern.compile("sp000 < /(?i)^(a|de)l$/=contraction >: (conj $+ (/^grup\\.(w|nom)/=group))");

  private final TsurgeonPattern expandContractionInRangePhrase
    = Tsurgeon.parseOperation("[relabel contraction /(?i)l//] [adjoinF (sn (spec (da0000 el)) foot@) group]");

   * Operation to extract article from contraction and just put it next to the container
  private final TsurgeonPattern extendContraction
    = Tsurgeon.parseOperation("[relabel contraction /l//] [insert (da0000 el) $- parent]");

  // ---------

  // Final cleanup operations

  private final TregexPattern terminalPrepositions
    = TregexPattern.compile("sp000=sp < /" + PREPOSITIONS + "/ >- (/^grup\\.nom/ >+(/^grup\\.nom/) sn=sn >>- =sn)");

  private final TsurgeonPattern extractTerminalPrepositions = Tsurgeon.parseOperation(
    "[insert (prep=prep) $- sn] [move sp >0 prep]");

   * Match terminal prepositions in prepositional phrases: "a lo largo de"
  private final TregexPattern terminalPrepositions2
    = TregexPattern.compile("prep=prep >` (/^grup\\.nom$/ >: (sn=sn > /^(grup\\.prep|sp)$/))");

  private final TsurgeonPattern extractTerminalPrepositions2
    = Tsurgeon.parseOperation("move prep $- sn");

   * Match terminal prepositions in infinitive clause within prepositional phrase: "a partir de," etc.
  private final TregexPattern terminalPrepositions3
    = TregexPattern.compile("sp000=sp $- infinitiu >` (S=S >` /^(grup\\.prep|sp)$/)");

  private final TsurgeonPattern extractTerminalPrepositions3
    = Tsurgeon.parseOperation("[insert (prep=prep) $- S] [move sp >0 prep]");

  private final TregexPattern adverbNominalGroups = TregexPattern.compile("/^grup\\.nom./=ng <: /^r[gn]/=r");
  private final TsurgeonPattern replaceAdverbNominalGroup = Tsurgeon.parseOperation("replace ng r");

   * Match blocks of only adjectives (one or more) with a nominal group parent. These constituents should be rewritten
   * beneath an adjectival group constituent.
  private final TregexPattern adjectiveSpanInNominalGroup
    = TregexPattern.compile("/^grup\\.nom/=ng <, aq0000=left <` aq0000=right !< /^[^a]/");

   * Match dependent clauses mistakenly held under nominal groups ("lo que X")
  private final TregexPattern clauseInNominalGroup
    = TregexPattern.compile("lo . (que > (pr000000=pr >, /^grup\\.nom/=ng $+ (/^v/=vb >` =ng)))");

  private final TsurgeonPattern labelClause
    = Tsurgeon.parseOperation("[relabel ng S] [adjoinF (relatiu foot@) pr] [adjoinF (grup.verb foot@) vb]");

   * Infinitive clause mistakenly held under nominal group
  private final TregexPattern clauseInNominalGroup2 = TregexPattern.compile("/^grup\\.nom/=gn $- spec <: /^vmn/");
  private final TsurgeonPattern labelClause2 = Tsurgeon.parseOperation("[adjoin (S (infinitiu@)) gn]");

  private final TregexPattern clauseInNominalGroup3 = TregexPattern.compile("sn=sn <, (/^vmn/=inf $+ (sp >` =sn))");
  private final TsurgeonPattern labelClause3
    = Tsurgeon.parseOperation("[relabel sn S] [adjoinF (infinitiu foot@) inf]");

  private final TregexPattern loneAdjectiveInNominalGroup
    = TregexPattern.compile("/^a/=a > /^grup\\.nom/ $ /^([snwz]|p[ipr])/ !$ /^a/");
  private final TsurgeonPattern labelAdjective = Tsurgeon.parseOperation("[adjoinF (s.a (grup.a foot@)) a]");

  private final TsurgeonPattern groupAdjectives = Tsurgeon.parseOperation("createSubtree (s.a grup.a@) left right");

   * Some brute-force fixes:
  private final TregexPattern alMenos
    = TregexPattern.compile("/(?i)^al$/ . /(?i)^menos$/ > (sp000 $+ rg > /^grup\\.adv$/=ga)");
  private final TsurgeonPattern fixAlMenos
    = Tsurgeon.parseOperation("replace ga (grup.adv (sp (prep (sp000 a)) (sn (spec (da0000 lo)) (grup.nom (s.a (grup.a (aq0000 menos)))))))");
  private final TregexPattern todoLoContrario
    = TregexPattern.compile("(__=ttodo < /(?i)^todo$/) $+ (__=tlo < /(?i)^lo$/ $+ (__=tcon < /(?i)^contrario$/))");
  private final TsurgeonPattern fixTodoLoContrario
    = Tsurgeon.parseOperation("[adjoin (sn (grup.nom (pp000000@))) tlo] [adjoin (grup.a (aq0000@)) tcon]");

   * Mark infinitives within verb groups ("hacer ver", etc.)
  private final TregexPattern infinitiveInVerbGroup
    = TregexPattern.compile("/^grup\\.verb$/=grup < (/^v/ !$-- /^v/ $++ (/^vmn/=target !$++ /^vmn/))");
  private final TsurgeonPattern markInfinitive = Tsurgeon.parseOperation("[adjoinF (infinitiu foot@) target]");

   * The corpus marks entire multiword verb tokens like "teniendo en
   * cuenta" as gerunds / infinitives (by heading them with a
   * constituent "gerundi" / "infinitiu"). Now that we've split into
   * separate words, transfer this gerund designation so that it heads
   * the verb only.
  private final TregexPattern floppedGerund
    = TregexPattern.compile("/^grup\\.verb$/=grup >: gerundi=ger < (/^vmg/=vb !$ /^vmg/)");
  private final TsurgeonPattern unflopFloppedGerund
    = Tsurgeon.parseOperation("[adjoinF (gerundi foot@) vb] [replace ger grup]");
  private final TregexPattern floppedInfinitive
    = TregexPattern.compile("/^grup\\.verb$/=grup >: infinitiu=inf < (/^vmn/=vb !$ /^vmn/)");
  private final TsurgeonPattern unflopFloppedInfinitive
    = Tsurgeon.parseOperation("[adjoinF (infinitiu foot@) vb] [replace inf grup]");

   * Match `sn` constituents which can (should) be rewritten as nominal groups
  private final TregexPattern nominalGroupSubstantives =
    TregexPattern.compile("sn=target < /^[adnwz]/ !< /^([^adnswz]|neg)/");

  private final TregexPattern leftoverIntermediates =

  private final TsurgeonPattern makeNominalGroup =
    Tsurgeon.parseOperation("[relabel target /grup.nom/]");

  private final TregexPattern redundantNominalRewrite =
    TregexPattern.compile("/^grup\\.nom$/ <: sn=child >: sn=parent");

  private final TsurgeonPattern fixRedundantNominalRewrite =
    Tsurgeon.parseOperation("[replace parent child]");

  private final TregexPattern redundantPrepositionGroupRewrite =
    TregexPattern.compile("/^grup\\.prep$/=parent <: sp=child >: prep");

  private final TsurgeonPattern fixRedundantPrepositionGroupRewrite =
    Tsurgeon.parseOperation("[relabel child /grup.prep/] [replace parent child]");

  private final TregexPattern redundantPrepositionGroupRewrite2 = TregexPattern.compile("/^grup\\.prep$/=gp <: sp=sp");
  private final TsurgeonPattern fixRedundantPrepositionGroupRewrite2 = Tsurgeon.parseOperation("replace gp sp");

   * Patterns in this list turn flat structures into intermediate forms
   * which will eventually become deep phrase structures.
  private final List> firstStepExpansions = Arrays.asList(
    // Should be first-ish
          new Pair<>(parentheticalExpression, groupParentheticalExpression),
          new Pair<>(multipleClauses, expandMultipleClauses),

          new Pair<>(leadingPrepositionalPhrase,
          new Pair<>(conjunctPhrase, expandConjunctPhrase),
          new Pair<>(prepositionalPhrase, expandPrepositionalPhrase1),
          new Pair<>(prepositionalVP, expandPrepositionalVP1),

          new Pair<>(contractionTrailingIdiomBeforeNominalGroup,
          new Pair<>(contractionInSpecifier, extendContraction),
          new Pair<>(delTodo, extendContraction),
          new Pair<>(contractionInRangePhrase,

    // Should not happen until the last moment! The function words
    // being targeted have weaker "scope" than others earlier
    // targeted, and so we don't want to clump things around them
    // until we know we have the right to clump
          new Pair<>(articleLeadingNominalGroup,
          new Pair<>(articleInsideOrphanedNominalGroup,
          new Pair<>(determinerInsideNominalGroup,

   * Patterns in this list clean up "intermediate" phrase structures
   * produced by previous step and produce something from them that
   * looks like the rest of the corpus.
  private final List> intermediateExpansions = Arrays.asList(
          new Pair<>(intermediatePrepositionalPhrase,
          new Pair<>(intermediatePrepositionalVP, expandPrepositionalVP2),

          new Pair<>(intermediateSubstantiveConjunct,
          new Pair<>(intermediateAdjectiveConjunct,
          new Pair<>(intermediateNounPhraseConjunct,
          new Pair<>(intermediateVerbConjunct,
          new Pair<>(intermediateNominalGroupConjunct,

   * Patterns in this list perform last-minute cleanup of leftover
   * grammar mistakes which this class created.
  private final List> finalCleanup = Arrays.asList(
          new Pair<>(terminalPrepositions, extractTerminalPrepositions),
          new Pair<>(terminalPrepositions2, extractTerminalPrepositions2),
          new Pair<>(terminalPrepositions3, extractTerminalPrepositions3),

          new Pair<>(nominalGroupSubstantives, makeNominalGroup),
          new Pair<>(adverbNominalGroups, replaceAdverbNominalGroup),
          new Pair<>(adjectiveSpanInNominalGroup, groupAdjectives),
          new Pair<>(clauseInNominalGroup, labelClause),
          new Pair<>(clauseInNominalGroup2, labelClause2),
          new Pair<>(clauseInNominalGroup3, labelClause3),
          new Pair<>(loneAdjectiveInNominalGroup, labelAdjective),

    // Verb phrase-related cleanup.. order is important!
          new Pair<>(infinitiveInVerbGroup, markInfinitive),
          new Pair<>(floppedGerund, unflopFloppedGerund),
          new Pair<>(floppedInfinitive, unflopFloppedInfinitive),

    // Fixes for specific common phrases
          new Pair<>(alMenos, fixAlMenos),
          new Pair<>(todoLoContrario, fixTodoLoContrario),

    // Lastly..
    // These final fixes are not at all linguistically motivated -- just need to make the trees less dirty
          new Pair<>(redundantNominalRewrite, fixRedundantNominalRewrite),

          new Pair<>(redundantPrepositionGroupRewrite,

          new Pair<>(redundantPrepositionGroupRewrite2,
          new Pair<>(leftoverIntermediates, makeNominalGroup)

   * Recognize candidate patterns for expansion in the given tree and
   * perform the expansions. See the class documentation for more
   * information.
  public Tree expandPhrases(Tree t, TreeNormalizer tn, TreeFactory tf) {
    // Keep running this sequence of patterns until no changes are
    // affected. We need this for nested expressions like "para tratar
    // de regresar al empleo." This first step produces lots of
    // "intermediate" tree structures which need to be cleaned up later.
    Tree oldTree;
    do {
      oldTree = t.deepCopy();
      t = Tsurgeon.processPatternsOnTree(firstStepExpansions, t);
    } while (!t.equals(oldTree));

    // Now clean up intermediate tree structures
    t = Tsurgeon.processPatternsOnTree(intermediateExpansions, t);

    // Normalize first to allow for contraction expansion, etc.
    t = tn.normalizeWholeTree(t, tf);

    // Final cleanup
    t = Tsurgeon.processPatternsOnTree(finalCleanup, t);

    return t;


// incidentes . lamentables (nested articles near middle)
// chiquilla . vistosa (giant multiword at end)
// espejo . deformante (article fun at start)
// menor . coste (watch "Comisión del Mercado" thing at end)
// totalmente . evitables ("en opinion del" at end)

// TODO (corpus)
// epígrafe . Arte (flat!)

// TODO (parser)
// debería .. encima ("por encima de" parse, coordinated NP)
// manía .. catalán ("castellana" parsed as being under a participi constituent)

