All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.trees.tregex.TregexPatternCompiler Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
// TregexPatternCompiler
// Copyright (c) 2004-2007 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    Support/Questions: parser-user@lists.stanford.edu
//    Licensing: parser-support@lists.stanford.edu
//    http://www-nlp.stanford.edu/software/tregex.shtml

package edu.stanford.nlp.trees.tregex;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import java.util.function.Function;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.trees.CollinsHeadFinder;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;

/**
 * A class for compiling TregexPatterns with specific HeadFinders and or
 * basicCategoryFunctions.
 *
 * @author Galen Andrew
 */
public class TregexPatternCompiler {

  static final Function DEFAULT_BASIC_CAT_FUNCTION =
    new PennTreebankLanguagePack().getBasicCategoryFunction();

  static final HeadFinder DEFAULT_HEAD_FINDER = new CollinsHeadFinder();

  private final Function basicCatFunction;
  private final HeadFinder headFinder;

  private final List> macros =
    new ArrayList>();

  public static final TregexPatternCompiler defaultCompiler =
    new TregexPatternCompiler();

  public TregexPatternCompiler() {
    this(DEFAULT_HEAD_FINDER, DEFAULT_BASIC_CAT_FUNCTION);
  }

  /**
   * A compiler that uses this basicCatFunction and the default HeadFinder.
   *
   * @param basicCatFunction the function mapping Strings to Strings
   */
  public TregexPatternCompiler(Function basicCatFunction) {
    this(DEFAULT_HEAD_FINDER, basicCatFunction);
  }

  /**
   * A compiler that uses this HeadFinder and the default basicCategoryFunction
   *
   * @param headFinder the HeadFinder
   */
  public TregexPatternCompiler(HeadFinder headFinder) {
    this(headFinder, DEFAULT_BASIC_CAT_FUNCTION);
  }

  /**
   * A compiler that uses this HeadFinder and this basicCategoryFunction
   *
   * @param headFinder       the HeadFinder
   * @param basicCatFunction The function mapping Strings to Strings
   */
  public TregexPatternCompiler(HeadFinder headFinder,
                               Function basicCatFunction) {
    this.headFinder = headFinder;
    this.basicCatFunction = basicCatFunction;
  }

  // todo [cdm 2013]: Provide an easy way to do Matcher.quoteReplacement(): This would be quite useful, since the replacement will often contain $ or \

  /** Define a macro for rewriting a pattern in any tregex expression compiled
   *  by this compiler. The semantics of this is that all instances of the
   *  original in the pattern are replaced by the replacement, using exactly
   *  the semantics of String.replaceAll(original, replacement) and the
   *  result will then be compiled by the compiler. As such, note that a
   *  macro can replace any part of a tregex expression, in a syntax
   *  insensitive way.  Here's an example:
   *  {@code tpc.addMacro("FINITE_BE_AUX", "/^(?i:am|is|are|was|were)$/");}
   *
   *  @param original The String to match; becomes the first argument of a
   *                  String.replaceAll()
   *  @param replacement The replacement String; becomes the second argument
   *                  of a String.replaceAll()
   */
  public void addMacro(String original, String replacement) {
    macros.add(new Pair(original, replacement));
  }


  /**
   * Create a TregexPattern from this tregex string using the headFinder and
   * basicCat function this TregexPatternCompiler was created with.
   *
   * Implementation note: If there is an invalid token in the Tregex
   * parser, JavaCC will throw a TokenMgrError.  This is a class
   * that extends Error, not Exception (OMG! - bad!), and so rather than
   * requiring clients to catch it, we wrap it in a ParseException.
   * (The original Error's are thrown in TregexParserTokenManager.)
   *
   * @param tregex The pattern to parse
   * @return A new TregexPattern object based on this string
   * @throws TregexParseException If the expression is syntactically invalid
   */
  public TregexPattern compile(String tregex) {
    for (Pair macro : macros) {
      tregex = tregex.replaceAll(macro.first(), macro.second());
    }
    TregexPattern pattern;
    try {
      TregexParser parser = new TregexParser(new StringReader(tregex + '\n'),
                                             basicCatFunction, headFinder);
      pattern = parser.Root();
    } catch (TokenMgrError tme) {
      throw new TregexParseException("Could not parse " + tregex, tme);
    } catch (ParseException e) {
      throw new TregexParseException("Could not parse " + tregex, e);
    }
    pattern.setPatternString(tregex);
    return pattern;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy