edu.stanford.nlp.trees.tregex.TregexPatternCompiler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
// TregexPatternCompiler
// Copyright (c) 2004-2007 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
//
// For more information, bug reports, fixes, contact:
// Christopher Manning
// Dept of Computer Science, Gates 1A
// Stanford CA 94305-9010
// USA
// Support/Questions: [email protected]
// Licensing: [email protected]
// http://www-nlp.stanford.edu/software/tregex.shtml
package edu.stanford.nlp.trees.tregex;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.trees.CollinsHeadFinder;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
/**
* A class for compiling TregexPatterns with specific HeadFinders and or
* basicCategoryFunctions.
*
* @author Galen Andrew
*/
public class TregexPatternCompiler {
static final Function DEFAULT_BASIC_CAT_FUNCTION =
new PennTreebankLanguagePack().getBasicCategoryFunction();
static final HeadFinder DEFAULT_HEAD_FINDER = new CollinsHeadFinder();
private final Function basicCatFunction;
private final HeadFinder headFinder;
private final List> macros =
new ArrayList>();
public static final TregexPatternCompiler defaultCompiler =
new TregexPatternCompiler();
public TregexPatternCompiler() {
this(DEFAULT_HEAD_FINDER, DEFAULT_BASIC_CAT_FUNCTION);
}
/**
* A compiler that uses this basicCatFunction and the default HeadFinder.
*
* @param basicCatFunction the function mapping Strings to Strings
*/
public TregexPatternCompiler(Function basicCatFunction) {
this(DEFAULT_HEAD_FINDER, basicCatFunction);
}
/**
* A compiler that uses this HeadFinder and the default basicCategoryFunction
*
* @param headFinder the HeadFinder
*/
public TregexPatternCompiler(HeadFinder headFinder) {
this(headFinder, DEFAULT_BASIC_CAT_FUNCTION);
}
/**
* A compiler that uses this HeadFinder and this basicCategoryFunction
*
* @param headFinder the HeadFinder
* @param basicCatFunction The function mapping Strings to Strings
*/
public TregexPatternCompiler(HeadFinder headFinder,
Function basicCatFunction) {
this.headFinder = headFinder;
this.basicCatFunction = basicCatFunction;
}
// todo [cdm 2013]: Provide an easy way to do Matcher.quoteReplacement(): This would be quite useful, since the replacement will often contain $ or \
/** Define a macro for rewriting a pattern in any tregex expression compiled
* by this compiler. The semantics of this is that all instances of the
* original in the pattern are replaced by the replacement, using exactly
* the semantics of String.replaceAll(original, replacement) and the
* result will then be compiled by the compiler. As such, note that a
* macro can replace any part of a tregex expression, in a syntax
* insensitive way. Here's an example:
* {@code tpc.addMacro("FINITE_BE_AUX", "/^(?i:am|is|are|was|were)$/");}
*
* @param original The String to match; becomes the first argument of a
* String.replaceAll()
* @param replacement The replacement String; becomes the second argument
* of a String.replaceAll()
*/
public void addMacro(String original, String replacement) {
macros.add(new Pair(original, replacement));
}
/**
* Create a TregexPattern from this tregex string using the headFinder and
* basicCat function this TregexPatternCompiler was created with.
*
* Implementation note: If there is an invalid token in the Tregex
* parser, JavaCC will throw a TokenMgrError. This is a class
* that extends Error, not Exception (OMG! - bad!), and so rather than
* requiring clients to catch it, we wrap it in a ParseException.
* (The original Error's are thrown in TregexParserTokenManager.)
*
* @param tregex The pattern to parse
* @return A new TregexPattern object based on this string
* @throws TregexParseException If the expression is syntactically invalid
*/
public TregexPattern compile(String tregex) {
for (Pair macro : macros) {
tregex = tregex.replaceAll(macro.first(), macro.second());
}
TregexPattern pattern;
try {
TregexParser parser = new TregexParser(new StringReader(tregex + '\n'),
basicCatFunction, headFinder);
pattern = parser.Root();
} catch (TokenMgrError tme) {
throw new TregexParseException("Could not parse " + tregex, tme);
} catch (ParseException e) {
throw new TregexParseException("Could not parse " + tregex, e);
}
pattern.setPatternString(tregex);
return pattern;
}
}