// edu.stanford.nlp.process.Morphology Maven / Gradle / Ivy
package edu.stanford.nlp.process;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.function.Function;
import java.util.logging.Level;
import java.util.logging.Logger;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.WordLemmaTag;
import edu.stanford.nlp.ling.WordTag;
/**
* Morphology computes the base form of English words, by removing just
* inflections (not derivational morphology). That is, it only does noun
* plurals, pronoun case, and verb endings, and not things like comparative adjectives
* or derived nominals. It is based on a finite-state
* transducer implemented by John Carroll et al., written in flex and publicly
* available.
* See: http://www.informatics.susx.ac.uk/research/nlp/carroll/morph.html .
* There are several ways of invoking Morphology. One is by calling the static
* methods:
*
* - WordTag stemStatic(String word, String tag)
* - WordTag stemStatic(WordTag wordTag)
*
* If we have created a Morphology object already we can use the instance methods
* String lemma(String word, String tag) or WordLemmaTag lemmatize(WordTag wordTag).
*
* Another way of using Morphology is to run it on an input file by running
* {@code java Morphology filename}. In this case, POS tags MUST be
* separated from words by an underscore ("_").
*
* Note that a single instance of Morphology is not thread-safe, as
* the underlying lexer object is not built to be re-entrant. One thing that
* you can do to get around this is build a new Morphology object for
* each thread or each set of calls to the Morphology. For example, the
* MorphaAnnotator builds a Morphology for each document it annotates.
* The other approach is to use the synchronized methods in this class.
* The crucial lexer-accessing portion of all the static methods is synchronized
* (otherwise, their use tended to be threading bugs waiting to happen).
* If you want less synchronization, create your own Morphology objects.
*
* @author Kristina Toutanova ([email protected])
* @author Christopher Manning
*/
public class Morphology implements Function {
private static final Logger LOGGER = Logger.getLogger(Morphology.class.getName());
private static final boolean DEBUG = false;
private static Morpha staticLexer;
private final Morpha lexer;
public Morphology() {
lexer = new Morpha(System.in);
}
/**
* Process morphologically words from a Reader.
*
* @param in The Reader to read from
*/
public Morphology(Reader in) {
lexer = new Morpha(in);
}
public Morphology(Reader in, int flags) {
lexer = new Morpha(in);
lexer.setOptions(flags);
}
public Word next() throws IOException {
String nx = lexer.next();
if (nx == null) {
return null;
} else {
return new Word(nx);
}
}
public Word stem(Word w) {
return new Word(stem(w.value()));
}
public String stem(String word) {
try {
lexer.yyreset(new StringReader(word));
lexer.yybegin(Morpha.any);
String wordRes = lexer.next();
return wordRes;
} catch (IOException e) {
LOGGER.warning("Morphology.stem() had error on word " + word);
return word;
}
}
public String lemma(String word, String tag) {
return lemmatize(word, tag, lexer, lexer.option(1));
}
public String lemma(String word, String tag, boolean lowercase) {
return lemmatize(word, tag, lexer, lowercase);
}
/**
* Adds the LemmaAnnotation to the given CoreLabel.
*/
public void stem(CoreLabel label) {
stem(label, CoreAnnotations.LemmaAnnotation.class);
}
/**
* Adds stem under annotation {@code ann} to the given CoreLabel.
* Assumes that it has a TextAnnotation and PartOfSpeechAnnotation.
*/
public void stem(CoreLabel label,
Class extends CoreAnnotation> ann) {
String lemma = lemmatize(label.word(), label.tag(), lexer, lexer.option(1));
label.set(ann, lemma);
}
/** Lemmatize the word, being sensitive to the tag, using the
* passed in lexer.
*
* @param lowercase If this is true, words other than proper nouns will
* be changed to all lowercase.
*/
private static String lemmatize(String word, String tag, Morpha lexer, boolean lowercase) {
boolean wordHasForbiddenChar = word.indexOf('_') >= 0 || word.indexOf(' ') >= 0 || word.indexOf('\n') >= 0;
String quotedWord = word;
if (wordHasForbiddenChar) {
// choose something unlikely. Classical Vedic!
quotedWord = quotedWord.replaceAll("_", "\u1CF0");
quotedWord = quotedWord.replaceAll(" ", "\u1CF1");
quotedWord = quotedWord.replaceAll("\n", "\u1CF2");
}
String wordtag = quotedWord + '_' + tag;
if (DEBUG) System.err.println("Trying to normalize |" + wordtag + "|");
try {
lexer.setOption(1, lowercase);
lexer.yyreset(new StringReader(wordtag));
lexer.yybegin(Morpha.scan);
String wordRes = lexer.next();
lexer.next(); // go past tag
if (wordHasForbiddenChar) {
if (DEBUG) System.err.println("Restoring forbidden chars");
wordRes = wordRes.replaceAll("\u1CF0", "_");
wordRes = wordRes.replaceAll("\u1CF1", " ");
wordRes = wordRes.replaceAll("\u1CF2", "\n");
}
return wordRes;
} catch (IOException e) {
LOGGER.warning("Morphology.stem() had error on word " + word + "/" + tag);
return word;
}
}
private static synchronized void initStaticLexer() {
if (staticLexer == null) {
staticLexer = new Morpha(System.in);
}
}
/** Return a new WordTag which has the lemma as the value of word().
* The default is to lowercase non-proper-nouns, unless options have
* been set.
*/
public static synchronized WordTag stemStatic(String word, String tag) {
initStaticLexer();
return new WordTag(lemmatize(word, tag, staticLexer, staticLexer.option(1)), tag);
}
public static synchronized String lemmaStatic(String word, String tag,
boolean lowercase) {
initStaticLexer();
return lemmatize(word, tag, staticLexer, lowercase);
}
/** Return a new WordTag which has the lemma as the value of word().
* The default is to lowercase non-proper-nouns, unless options have
* been set.
*/
public static WordTag stemStatic(WordTag wT) {
return stemStatic(wT.word(), wT.tag());
}
@Override
public Object apply(Object in) {
if (in instanceof WordTag) {
WordTag wt = (WordTag) in;
String tag = wt.tag();
return new WordTag(lemmatize(wt.word(), tag, lexer, lexer.option(1)), tag);
}
if (in instanceof Word) {
return stem((Word) in);
}
return in;
}
/**
* Lemmatize returning a WordLemmaTag
.
*/
public WordLemmaTag lemmatize(WordTag wT) {
String tag = wT.tag();
String word = wT.word();
String lemma = lemma(word, tag);
return new WordLemmaTag(word, lemma, tag);
}
public static WordLemmaTag lemmatizeStatic(WordTag wT) {
String tag = wT.tag();
String word = wT.word();
String lemma = stemStatic(wT).word();
return new WordLemmaTag(word, lemma, tag);
}
/** Run the morphological analyzer. Options are:
*
* - -rebuildVerbTable verbTableFile Convert a verb table from a text file
* (e.g., /u/nlp/data/morph/verbstem.list) to Java code contained in Morpha.flex .
*
- -stem args ... Stem each of the following arguments, which should either be
* in the form of just word or word_tag.
*
- args ... Each argument is a file and the contents of it are stemmed as
* space-separated tokens. Note: If the tokens are tagged
* words, they must be in the format of whitespace separated word_tag pairs.
*
*/
public static void main(String[] args) throws IOException {
if (args.length == 0) {
System.err.println("java Morphology [-rebuildVerbTable file|-stem word+|file+]");
} else if (args.length == 2 && args[0].equals("-rebuildVerbTable")) {
String verbs = IOUtils.slurpFile(args[1]);
String[] words = verbs.split("\\s+");
System.out.print(" private static final String[] verbStems = { ");
for (int i = 0; i < words.length; i++) {
System.out.print("\"" + words[i] + "\"");
if (i != words.length - 1) {
System.out.print(", ");
if (i % 5 == 0) {
System.out.println();
System.out.print(" ");
}
}
}
System.out.println(" };");
} else if (args[0].equals("-stem")) {
for (int i = 1; i < args.length; i++) {
System.out.println(args[i] + " --> " + stemStatic(WordTag.valueOf(args[i], "_")));
}
} else {
int flags = 0;
for (String arg : args) {
if (arg.charAt(0) == '-') {
try {
flags = Integer.parseInt(arg.substring(1));
} catch (NumberFormatException nfe) {
System.err.println("Couldn't handle flag: " + arg + "\n");
// ignore flag
}
} else {
Morphology morph = new Morphology(new FileReader(arg), flags);
for (Word next; (next = morph.next()) != null; ) {
System.out.print(next);
}
}
}
}
}
}