edu.stanford.nlp.international.spanish.SpanishVerbStripper Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
The newest version!
package edu.stanford.nlp.international.spanish;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Provides a utility function for removing attached pronouns from
* Spanish verb forms.
*
* @author Jon Gauthier
* @author Ishita Prasad
*/
public final class SpanishVerbStripper implements Serializable {
/** Redwood logging channel for this class; used to report a dictionary file that could not be loaded. */
private static final Redwood.RedwoodChannels log = Redwood.channels(SpanishVerbStripper.class);
// The following three classes of verb forms can carry attached
// pronouns:
//
// - Infinitives
// - Gerunds
// - Affirmative imperatives
/**
 * A struct describing the result of verb stripping: the stem exactly as it
 * was split from the word, the pronouns that were attached to it, and (once
 * {@link #setStem(String)} has been called) the normalized stem.
 */
public static class StrippedVerb {

  /** Normalized stem; {@code null} until {@link #setStem(String)} is called. */
  private String stem;

  /** Stem exactly as it was split off from the attached pronouns. */
  private final String originalStem;

  /** Attached pronouns, in the order given to the constructor (stored by reference, not copied). */
  private final List<String> pronouns;

  /**
   * @param originalStem the verb stem as split off from the pronouns
   * @param pronouns the pronouns that were attached to the verb
   */
  public StrippedVerb(String originalStem, List<String> pronouns) {
    this.originalStem = originalStem;
    this.pronouns = pronouns;
  }

  /** Records the normalized form of the stem. */
  public void setStem(String stem) {
    this.stem = stem;
  }

  /**
   * Return the normalized stem of the verb -- the way it would appear in
   * isolation without attached pronouns.
   *
   * Here are example mappings from original verb to normalized stem:
   *
   * <ul>
   *   <li>sentaos -&gt; sentad</li>
   *   <li>vámonos -&gt; vamos</li>
   * </ul>
   */
  public String getStem() {
    return stem;
  }

  /**
   * Returns the original stem of the verb, simply split off from pronouns.
   * (Contrast with {@link #getStem()}, which returns a normalized form.)
   */
  public String getOriginalStem() {
    return originalStem;
  }

  /** Returns the pronouns that were attached to the verb. */
  public List<String> getPronouns() {
    return pronouns;
  }
}
/** Cache of verb strippers, keyed by the dictionary path each one was built from. */
private static final Map<String, SpanishVerbStripper> instances = new HashMap<>();

/** Maps a verb form (first field of a dictionary line) to its POS tag (third field). */
private final HashMap<String, String> dict;

/** Dictionary path used by the no-argument {@code getInstance()}. */
private static final String DEFAULT_DICT =
  "edu/stanford/nlp/international/spanish/enclitic-inflections.data";
/**
 * Regex source matching any attached pronouns at the end of a word: either a
 * leading pronoun ({@code [mts]e}, {@code n?os}, {@code les?}) optionally
 * followed by {@code l[oa]s?}, or {@code l[oa]s?} on its own. The extra
 * (non-capturing) grouping around this pattern allows it to be used in
 * String concatenations.
 */
private static final String PATTERN_ATTACHED_PRONOUNS =
"(?:(?:[mts]e|n?os|les?)(?:l[oa]s?)?|l[oa]s?)$";
/**
 * Matches two attached pronouns at the end of a word; group 1 captures the
 * first pronoun and group 2 the second.
 * NOTE(review): group 2 accepts {@code l[eoa]s?} here, whereas
 * PATTERN_ATTACHED_PRONOUNS only accepts {@code l[oa]s?} in that position --
 * confirm the difference is intended.
 */
private static final Pattern pTwoAttachedPronouns =
Pattern.compile("([mts]e|n?os|les?)(l[eoa]s?)$");
/** Matches a single attached pronoun at the end of a word, captured in group 1. */
private static final Pattern pOneAttachedPronoun =
Pattern.compile("([mts]e|n?os|les?|l[oa]s?)$");
/**
 * Matches a verb ending immediately followed by attached pronouns at the
 * end of the word. The pattern is not anchored at the start, so only the
 * tail of the word has to match.
 *
 * The accepted endings are the infinitive ({@code [aeiáéí]r}) and gerund
 * ({@code [áé]ndo}) endings, plus {@code [aeáé]n?}, {@code [aeáé]mos?} and
 * a vowel followed either by "d" (when that "d" is not followed by "os") or
 * directly by "os". The latter group presumably covers affirmative
 * imperatives -- TODO confirm.
 *
 * Original (infinitives and gerunds only):
 * Pattern.compile("(?:[aeiáéí]r|[áé]ndo)" + PATTERN_ATTACHED_PRONOUNS);
 */
private static final Pattern pStrippable =
Pattern.compile("(?:[aeiáéí]r|[áé]ndo|[aeáé]n?|[aeáé]mos?|[aeiáéí](?:d(?!os)|(?=os)))" + PATTERN_ATTACHED_PRONOUNS);
/**
 * Matches irregular imperatives followed by attached pronouns. Unlike
 * pStrippable this pattern is anchored at the start, so the whole word
 * must consist of the imperative plus its pronouns:
 * decir = di, hacer = haz, ver = ve, poner = pon, salir = sal,
 * ser = sé, tener = ten, venir = ven
 * And id + os = idos, not ios
 */
private static final Pattern pIrregulars =
Pattern.compile("^(?:d[ií]|h[aá]z|v[eé]|p[oó]n|s[aá]l|sé|t[eé]n|v[eé]n|(?:id(?=os$)))" + PATTERN_ATTACHED_PRONOUNS);
/**
 * Sets up dictionary of valid verbs and their POS info from an input file.
 * The input file must be a list of whitespace-separated verb-lemma-POS triples,
 * one verb form per line. Lines with fewer than three fields are reported on
 * standard error and skipped.
 *
 * Loading is best-effort: if reading fails, the failure is reported and the
 * entries read so far (possibly none) are returned.
 *
 * @param dictPath the path to the dictionary file
 * @return a map from verb form (first field) to POS tag (third field)
 */
private static HashMap<String, String> setupDictionary(String dictPath) {
  HashMap<String, String> dictionary = new HashMap<>();
  BufferedReader br = null;
  try {
    br = IOUtils.readerFromString(dictPath);
    for (String line; (line = br.readLine()) != null; ) {
      // Split on runs of whitespace: splitting on a single whitespace
      // character would turn "verb  lemma POS" into an empty second field
      // and silently store the lemma in place of the POS tag.
      String[] words = line.trim().split("\\s+");
      if (words.length < 3) {
        System.err.printf("SpanishVerbStripper: adding words to dict, missing fields, ignoring line: %s%n", line);
      } else {
        // The lemma (words[1]) is not needed; only verb form -> POS is kept.
        dictionary.put(words[0], words[2]);
      }
    }
  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (IOException e) {
    log.info("Could not load Spanish data file " + dictPath);
  } finally {
    // br may still be null here if opening the reader failed.
    IOUtils.closeIgnoringExceptions(br);
  }
  return dictionary;
}
/**
 * Replacement table applied by removeAccents: each pair holds a pattern
 * matching one accented lowercase vowel and the unaccented vowel that
 * replaces it. Only lowercase accented vowels are listed.
 *
 * Java cannot create an array of a parameterized type, so the array is
 * created raw and assigned to the parameterized array type; that unchecked
 * conversion is what the suppression covers.
 */
@SuppressWarnings("unchecked")
private static final Pair<Pattern, String>[] accentFixes = new Pair[] {
  new Pair<>(Pattern.compile("á"), "a"),
  new Pair<>(Pattern.compile("é"), "e"),
  new Pair<>(Pattern.compile("í"), "i"),
  new Pair<>(Pattern.compile("ó"), "o"),
  new Pair<>(Pattern.compile("ú"), "u")
};
// CONSTRUCTOR
/**
 * Builds a verb stripper whose dictionary is loaded from {@code dictPath}.
 * Private on purpose: obtain instances via the singleton-like getInstance() methods.
 *
 * @param dictPath the path to the dictionary file to load
 */
private SpanishVerbStripper(String dictPath) {
  this.dict = setupDictionary(dictPath);
}
/**
 * Returns the shared verb stripper backed by the default dictionary.
 */
public static SpanishVerbStripper getInstance() {
  return getInstance(DEFAULT_DICT);
}

/**
 * Returns the shared verb stripper for the dictionary at {@code dictPath},
 * building and caching it the first time that path is requested. Later
 * calls with the same path return the cached instance.
 *
 * No synchronization is performed around the cache lookup and insert.
 *
 * @param dictPath the path to the dictionary for this verb stripper.
 */
public static SpanishVerbStripper getInstance(String dictPath) {
  SpanishVerbStripper cached = instances.get(dictPath);
  if (cached != null) {
    return cached;
  }
  SpanishVerbStripper created = new SpanishVerbStripper(dictPath);
  instances.put(dictPath, created);
  return created;
}
/**
 * The verbs in this set have accents in their infinitive forms;
 * don't remove the accents when stripping pronouns!
 * (removeAccents returns a word unchanged when it is found in this set.)
 */
private static final Set<String> accentedInfinitives = new HashSet<>(Arrays.asList(
  "desleír",
  "desoír",
  "embaír",
  "engreír",
  "entreoír",
  "freír",
  "oír",
  "refreír",
  "reír",
  "sofreír",
  "sonreír"
));
// STATIC FUNCTIONS
/**
 * Determine if the given word is a verb which needs to be stripped.
 *
 * The check is purely pattern-based: it succeeds when either the
 * regular-ending pattern or the irregular-imperative pattern finds a match
 * in the word. No dictionary lookup takes place here.
 *
 * @param word the word to test
 * @return true if one of the two patterns matches
 */
public static boolean isStrippable(String word) {
  if (pStrippable.matcher(word).find()) {
    return true;
  }
  return pIrregulars.matcher(word).find();
}
/**
 * Replaces the accented lowercase vowels listed in accentFixes with their
 * unaccented counterparts.
 *
 * A word contained in accentedInfinitives is returned unchanged, because
 * its accent belongs to the infinitive itself. That membership test is an
 * exact, case-sensitive match on the whole word.
 *
 * @param word the word to de-accent
 * @return the word with the listed accents removed, or the word itself if
 *         it is one of the accented infinitives
 */
private static String removeAccents(String word) {
  if (accentedInfinitives.contains(word)) {
    return word;
  }
  String stripped = word;
  for (Pair<Pattern, String> accentFix : accentFixes) {
    stripped = accentFix.first().matcher(stripped).replaceAll(accentFix.second());
  }
  return stripped;
}
/**
 * Determines the case of the letter as if it had been part of the
 * original string. The case is modelled on the last character of
 * {@code original}: if that character is upper case the letter is returned
 * in upper case, otherwise in lower case.
 *
 * @param original The string we are modelling the case on
 * @param letter The character whose case must be determined
 * @return {@code letter} converted to the case of the last character of
 *         {@code original}; lower case if {@code original} is empty
 */
private static char getCase(String original, char letter) {
  // An empty model string has no last character to inspect. Fall back to
  // lower case (the same result as for any non-upper-case last character)
  // instead of failing with StringIndexOutOfBoundsException.
  if (original.isEmpty()) {
    return Character.toLowerCase(letter);
  }
  char last = original.charAt(original.length() - 1);
  return Character.isUpperCase(last)
      ? Character.toUpperCase(letter)
      : Character.toLowerCase(letter);
}
private static final Pattern nosse = Pattern.compile("nos|se");

/**
 * Validate and normalize the given verb stripper result.
 *
 * The original stem is de-accented (via removeAccents) and its lower-cased
 * form is looked up in the dictionary. The split is accepted when:
 *
 * <ul>
 *   <li>the stem is in the dictionary, unless its POS tag is "VMM02P0" and
 *       the first pronoun is "os" (for that combination an elided stem is
 *       expected, not the dictionary form);</li>
 *   <li>the stem is not in the dictionary, the first pronoun is "os", and
 *       stem + "d" is in the dictionary -- the elided "d" is restored,
 *       e.g. (senta, os) becomes (sentad, os);</li>
 *   <li>the stem is not in the dictionary, the first pronoun is "nos" or
 *       "se", and stem + "s" is in the dictionary -- the elided "s" is
 *       restored, e.g. (vámo, nos) yields the stem "vamos".</li>
 * </ul>
 *
 * On success the normalized stem is stored on {@code verb} via setStem;
 * on failure {@code verb} is left unmodified.
 *
 * @param verb the stem/pronoun split to validate
 * @return true if the given data is a valid pairing of verb form and
 *         clitic pronoun(s)
 */
private boolean normalizeStrippedVerb(StrippedVerb verb) {
  String stem = removeAccents(verb.getOriginalStem());
  String leadingPronoun = verb.getPronouns().get(0).toLowerCase();

  // Dictionary keys are looked up in lower case.
  String lookupKey = stem.toLowerCase();
  String tag = dict.get(lookupKey);

  if (tag != null) {
    // Known verb form. Reject only the combination of a second-person
    // plural imperative with the second-person plural object pronoun.
    if (tag.equals("VMM02P0") && leadingPronoun.equalsIgnoreCase("os")) {
      return false;
    }
  } else if (leadingPronoun.equalsIgnoreCase("os") && dict.containsKey(lookupKey + 'd')) {
    // De-elide: restore the dropped 'd', cased like the end of the stem.
    stem = stem + getCase(stem, 'd');
  } else if (nosse.matcher(leadingPronoun).matches() && dict.containsKey(lookupKey + 's')) {
    // De-elide: restore the dropped 's', cased like the end of the stem.
    stem = stem + getCase(stem, 's');
  } else {
    return false;
  }

  // Every path that reaches this point is a valid split.
  verb.setStem(stem);
  return true;
}
/**
 * Separate attached pronouns from the given verb.
 *
 * The stem is everything before the start of the first match of
 * {@code pSuffix}; each capturing group of the pattern contributes one
 * entry, in group order, to the pronoun list.
 *
 * @param word A valid Spanish verb with clitic pronouns attached.
 * @param pSuffix A pattern to match these attached pronouns.
 * @return A {@link StrippedVerb} instance or null if no attached
 * pronouns were found.
 */
private StrippedVerb stripSuffix(String word, Pattern pSuffix) {
  Matcher m = pSuffix.matcher(word);
  if (!m.find()) {
    return null;
  }
  String stripped = word.substring(0, m.start());
  List<String> attached = new ArrayList<>(m.groupCount());
  // Capturing groups are numbered from 1; group 0 is the whole match.
  for (int group = 1; group <= m.groupCount(); group++) {
    attached.add(m.group(group));
  }
  return new StrippedVerb(stripped, attached);
}
/**
 * Attempt to separate attached pronouns from the given verb.
 *
 * A split with a single attached pronoun is tried first; only if that split
 * cannot be validated is a split with two attached pronouns tried.
 *
 * @param verb Spanish verb
 * @return Returns a StrippedVerb struct/tuple (originalStem, normalizedStem, pronouns),
 * or null if no pronouns could be located and separated.
 * <ul>
 *   <li>originalStem: The verb stem simply split from the following pronouns.</li>
 *   <li>normalizedStem: The verb stem normalized to dictionary form, i.e. in the
 *       form it would appear with the same conjugation but no pronouns.</li>
 *   <li>pronouns: Pronouns which were attached to the verb.</li>
 * </ul>
 */
public StrippedVerb separatePronouns(String verb) {
  StrippedVerb single = stripSuffix(verb, pOneAttachedPronoun);
  if (single != null && normalizeStrippedVerb(single)) {
    return single;
  }

  StrippedVerb pair = stripSuffix(verb, pTwoAttachedPronouns);
  return (pair != null && normalizeStrippedVerb(pair)) ? pair : null;
}
/**
 * Remove attached pronouns from a strippable Spanish verb form. (Use
 * {@link #isStrippable(String)} to determine if a word is a
 * strippable verb.)
 *
 * Converts, e.g.,
 * <ul>
 *   <li>decírmelo -&gt; decir</li>
 *   <li>mudarse -&gt; mudar</li>
 *   <li>contándolos -&gt; contando</li>
 *   <li>hazlo -&gt; haz</li>
 * </ul>
 *
 * @param verb the verb form to strip
 * @return A verb form stripped of attached pronouns, or null
 * if no pronouns were located / stripped.
 */
public String stripVerb(String verb) {
  StrippedVerb separated = separatePronouns(verb);
  return separated == null ? null : separated.getStem();
}
/** Serialization version identifier for this {@link Serializable} class. */
private static final long serialVersionUID = -4780144226395772354L;
}