edu.stanford.nlp.international.spanish.SpanishVerbStripper Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.international.spanish;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.Pair;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Provides a utility function for removing attached pronouns from
 * Spanish verb forms.
 *
 * @author Jon Gauthier
 * @author Ishita Prasad
 */
public final class SpanishVerbStripper implements Serializable {

  // The following three classes of verb forms can carry attached
  // pronouns:
  //
  //   - Infinitives
  //   - Gerunds
  //   - Affirmative imperatives

  /* HashMap of singleton instances */
  private static final Map instances = new HashMap();

  private HashMap dict;

  private static final String DEFAULT_DICT =
    "edu/stanford/nlp/international/spanish/enclitic-inflections.data";

  private static final String PATTERN_ATTACHED_PRONOUNS =
    "(?:(?:(?:[mts]e|n?os|les?)(?:l[oa]s?)?)|l[oa]s?)$";

  private static final Pattern pTwoAttachedPronouns =
    Pattern.compile("(?:([mts]e|n?os|les?)(l[eoa]s?)?)$");

  private static final Pattern pOneAttachedPronoun =
    Pattern.compile("([mts]e|n?os|les?|l[oa]s?)$");

  /**
   * Matches infinitives and gerunds with attached pronouns.
   * Original: Pattern.compile("(?:[aeiáéí]r|[áé]ndo)" + PATTERN_ATTACHED_PRONOUNS);
   */
  private static final Pattern pStrippable =
    Pattern.compile("(?:[aeiáéí]r|[áé]ndo|[aeáé]n?|[aeáé]mos?|[aeiáéí](?:d(?!os)|(?=os)))" + PATTERN_ATTACHED_PRONOUNS);

  /**
   * Matches irregular imperatives:
   * decir = di, hacer = haz, ver = ve, poner = pon, salir = sal,
   * ser = sé, tener = ten, venir = ven
   * And id + os = idos, not ios
   */
  private static final Pattern pIrregulars =
    Pattern.compile("^(?:d[ií]|h[aá]z|v[eé]|p[oó]n|s[aá]l|sé|t[eé]n|v[eé]n|(?:id(?=os$)))" + PATTERN_ATTACHED_PRONOUNS);

  /**
   * Sets up dictionary of valid verbs and their POS info from an input file.
   * The input file must be a list of tab-separated verb-POS pairs, one verb
   * per line.
   *
   * @param dictPath the path to the dictionary file
   */
  private void setupDictionary(String dictPath) {
    try {
      dict = new HashMap();
      BufferedReader br = IOUtils.readerFromString(dictPath);
      for(String line; (line = br.readLine()) != null; ) {
        String[] words = line.trim().split("\\s");
        if(words.length < 3) {
          System.err.printf("SpanishVerbStripper: addings words to dict, missing word, ignoring line%n");
        }
        dict.put(words[0], words[2]);
      }
      IOUtils.closeIgnoringExceptions(br);
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      System.err.println("Could not load Spanish data file " + dictPath);
    } catch (IOException e) {
      System.err.println("Could not load Spanish data file " + dictPath);
    }
  }

  @SuppressWarnings("unchecked")
  private static final Pair[] accentFixes = new Pair[] {
    new Pair(Pattern.compile("á"), "a"),
    new Pair(Pattern.compile("é"), "e"),
    new Pair(Pattern.compile("í"), "i"),
    new Pair(Pattern.compile("ó"), "o"),
    new Pair(Pattern.compile("ú"), "u")
  };

  // CONSTRUCTORS

  private SpanishVerbStripper() {
    this(DEFAULT_DICT);
  }

  private SpanishVerbStripper(String dictPath) {
    setupDictionary(dictPath);
  }

  /**
   * Singleton pattern function for getting a default verb stripper
   */
  public static SpanishVerbStripper getInstance() {
    return getInstance(DEFAULT_DICT);
  }

  /**
   * Singleton pattern function for getting a verb stripper based on
   * the dictionary at dictPath.
   *
   * @param dictPath the path to the dictionary for this verb stripper.
   */
  public static SpanishVerbStripper getInstance(String dictPath) {
    SpanishVerbStripper svs = instances.get(dictPath);
    if (svs == null) {
      svs = new SpanishVerbStripper(dictPath);
      instances.put(dictPath, svs);
    }
    return svs;
  }

  /**
   * The verbs in this set have accents in their infinitive forms;
   * don't remove the accents when stripping pronouns!
   */
  private static final Set accentedInfinitives = new HashSet(Arrays.asList(
    "desleír",
    "desoír",
    "embaír",
    "engreír",
    "entreoír",
    "freír",
    "oír",
    "refreír",
    "reír",
    "sofreír",
    "sonreír"
  ));

  // STATIC FUNCTIONS

  /**
   * Determine if the given word is a verb which needs to be stripped.
   */
  public static boolean isStrippable(String word) {
    return pStrippable.matcher(word).find() || pIrregulars.matcher(word).find();
  }

  private static String removeAccents(String word) {
    if (accentedInfinitives.contains(word))
      return word;

    String stripped = word;
    for (Pair accentFix : accentFixes)
      stripped = accentFix.first().matcher(stripped)
        .replaceAll(accentFix.second());
    return stripped;
  }

  /**
   * Determines the case of the letter as if it had been part of the
   * original string
   *
   * @param letter The character whose case must be determined
   * @param original The string we are modelling the case on
   */
  private static char getCase(String original, char letter) {
    if (Character.isUpperCase(original.charAt(original.length()-1))) {
      return Character.toUpperCase(letter);
    } else {
      return Character.toLowerCase(letter);
    }
  }

  /**
   * Examines the given verb pair and returns true if it is a
   * valid pairing of verb form and clitic pronoun(s).
   *
   * May modify pair in place in order to make the pair valid.
   * For example, if the pair (senta, os) is provided, this
   * method will return true and modify the pair to be
   * (sentad, os).
   */
  private boolean validateVerbPair(Pair> pair) {
    String stripped = pair.first().toLowerCase();
    String firstPron = pair.second().get(0).toLowerCase();

    String pos = dict.get(stripped);

    if (pos != null) {
      if (pos.equals("VMM02P0") && firstPron.equalsIgnoreCase("os")) {
        // Invalid combination of verb root and pronoun.
        // (If we combine a second-person plural imperative and the
        // second person plural object pronoun, we expect to see an
        // elided verb root, not the normal one that's in the
        // dictionary.)
        return false;
      }

      return true;
    }

    // Special case: de-elide elided verb root in the case of a second
    // person plural imperative + second person object pronoun
    //
    // (e.g., given (senta, os), return (sentad, os))
    if (firstPron.equalsIgnoreCase("os") && dict.containsKey(stripped + 'd')) {
      pair.setFirst(pair.first() + getCase(pair.first(), 'd'));
      return true;
    }

    // Special case: de-elide elided verb root in the case of a first
    // person plural imperative + object pronoun
    //
    // (vámo, nos) -> (vámos, nos)
    if (firstPron.matches("nos|se") && dict.containsKey(stripped + 's')) {
      pair.setFirst(pair.first() + getCase(pair.first(), 's'));
      return true;
    }

    return false;
  }

  /**
   * Separate attached pronouns from the given verb.
   *
   * @param word A valid Spanish verb with clitic pronouns attached.
   * @param pSuffix A pattern to match these attached pronouns.
   * @return A pair containing the verb (pronouns removed by the given
   *           pattern) and a list of the pronouns which were attached
   *           to the verb.
   */
  private static Pair> stripSuffix(String word,
                                                        Pattern pSuffix) {
    Matcher m = pSuffix.matcher(word);
    if (m.find()) {
      String stripped = word.substring(0, m.start());
      stripped = removeAccents(stripped);

      List attached = new ArrayList();
      for (int i = 0; i < m.groupCount(); i++)
        attached.add(m.group(i + 1));

      return new Pair>(stripped, attached);
    }

    return null;
  }

  /**
   * Attempt to separate attached pronouns from the given verb.
   *
   * @param verb Spanish verb
   * @return A pair containing the verb (pronouns removed) and a list of
   *           the pronouns which were attached to the verb, or
   *           null if no pronouns could be located and
   *           separated.
   */
  public Pair> separatePronouns(String verb) {
    Pair> separated;

    // Try to strip just one pronoun first
    separated = stripSuffix(verb, pOneAttachedPronoun);
    if (separated != null && validateVerbPair(separated))
      return separated;

    // Now two
    separated = stripSuffix(verb, pTwoAttachedPronouns);
    if (separated != null && validateVerbPair(separated))
      return separated;

    return null;
  }

  /**
   * Remove attached pronouns from a strippable Spanish verb form. (Use
   * {@link #isStrippable(String)} to determine if a word is a
   * strippable verb.)
   *
   * Converts e.g.
   *
   *   - decírmelo -> decir
   *   - mudarse -> mudar
   *   - contándolos -> contando
   *   - hazlo -> haz
   *
   * @return A verb form stripped of attached pronouns, or null
   *           if no pronouns were located / stripped.
   */
  public String stripVerb(String verb) {
    Pair> separated = separatePronouns(verb);
    if (separated != null) {
      return separated.first();
    }
    return null;
  }

  private static final long serialVersionUID = -4780144226395772354L;

}