edu.stanford.nlp.parser.lexparser.SpanishUnknownWordModel Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.international.spanish.SpanishUnknownWordSignatures;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.StringUtils;


public class SpanishUnknownWordModel extends BaseUnknownWordModel  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(SpanishUnknownWordModel.class);

  protected final boolean smartMutation;

  protected final int unknownSuffixSize;
  protected final int unknownPrefixSize;

  public SpanishUnknownWordModel(Options op, Lexicon lex,
                                 Index wordIndex,
                                 Index tagIndex,
                                 ClassicCounter unSeenCounter) {
    super(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null);
    this.smartMutation = op.lexOptions.smartMutation;
    this.unknownSuffixSize = op.lexOptions.unknownSuffixSize;
    this.unknownPrefixSize = op.lexOptions.unknownPrefixSize;
  }

  /**
   * This constructor creates an UWM with empty data structures.  Only
   * use if loading in the data separately, such as by reading in text
   * lines containing the data.
   */
  public SpanishUnknownWordModel(Options op, Lexicon lex,
                                 Index wordIndex,
                                 Index tagIndex) {
    this(op, lex, wordIndex, tagIndex, new ClassicCounter<>());
  }

  @Override
  public float score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, String word) {
    double pb_W_T; // always set below

    //  unknown word model for P(T|S)

    int wordSig = getSignatureIndex(iTW.word, loc, word);
    IntTaggedWord temp = new IntTaggedWord(wordSig, iTW.tag);
    double c_TS = unSeenCounter.getCount(temp);
    temp = new IntTaggedWord(wordSig, nullTag);
    double c_S = unSeenCounter.getCount(temp);
    double c_U = unSeenCounter.getCount(NULL_ITW);
    temp = new IntTaggedWord(nullWord, iTW.tag);
    double c_T = unSeenCounter.getCount(temp);

    double p_T_U = c_T / c_U;

    if (unknownLevel == 0) {
      c_TS = 0;
      c_S = 0;
    }
    double pb_T_S = (c_TS + smooth * p_T_U) / (c_S + smooth);

    double p_T = (c_Tseen / total);
    double p_W = 1.0 / total;
    pb_W_T = Math.log(pb_T_S * p_W / p_T);

    return (float) pb_W_T;
  }


  /**
   * Returns the index of the signature of the word numbered wordIndex, where
   * the signature is the String representation of unknown word features.
   */
  @Override
  public int getSignatureIndex(int index, int sentencePosition, String word) {
    String uwSig = getSignature(word, sentencePosition);
    int sig = wordIndex.addToIndex(uwSig);
    return sig;
  }

  /**
   * TODO Can add various signatures, setting the signature via Options.
   *
   * @param word The word to make a signature for
   * @param loc Its position in the sentence (mainly so sentence-initial
   *          capitalized words can be treated differently)
   * @return A String that is its signature (equivalence class)
   */
  @Override
  public String getSignature(String word, int loc) {
    final String BASE_LABEL = "UNK";
    StringBuilder sb = new StringBuilder(BASE_LABEL);

    switch (unknownLevel) {
      case 1:
        if (StringUtils.isNumeric(word)) {
          sb.append('#');
          break;
        } else if (StringUtils.isPunct(word)) {
          sb.append('!');
          break;
        }

        // Mutually exclusive patterns
        sb.append(SpanishUnknownWordSignatures.conditionalSuffix(word));
        sb.append(SpanishUnknownWordSignatures.imperfectSuffix(word));
        sb.append(SpanishUnknownWordSignatures.infinitiveSuffix(word));
        sb.append(SpanishUnknownWordSignatures.adverbSuffix(word));

        // Broad coverage patterns -- only apply if we haven't yet matched at all
        if (sb.toString().equals(BASE_LABEL)) {
          if (SpanishUnknownWordSignatures.hasVerbFirstPersonPluralSuffix(word)) {
            sb.append("-vb1p");
          } else if (SpanishUnknownWordSignatures.hasGerundSuffix(word)) {
            sb.append("-ger");
          } else if (word.endsWith("s")) {
            sb.append("-s");
          }
        }

        // Backoff to suffix if we haven't matched anything else
        if (unknownSuffixSize > 0 && sb.toString().equals(BASE_LABEL)) {
          int min = word.length() < unknownSuffixSize ? word.length() : unknownSuffixSize;
          sb.append('-').append(word.substring(word.length() - min));
        }

        char first = word.charAt(0);
        if ((Character.isUpperCase(first) || Character.isTitleCase(first)) && !isUpperCase(word)) {
          sb.append("-C");
        } else {
          sb.append("-c");
        }

        break;

      default:
        log.error(String.format("%s: Invalid unknown word signature! (%d)%n", this.getClass().getName(),unknownLevel));
    }

    return sb.toString();
  }

  private static boolean isUpperCase(String s) {
    for (int i = 0; i < s.length(); i++) {
      if (Character.isLowerCase(s.charAt(i)))
        return false;
    }
    return true;
  }

  private static final long serialVersionUID = 5370429530690606644L;

}