edu.stanford.nlp.parser.lexparser.ArabicUnknownWordModel Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Index;

/**
 * This is a basic unknown word model for Arabic.  It supports 4 different
 * types of feature modeling; see {@link #getSignature(String, int)}.
 *
 * Implementation note: the contents of this class tend to overlap somewhat
 * with {@link EnglishUnknownWordModel} and were originally included in {@link BaseLexicon}.
 *
 * @author Roger Levy
 * @author Christopher Manning
 * @author Anna Rafferty
 */
public class ArabicUnknownWordModel extends BaseUnknownWordModel  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ArabicUnknownWordModel.class);

  private static final long serialVersionUID = 4825624957364628771L;

  private static final int MIN_UNKNOWN = 6;

  private static final int MAX_UNKNOWN = 10;

  protected final boolean smartMutation;
  protected final int unknownSuffixSize;
  protected final int unknownPrefixSize;


  public ArabicUnknownWordModel(Options op, Lexicon lex,
                                Index wordIndex,
                                Index tagIndex,
                                ClassicCounter unSeenCounter) {
    super(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null);
    if (unknownLevel < MIN_UNKNOWN || unknownLevel > MAX_UNKNOWN) {
      throw new IllegalArgumentException("Invalid value for useUnknownWordSignatures: " + unknownLevel);
    }
    this.smartMutation = op.lexOptions.smartMutation;
    this.unknownSuffixSize = op.lexOptions.unknownSuffixSize;
    this.unknownPrefixSize = op.lexOptions.unknownPrefixSize;
  }

  /**
   * This constructor creates an UWM with empty data structures.  Only
   * use if loading in the data separately, such as by reading in text
   * lines containing the data.
   */
  public ArabicUnknownWordModel(Options op, Lexicon lex,
                                Index wordIndex,
                                Index tagIndex) {
    this(op, lex, wordIndex, tagIndex, new ClassicCounter<>());
  }

  @Override
  public float score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, String word) {
    double pb_W_T; // always set below

    //  unknown word model for P(T|S)

    int wordSig = getSignatureIndex(iTW.word, loc, word);
    IntTaggedWord temp = new IntTaggedWord(wordSig, iTW.tag);
    double c_TS = unSeenCounter.getCount(temp);
    temp = new IntTaggedWord(wordSig, nullTag);
    double c_S = unSeenCounter.getCount(temp);
    double c_U = unSeenCounter.getCount(NULL_ITW);
    temp = new IntTaggedWord(nullWord, iTW.tag);
    double c_T = unSeenCounter.getCount(temp);

    double p_T_U = c_T / c_U;

    if (unknownLevel == 0) {
      c_TS = 0;
      c_S = 0;
    }
    double pb_T_S = (c_TS + smooth * p_T_U) / (c_S + smooth);

    double p_T = (c_Tseen / total);
    double p_W = 1.0 / total;
    pb_W_T = Math.log(pb_T_S * p_W / p_T);

    return (float) pb_W_T;
  }


  /**
   * Returns the index of the signature of the word numbered wordIndex, where
   * the signature is the String representation of unknown word features.
   */
  @Override
  public int getSignatureIndex(int index, int sentencePosition, String word) {
    String uwSig = getSignature(word, sentencePosition);
    int sig = wordIndex.addToIndex(uwSig);
    return sig;
  }

  /**
   *  6-9 were added for Arabic. 6 looks for the prefix Al- (and
   * knows that Buckwalter uses various symbols as letters), while 7 just looks
   * for numbers and last letter. 8 looks for Al-, looks for several useful
   * suffixes, and tracks the first letter of the word. (note that the first
   * letter seems a bit more informative than the last letter, overall.)
   * 9 tries to build on 8, but avoiding some of its perceived flaws: really it
   * was using the first AND last letter.
   *
   * @param word The word to make a signature for
   * @param loc Its position in the sentence (mainly so sentence-initial
   *          capitalized words can be treated differently)
   * @return A String that is its signature (equivalence class)
   */
  @Override
  public String getSignature(String word, int loc) {
    StringBuilder sb = new StringBuilder("UNK");
    switch (unknownLevel) {
    case 10://Anna's attempt at improving Chris' attempt, April 2008
    {
      boolean allDigitPlus = ArabicUnknownWordSignatures.allDigitPlus(word);
      int leng = word.length();
      if (allDigitPlus) {
        sb.append("-NUM");
      } else if (word.startsWith("Al") || word.startsWith("\u0627\u0644")) {
        sb.append("-Al");
      } else {
        // the first letters of a word seem more informative overall than the
        // last letters.
        // Alternatively we could add on the first two letters, if there's
        // enough data.
        if (unknownPrefixSize > 0) {
          int min = leng < unknownPrefixSize ? leng: unknownPrefixSize;
          sb.append('-').append(word.substring(0, min));
        }
      }
      if(word.length() == 1) {
        //add in the unicode type for the char
        sb.append(Character.getType(word.charAt(0)));
      }
      sb.append(ArabicUnknownWordSignatures.likelyAdjectivalSuffix(word));
      sb.append(ArabicUnknownWordSignatures.pastTenseVerbNumberSuffix(word));
      sb.append(ArabicUnknownWordSignatures.presentTenseVerbNumberSuffix(word));
      String ans = ArabicUnknownWordSignatures.abstractionNounSuffix(word);
      if (! "".equals(ans)) {
        sb.append(ans);
      } else {
        sb.append(ArabicUnknownWordSignatures.taaMarbuuTaSuffix(word));
      }
      if (unknownSuffixSize > 0 && ! allDigitPlus) {
        int min = leng < unknownSuffixSize ? leng: unknownSuffixSize;
        sb.append('-').append(word.substring(word.length() - min));
      }
      break;
    }
    case 9: // Chris' attempt at improving Roger's Arabic attempt, Nov 2006.
    {
      boolean allDigitPlus = ArabicUnknownWordSignatures.allDigitPlus(word);
      int leng = word.length();
      if (allDigitPlus) {
        sb.append("-NUM");
      } else if (word.startsWith("Al") || word.startsWith("\u0627\u0644")) {
        sb.append("-Al");
      } else {
        // the first letters of a word seem more informative overall than the
        // last letters.
        // Alternatively we could add on the first two letters, if there's
        // enough data.
        if (unknownPrefixSize > 0) {
          int min = leng < unknownPrefixSize ? leng: unknownPrefixSize;
          sb.append('-').append(word.substring(0, min));
        }
      }

      sb.append(ArabicUnknownWordSignatures.likelyAdjectivalSuffix(word));
      sb.append(ArabicUnknownWordSignatures.pastTenseVerbNumberSuffix(word));
      sb.append(ArabicUnknownWordSignatures.presentTenseVerbNumberSuffix(word));
      String ans = ArabicUnknownWordSignatures.abstractionNounSuffix(word);
      if (! "".equals(ans)) {
        sb.append(ans);
      } else {
        sb.append(ArabicUnknownWordSignatures.taaMarbuuTaSuffix(word));
      }
      if (unknownSuffixSize > 0 && ! allDigitPlus) {
        int min = leng < unknownSuffixSize ? leng: unknownSuffixSize;
        sb.append('-').append(word.substring(word.length() - min));
      }
      break;
    }

    case 8: // Roger's attempt at an Arabic UWM, May 2006.
    {
      if (word.startsWith("Al")) {
        sb.append("-Al");
      }
      boolean allDigitPlus = ArabicUnknownWordSignatures.allDigitPlus(word);
      if (allDigitPlus) {
        sb.append("-NUM");
      } else {
        // the first letters of a word seem more informative overall than the
        // last letters.
        // Alternatively we could add on the first two letters, if there's
        // enough data.
        sb.append('-').append(word.charAt(0));
      }
      sb.append(ArabicUnknownWordSignatures.likelyAdjectivalSuffix(word));
      sb.append(ArabicUnknownWordSignatures.pastTenseVerbNumberSuffix(word));
      sb.append(ArabicUnknownWordSignatures.presentTenseVerbNumberSuffix(word));
      sb.append(ArabicUnknownWordSignatures.taaMarbuuTaSuffix(word));
      sb.append(ArabicUnknownWordSignatures.abstractionNounSuffix(word));
      break;
    }

    case 7: {
      // For Arabic with Al's separated off (cdm, May 2006)
      // { -NUM, -lastChar }
      boolean allDigitPlus = ArabicUnknownWordSignatures.allDigitPlus(word);
      if (allDigitPlus) {
        sb.append("-NUM");
      } else {
        sb.append(word.charAt(word.length() - 1));
      }
      break;
    }

    case 6: {
      // For Arabic (cdm, May 2006), with Al- as part of word
      // { -Al, 0 } +
      // { -NUM, -last char(s) }
      if (word.startsWith("Al")) {
        sb.append("-Al");
      }
      boolean allDigitPlus = ArabicUnknownWordSignatures.allDigitPlus(word);
      if (allDigitPlus) {
        sb.append("-NUM");
      } else {
        sb.append(word.charAt(word.length() - 1));
      }
      break;
    }
    default:
      // 0 = do nothing so it just stays as "UNK"
    } // end switch (unknownLevel)
    // log.info("Summarized " + word + " to " + sb.toString());
    return sb.toString();
  } // end getSignature()

  @Override
  public int getUnknownLevel() {
    return unknownLevel;
  }

}