edu.stanford.nlp.trees.international.arabic.ArabicHeadFinder Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.trees.international.arabic; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.regex.Pattern;

import edu.stanford.nlp.trees.AbstractCollinsHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.Generics;

/**
 * Find the head of an Arabic tree, using the usual kind of heuristic
 * head finding rules.
 * 
 * Implementation notes.
 * TO DO: make sure that -PRD marked elements are always chosen as heads.
 * (Has this now been successfully done or not??)
 * 
 * Mona: I added the 8 new nonterminals for DT merged with its following
 * category. As a rule, the DT nonterminal is right-headed. The 8 new DT
 * nonterminals are: DTCD, DTRB, DTRP, DTJJ, DTNN, DTNNS, DTNNP, DTNNPS.
 * This was added Dec 7th, 2004.
 *
 * @author Roger Levy
 * @author Mona Diab
 * @author Christopher Manning (added new stuff for ATBp3v3)
 */
public class ArabicHeadFinder extends AbstractCollinsHeadFinder  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ArabicHeadFinder.class);
  private static final long serialVersionUID = 6203368998430280740L;

  /** The tag set whose tag names were substituted into the head rules at construction time. */
  protected TagSet tagSet;

  /* A work in progress. There may well be a better way to parameterize the HeadFinders via tagset. */
  /**
   * The part-of-speech tag inventories this head finder can be configured for.
   * Each constant supplies the concrete tag strings that are spliced into the
   * head rules, plus the language pack used when none is given explicitly.
   */
  public enum TagSet {
    BIES_COLLAPSED {
      @Override
      String prep()  { return "IN"; }
      @Override
      String noun()  { return "NN"; } // really there should be several here.
      @Override
      String det()  { return "DT"; }
      @Override
      String adj()  { return "JJ"; }
      @Override
      String detPlusNoun()  { return "DTNN"; }  // really there should be several here; major point is that the det part is ignored completely
      @Override
      TreebankLanguagePack langPack()  { return new ArabicTreebankLanguagePack(); }
    },
    ORIGINAL {
      @Override
      String prep()  { return "PREP"; }
      @Override
      String noun()  { return "NOUN"; }
      @Override
      String det()  { return "DET"; }
      @Override
      String adj()  { return "ADJ"; }
      @Override
      String detPlusNoun()  { return "DET+NN"; }
      @Override
      TreebankLanguagePack langPack()  { return new ArabicTreebankLanguagePack(); }
    };

    /** @return the preposition tag of this tag set */
    abstract String prep();

    /** @return the (basic) noun tag of this tag set */
    abstract String noun();

    /** @return the adjective tag of this tag set */
    abstract String adj();

    /** @return the determiner tag of this tag set */
    abstract String det();

    /** @return the tag used for a determiner merged with a following noun */
    abstract String detPlusNoun();

    /** @return a newly created language pack to use with this tag set */
    abstract TreebankLanguagePack langPack();

    /**
     * Looks up a tag set by its exact (case-sensitive) name.
     *
     * @param str Either "BIES_COLLAPSED" or "ORIGINAL"
     * @return the matching constant
     * @throws IllegalArgumentException if {@code str} names no known tag set
     */
    static TagSet tagSet(String str) {
      switch (str) {
        case "BIES_COLLAPSED":
          return BIES_COLLAPSED;
        case "ORIGINAL":
          return ORIGINAL;
        default:
          throw new IllegalArgumentException("Don't know anything about tagset " + str);
      }
    }
  }

  /**
   * Construct an ArabicHeadFinder with a fresh {@link ArabicTreebankLanguagePack}
   * and the {@link TagSet#BIES_COLLAPSED} tag set.
   */
  public ArabicHeadFinder() {
    this(new ArabicTreebankLanguagePack());
  }

  /**
   * Construct an ArabicHeadFinder with a String parameter corresponding to the tagset in use.
   * @param tagSet Either "ORIGINAL" or "BIES_COLLAPSED"
   * @throws IllegalArgumentException if the name is not one of the two above
   */
  public ArabicHeadFinder(String tagSet) {
    this(TagSet.tagSet(tagSet));
  }

  /**
   * Construct an ArabicHeadFinder for the given tag set, using the language
   * pack that the tag set itself supplies.
   *
   * @param tagSet The tag set whose tag names are used in the head rules
   */
  public ArabicHeadFinder(TagSet tagSet) {
    this(tagSet.langPack(), tagSet);
  }

  /**
   * Construct an ArabicHeadFinder with the given language pack and the
   * {@link TagSet#BIES_COLLAPSED} tag set.
   *
   * @param tlp The language pack handed to the superclass
   */
  public ArabicHeadFinder(TreebankLanguagePack tlp) {
    this(tlp,TagSet.BIES_COLLAPSED);
  }

  /**
   * Builds the head-rule table. Every entry maps a parent category to a list
   * of rules; each rule is a String array whose first element is a direction
   * keyword ("left", "right" or "rightdis") and whose remaining elements are
   * candidate head categories. The rules are interpreted by the superclass
   * (not shown in this file). Tag names that differ between tag sets are taken
   * from {@code tagSet}; all other categories are hard-coded.
   *
   * @param tlp The language pack handed to the superclass; its start symbol
   *            also gets a head rule
   * @param tagSet The tag set whose tag names are used in the head rules
   */
  protected ArabicHeadFinder(TreebankLanguagePack tlp, TagSet tagSet) {
    super(tlp);
    this.tagSet = tagSet;

    nonTerminalInfo = Generics.newHashMap();

    nonTerminalInfo.put("NX", new String[][]{{"left", "DT","DTNN","DTNNS","DTNNP", "DTNNPS", "DTJJ", "DTNOUN_QUANT", "NOUN_QUANT", "MWNP"}});
    nonTerminalInfo.put("ADJP", new String[][]{{"rightdis", tagSet.adj(), "DTJJ", "ADJ_NUM", "DTADJ_NUM", "JJR", "DTJJR", "MWADJP"}, {"right", "ADJP", "VN", tagSet.noun(), "MWNP", "NNP", "NNPS", "NNS", "DTNN", "DTNNS","DTNNP","DTNNPS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}, {"right", "RB", "MWADVP", "CD","DTRB","DTCD"}, {"right", "DT"}}); // sometimes right, sometimes left headed??
    nonTerminalInfo.put("MWADJP", new String[][]{{"rightdis", tagSet.adj(), "DTJJ", "ADJ_NUM", "DTADJ_NUM", "JJR", "DTJJR"}, {"right", tagSet.noun(), "MWNP", "NNP", "NNPS", "NNS", "DTNN", "DTNNS","DTNNP","DTNNPS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}, {"right", "RB", "MWADVP", "CD","DTRB","DTCD"}, {"right", "DT"}}); // sometimes right, sometimes left headed??
    nonTerminalInfo.put("ADVP", new String[][]{{"left", "WRB", "RB", "MWADVP", "ADVP", "WHADVP","DTRB"}, {"left", "CD", "RP", tagSet.noun(), "MWNP", "CC", "MWCONJP", tagSet.adj(), "MWADJP", "DTJJ", "ADJ_NUM", "DTADJ_NUM", "IN", "MWPP", "NP", "NNP", "NOFUNC","DTRP","DTNN","DTNNP","DTNNPS","DTNNS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}}); // NNP is a gerund that they called an unknown (=NNP, believe it or not...)
    nonTerminalInfo.put("MWADVP", new String[][]{{"left", "WRB", "RB", "ADVP", "WHADVP","DTRB"}, {"left", "CD", "RP", tagSet.noun(), "MWNP", "CC", "MWCONJP", tagSet.adj(), "MWADJP", "DTJJ", "ADJ_NUM", "DTADJ_NUM", "IN", "MWPP", "NP", "NNP", "NOFUNC","DTRP","DTNN","DTNNP","DTNNPS","DTNNS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}}); // NNP is a gerund that they called an unknown (=NNP, believe it or not...)
    nonTerminalInfo.put("CONJP", new String[][]{{"right", "IN", "RB", "MWADVP", tagSet.noun(), "MWNP", "NNS","NNP", "NNPS", "DTRB", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}});
    nonTerminalInfo.put("MWCONJP", new String[][]{{"right", "IN", "RB", "MWADVP", tagSet.noun(), "MWNP", "NNS","NNP", "NNPS", "DTRB", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}});
    nonTerminalInfo.put("FRAG", new String[][]{{"left", tagSet.noun(), "MWNP", "NNPS", "NNP","NNS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "VBP"}});
    nonTerminalInfo.put("MWFRAG", new String[][]{{"left", tagSet.noun(), "MWNP", "NNPS", "NNP","NNS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "VBP"}});
    nonTerminalInfo.put("INTJ", new String[][]{{"left", "RP", "UH", "DTRP"}});
    nonTerminalInfo.put("LST", new String[][]{{"left"}});
    nonTerminalInfo.put("NAC", new String[][]{{"left", "NP", "SBAR", "PP", "MWP","ADJP", "S", "PRT", "UCP"}, {"left", "ADVP"}}); // note: maybe CC, RB should be the heads?
    nonTerminalInfo.put("NP", new String[][]{{"left", tagSet.noun(), "MWNP", tagSet.detPlusNoun(), "NNS", "NNP", "NNPS", "NP", "PRP", "WHNP", "QP", "WP", "DTNNS", "DTNNPS", "DTNNP", "NOFUNC", "NO_FUNC", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", tagSet.adj(), "MWADJP", "DTJJ", "JJR", "DTJJR", "ADJ_NUM", "DTADJ_NUM"}, {"right", "CD", "DTCD"}, {"left", "PRP$"}, {"right", "DT"}}); // should the JJ rule be left or right?
    nonTerminalInfo.put("MWNP", new String[][]{{"left", tagSet.noun(), "MWNP", tagSet.detPlusNoun(), "NNS", "NNP", "NNPS", "PRP", "QP", "WP", "DTNNS", "DTNNPS", "DTNNP", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", tagSet.adj(), "MWADJP", "DTJJ", "JJR", "DTJJR", "ADJ_NUM", "DTADJ_NUM"}, {"right", "CD", "DTCD"}, {"left", "PRP$"}, {"right", "DT"}}); // should the JJ rule be left or right?
    nonTerminalInfo.put("PP", new String[][]{{"left", tagSet.prep(), "MWPP", "PP", "MWP","PRT", "X"}, {"left", "NNP", "RP", tagSet.noun(), "MWNP"}, {"left", "NP"}}); // NN is for a mistaken "fy", and many wsT
    nonTerminalInfo.put("MWPP", new String[][]{{"left", tagSet.prep(), "PP", "MWP","PRT", "X"}, {"left", "NNP", "RP", tagSet.noun(), "MWNP"}, {"left", "NP"}}); // NN is for a mistaken "fy", and many wsT
    nonTerminalInfo.put("PRN", new String[][]{{"left", "NP"}}); // don't get PUNC
    nonTerminalInfo.put("MWPRN", new String[][]{{"left", "IN"}}); // don't get PUNC
    nonTerminalInfo.put("PRT", new String[][]{{"left", "RP", "PRT", "IN", "DTRP"}});
    nonTerminalInfo.put("QP", new String[][]{{"right", "CD", "DTCD", tagSet.noun(), "MWNP", tagSet.adj(), "MWADJP", "NNS", "NNP", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}});

    nonTerminalInfo.put("S", new String[][]{{"left", "VP", "MWVP", "S"}, {"right", "PP", "MWP","ADVP", "SBAR", "UCP", "ADJP"}}); // really important to put in -PRD sensitivity here!
    nonTerminalInfo.put("MWS", new String[][]{{"left", "VP", "MWVP", "S"}, {"right", "PP", "MWP","ADVP", "SBAR", "UCP", "ADJP"}}); // really important to put in -PRD sensitivity here!
    nonTerminalInfo.put("SQ", new String[][]{{"left", "VP", "MWVP", "PP", "MWP"}}); // to be principled, we need -PRD sensitivity here too.
    nonTerminalInfo.put("SBAR", new String[][]{{"left", "WHNP", "WHADVP", "WRB", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X", "DTRB", "DTRP"}, {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "S"}});
    nonTerminalInfo.put("MWSBAR", new String[][]{{"left", "WHNP", "WHADVP", "WRB", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X", "DTRB", "DTRP"}, {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "S"}});
    nonTerminalInfo.put("SBARQ", new String[][]{{"left", "WHNP", "WHADVP", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X"}, {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS","DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "S"}}); // copied from SBAR rule -- look more closely when there's time
    nonTerminalInfo.put("UCP", new String[][]{{"left"}});
    nonTerminalInfo.put("VP", new String[][]{{"left", "VBD", "VBN", "VBP", "VBG", "DTVBG", "VN", "DTVN", "VP", "RB", "MWADVP", "X","VB"}, {"left", "IN"}, {"left", "NNP", tagSet.noun(), "MWNP", "DTNN", "DTNNP", "DTNNPS", "DTNNS", "DTNOUN_QUANT", "NOUN_QUANT"}}); // exclude RP because we don't want negation markers as heads -- no useful information?
    nonTerminalInfo.put("MWVP", new String[][]{{"left", "VBD", "VBN", "VBP", "VBG", "DTVBG", "VN", "DTVN", "VP", "MWVP", "RB", "MWADVP", "X","VB"}, {"left", "IN"}, {"left", "NNP", tagSet.noun(), "MWNP", "DTNN", "DTNNP", "DTNNPS", "DTNNS", "DTNOUN_QUANT", "NOUN_QUANT"}}); // exclude RP because we don't want negation markers as heads -- no useful information?


    //also, RB is used as gerunds

    nonTerminalInfo.put("WHADVP", new String[][]{{"left", "WRB", "WP"}, {"right", "CC", "MWCONJP"}, {"left", "IN"}});
    nonTerminalInfo.put("WHNP", new String[][]{{"right", "WP"}});
    nonTerminalInfo.put("WHPP", new String[][]{{"left",  "IN", "MWPP", "RB", "MWADVP"}});
    nonTerminalInfo.put("X", new String[][]{{"left"}});

    //Added by Mona 12/7/04 for the newly created DT nonterm cat
    nonTerminalInfo.put("DTNN", new String[][]{{"right"}});
    nonTerminalInfo.put("DTNNS", new String[][]{{"right"}});
    nonTerminalInfo.put("DTNNP", new String[][]{{"right"}});
    nonTerminalInfo.put("DTNNPS", new String[][]{{"right"}});
    nonTerminalInfo.put("DTJJ", new String[][]{{"right"}});
    nonTerminalInfo.put("DTRP", new String[][]{{"right"}});
    nonTerminalInfo.put("DTRB", new String[][]{{"right"}});
    nonTerminalInfo.put("DTCD", new String[][]{{"right"}});
    nonTerminalInfo.put("DTIN", new String[][]{{"right"}});

    // stand-in dependency:
    nonTerminalInfo.put("EDITED", new String[][]{{"left"}});
    nonTerminalInfo.put(tlp.startSymbol(), new String[][]{{"left"}});

    // one stray SINV in the training set...garbage head rule here.
    nonTerminalInfo.put("SINV", new String[][]{{"left","ADJP","VP"}});
  }


  // Matches any label that ends in the predicate marker "-PRD".
  // NOTE(review): deliberately left as an instance field. The class declares a
  // serialVersionUID, so it is presumably serialized; making this field static
  // would remove it from the serialized form.
  private final Pattern predPattern = Pattern.compile(".*-PRD$");

  /**
   * Predicatively marked elements in a sentence should be noted as heads.
   * Only nodes whose value is exactly "S" are inspected; for those, the
   * leftmost child whose value ends in "-PRD" is returned.
   *
   * @param t The tree whose marked head is sought
   * @return the first -PRD marked child of an S node, or {@code null} if
   *         {@code t} is not an S node or has no such child
   */
  @Override
  protected Tree findMarkedHead(Tree t) {
    // Constant-first comparison: a node without a label value is simply "not S"
    // instead of triggering a NullPointerException.
    if (!"S".equals(t.value())) {
      return null;
    }
    for (Tree kid : t.children()) {
      String kidCat = kid.value();
      // Skip unlabeled children; Pattern.matcher(null) would throw.
      if (kidCat != null && predPattern.matcher(kidCat).matches()) {
        return kid;
      }
    }
    return null;
  }

}