package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.WordSegmenter;
import edu.stanford.nlp.process.WordSegmentingTokenizer;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.international.pennchinese.*;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.ReflectionLoading;
import edu.stanford.nlp.util.StringUtils;

import java.io.IOException;
import java.util.*;


/**
 * Parameter file for parsing the Penn Chinese Treebank.  Includes
 * category enrichments specific to the Penn Chinese Treebank.
 *
 * @author Roger Levy
 * @author Christopher Manning
 * @author Galen Andrew
 */

public class ChineseTreebankParserParams extends AbstractTreebankParserParams  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ChineseTreebankParserParams.class);

  /**
   * The variable ctlp stores the same thing as the tlp variable in
   * AbstractTreebankParserParams, but pre-cast to be a
   * ChineseTreebankLanguagePack.
   * todo [cdm 2013]: Just change to method that casts
   */
  private ChineseTreebankLanguagePack ctlp;
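  // One way the todo above could go (a sketch, not part of the current API):
  // drop the cached field in favor of an accessor that casts on demand, e.g.
  //   private ChineseTreebankLanguagePack ctlp() {
  //     return (ChineseTreebankLanguagePack) treebankLanguagePack();
  //   }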
  public boolean charTags = false;
  public boolean useCharacterBasedLexicon = false;
  public boolean useMaxentLexicon = false;
  public boolean useMaxentDepGrammar = false;
  public boolean segment = false;
  public boolean segmentMarkov = false;
  public boolean sunJurafskyHeadFinder = false;
  public boolean bikelHeadFinder = false;
  public boolean discardFrags = false;
  public boolean useSimilarWordMap = false;

  public String segmenterClass = null;

  private Lexicon lex;
  private WordSegmenter segmenter;
  private HeadFinder headFinder = null;

  private static void printlnErr(String s) {
    EncodingPrintWriter.err.println(s, ChineseTreebankLanguagePack.ENCODING);
  }

  public ChineseTreebankParserParams() {
    super(new ChineseTreebankLanguagePack());
    ctlp = (ChineseTreebankLanguagePack) super.treebankLanguagePack();
  }

  /**
   * Returns a ChineseHeadFinder by default, or the head finder selected
   * by the -sunHead, -bikelHead, or -headFinder options.
   */
  @Override
  public HeadFinder headFinder() {
    if (headFinder == null) {
      if (sunJurafskyHeadFinder) {
        return new SunJurafskyChineseHeadFinder();
      } else if (bikelHeadFinder) {
        return new BikelChineseHeadFinder();
      } else {
        return new ChineseHeadFinder();
      }
    } else {
      return headFinder;
    }
  }

  @Override
  public HeadFinder typedDependencyHeadFinder() {
    if (this.generateOriginalDependencies()) {
      return new ChineseSemanticHeadFinder();
    } else {
      return new UniversalChineseSemanticHeadFinder();
    }

  }

  /**
   * Returns a ChineseLexicon, a ChineseCharacterBasedLexicon, or a
   * ChineseLexiconAndWordSegmenter, depending on the options set.
   */
  @Override
  public Lexicon lex(Options op, Index<String> wordIndex, Index<String> tagIndex) {
    if (useCharacterBasedLexicon) {
      return lex = new ChineseCharacterBasedLexicon(this, wordIndex, tagIndex);
    // } else if (useMaxentLexicon) {
    // return lex = new ChineseMaxentLexicon();
    }
    if (op.lexOptions.uwModelTrainer == null) {
      op.lexOptions.uwModelTrainer = "edu.stanford.nlp.parser.lexparser.ChineseUnknownWordModelTrainer";
    }
    if (segmenterClass != null) {
      try {
        segmenter = ReflectionLoading.loadByReflection(segmenterClass, this,
                                                       wordIndex, tagIndex);
      } catch (ReflectionLoading.ReflectionLoadingException e) {
        segmenter = ReflectionLoading.loadByReflection(segmenterClass);
      }
    }

    ChineseLexicon clex = new ChineseLexicon(op, this, wordIndex, tagIndex);
    if (segmenter != null) {
      lex = new ChineseLexiconAndWordSegmenter(clex, segmenter);
      ctlp.setTokenizerFactory(WordSegmentingTokenizer.factory(segmenter));
    } else {
      lex = clex;
    }

    return lex;
  }
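
  /* A sketch of how a segmenter gets wired in (the class name here is one of
   * those recognized by setOptionFlag below; any other segmenter class is up
   * to the caller):
   *
   *   ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
   *   ctpp.setOptionFlag(new String[] {"-segmenter",
   *       "edu.stanford.nlp.parser.lexparser.ChineseMarkovWordSegmenter"}, 0);
   *   // lex() will then reflectively construct the segmenter, trying the
   *   // (this, wordIndex, tagIndex) constructor first and falling back to
   *   // the no-argument constructor, and swap in a word-segmenting
   *   // tokenizer factory.
   */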

  @Override
  public double[] MLEDependencyGrammarSmoothingParams() {
    return new double[]{5.8, 17.7, 6.5, 0.4};
  }

  @Override
  public TreeReaderFactory treeReaderFactory() {
    final TreeNormalizer tn = new CTBErrorCorrectingTreeNormalizer(splitNPTMP, splitPPTMP, splitXPTMP, charTags);
    return new CTBTreeReaderFactory(tn, discardFrags);
  }

  /**
   * Uses a DiskTreebank with a CHTBTokenizer and a
   * CTBErrorCorrectingTreeNormalizer (via treeReaderFactory()).
   */
  @Override
  public DiskTreebank diskTreebank() {
    String encoding = inputEncoding;
    if (!java.nio.charset.Charset.isSupported(encoding)) {
      printlnErr("Warning: desired encoding " + encoding + " not accepted. ");
      printlnErr("Using UTF-8 to construct DiskTreebank");
      encoding = "UTF-8";
    }

    return new DiskTreebank(treeReaderFactory(), encoding);
  }


  /**
   * Uses a MemoryTreebank with a CHTBTokenizer and a
   * CTBErrorCorrectingTreeNormalizer (via treeReaderFactory()).
   */
  @Override
  public MemoryTreebank memoryTreebank() {
    String encoding = inputEncoding;
    if (!java.nio.charset.Charset.isSupported(encoding)) {
      System.out.println("Warning: desired encoding " + encoding + " not accepted. ");
      System.out.println("Using UTF-8 to construct MemoryTreebank");
      encoding = "UTF-8";
    }

    return new MemoryTreebank(treeReaderFactory(), encoding);
  }


  /**
   * Returns a ChineseCollinizer
   */
  @Override
  public TreeTransformer collinizer() {
    return new ChineseCollinizer(ctlp);
  }

  /**
   * Returns a ChineseCollinizer that doesn't delete punctuation
   */
  @Override
  public TreeTransformer collinizerEvalb() {
    return new ChineseCollinizer(ctlp, false);
  }

  //   /** Returns a ChineseTreebankLanguagePack */
  //   public TreebankLanguagePack treebankLanguagePack() {
  //     return new ChineseTreebankLanguagePack();
  //   }


  /* --------- not used now
    // Automatically generated by ParentAnnotationStats -- preferably don't edit
    private static final String[] splitters1 = new String[] {"VA^VCD", "NP^NP", "NP^VP", "NP^IP", "NP^DNP", "NP^PP", "NP^LCP", "NP^PRN", "NP^QP", "PP^IP", "PP^NP", "NN^FRAG", "NN^NP", "NT^FRAG", "NT^NP", "NR^FRAG", "NR^NP", "VV^FRAG", "VV^VRD", "VV^VCD", "VV^VP", "VV^VSB", "VP^VP", "VP^IP", "VP^DVP", "IP^ROOT", "IP^IP", "IP^CP", "IP^VP", "IP^PP", "IP^NP", "IP^LCP", "CP^IP", "QP^NP", "QP^PP", "QP^VP", "ADVP^CP", "CC^VP", "CC^NP", "CC^IP", "CC^QP", "PU^NP", "PU^FRAG", "PU^IP", "PU^VP", "PU^PRN", "PU^QP", "PU^LST", "NP^DNP~QP", "NT^NP~NP", "NT^NP~VP", "NT^NP~IP", "NT^NP~LCP", "NT^NP~PP", "NT^NP~PRN", "NT^NP~QP", "NT^NP~DNP", "NP^NP~VP", "NP^NP~NP", "NP^NP~IP", "NP^NP~PP", "NP^NP~DNP", "NP^NP~LCP", "NN^NP~VP", "NN^NP~IP", "NN^NP~NP", "NN^NP~PP", "NN^NP~DNP", "NN^NP~LCP", "NN^NP~UCP", "NN^NP~QP", "NN^NP~PRN", "M^CLP~DP", "M^CLP~QP", "M^CLP~NP", "M^CLP~CLP", "CD^QP~VP", "CD^QP~NP", "CD^QP~QP", "CD^QP~LCP", "CD^QP~PP", "CD^QP~DNP", "CD^QP~DP", "CD^QP~IP", "IP^IP~IP", "IP^IP~ROOT", "IP^IP~VP", "LC^LCP~PP", "LC^LCP~IP", "NP^VP~IP", "NP^VP~VP", "AD^ADVP~IP", "AD^ADVP~QP", "AD^ADVP~VP", "AD^ADVP~NP", "AD^ADVP~PP", "AD^ADVP~ADVP", "NP^IP~ROOT", "NP^IP~IP", "NP^IP~CP", "NP^IP~VP", "DT^DP~PP", "P^PP~IP", "P^PP~NP", "P^PP~VP", "P^PP~DNP", "VV^VP~IP", "VV^VP~VP", "PU^IP~IP", "PU^IP~VP", "PU^IP~ROOT", "PU^IP~CP", "JJ^ADJP~DNP", "JJ^ADJP~ADJP", "NR^NP~IP", "NR^NP~NP", "NR^NP~PP", "NR^NP~VP", "NR^NP~DNP", "NR^NP~LCP", "NR^NP~PRN", "NP^PP~NP", "NP^PP~IP", "NP^PP~DNP", "VA^VP~VP", "VA^VP~IP", "VA^VP~DVP", "VP^VP~VP", "VP^VP~IP", "VP^VP~DVP", "VP^IP~ROOT", "VP^IP~CP", "VP^IP~IP", "VP^IP~VP", "VP^IP~PP", "VP^IP~LCP", "VP^IP~NP", "PN^NP~NP", "PN^NP~IP", "PN^NP~PP"};
    private static final String[] splitters2 = new String[] {"VA^VCD", "NP^NP", "NP^VP", "NP^IP", "NP^DNP", "NP^PP", "NP^LCP", "NN^FRAG", "NN^NP", "NT^FRAG", "NT^NP", "NR^FRAG", "NR^NP", "VV^FRAG", "VV^VRD", "VV^VCD", "VV^VP", "VV^VSB", "VP^VP", "VP^IP", "VP^DVP", "IP^ROOT", "IP^IP", "IP^CP", "IP^VP", "IP^PP", "CP^IP", "ADVP^CP", "CC^VP", "CC^NP", "PU^NP", "PU^FRAG", "PU^IP", "PU^VP", "PU^PRN", "NT^NP~NP", "NT^NP~VP", "NT^NP~IP", "NT^NP~LCP", "NT^NP~PP", "NP^NP~VP", "NP^NP~NP", "NP^NP~IP", "NP^NP~PP", "NP^NP~DNP", "NN^NP~VP", "NN^NP~IP", "NN^NP~NP", "NN^NP~PP", "NN^NP~DNP", "NN^NP~LCP", "NN^NP~UCP", "NN^NP~QP", "NN^NP~PRN", "M^CLP~DP", "CD^QP~VP", "CD^QP~NP", "CD^QP~QP", "CD^QP~LCP", "CD^QP~PP", "CD^QP~DNP", "CD^QP~DP", "LC^LCP~PP", "NP^VP~IP", "NP^VP~VP", "AD^ADVP~IP", "AD^ADVP~QP", "AD^ADVP~VP", "AD^ADVP~NP", "NP^IP~ROOT", "NP^IP~IP", "NP^IP~CP", "NP^IP~VP", "P^PP~IP", "P^PP~NP", "P^PP~VP", "P^PP~DNP", "VV^VP~IP", "VV^VP~VP", "PU^IP~IP", "PU^IP~VP", "PU^IP~ROOT", "PU^IP~CP", "JJ^ADJP~DNP", "NR^NP~IP", "NR^NP~NP", "NR^NP~PP", "NR^NP~VP", "NR^NP~DNP", "NR^NP~LCP", "NP^PP~NP", "VA^VP~VP", "VA^VP~IP", "VP^VP~VP", "VP^IP~ROOT", "VP^IP~CP", "VP^IP~IP", "VP^IP~VP", "VP^IP~PP", "VP^IP~LCP", "VP^IP~NP", "PN^NP~NP"};
    private static final String[] splitters3 = new String[] {"NP^NP", "NP^VP", "NP^IP", "NP^DNP", "NP^PP", "NP^LCP", "NN^FRAG", "NN^NP", "NT^FRAG", "NR^FRAG", "NR^NP", "VV^FRAG", "VV^VRD", "VV^VCD", "VV^VP", "VV^VSB", "VP^VP", "VP^IP", "IP^ROOT", "IP^IP", "IP^CP", "IP^VP", "PU^NP", "PU^FRAG", "PU^IP", "PU^VP", "PU^PRN", "NP^NP~VP", "NN^NP~VP", "NN^NP~IP", "NN^NP~NP", "NN^NP~PP", "NN^NP~DNP", "NN^NP~LCP", "M^CLP~DP", "CD^QP~VP", "CD^QP~NP", "CD^QP~QP", "AD^ADVP~IP", "AD^ADVP~QP", "AD^ADVP~VP", "P^PP~IP", "VV^VP~IP", "VV^VP~VP", "PU^IP~IP", "PU^IP~VP", "NR^NP~IP", "NR^NP~NP", "NR^NP~PP", "NR^NP~VP", "VP^VP~VP", "VP^IP~ROOT", "VP^IP~CP", "VP^IP~IP", "VP^IP~VP"};
    private static final String[] splitters4 = new String[] {"NP^NP", "NP^VP", "NP^IP", "NN^FRAG", "NT^FRAG", "NR^FRAG", "VV^FRAG", "VV^VRD", "VV^VCD", "VP^VP", "VP^IP", "IP^ROOT", "IP^IP", "IP^CP", "IP^VP", "PU^NP", "PU^FRAG", "PU^IP", "PU^VP", "NN^NP~VP", "NN^NP~IP", "NN^NP~NP", "NN^NP~PP", "NN^NP~DNP", "NN^NP~LCP", "CD^QP~VP", "CD^QP~NP", "AD^ADVP~IP", "VV^VP~IP", "VV^VP~VP", "NR^NP~IP", "VP^IP~ROOT", "VP^IP~CP"};
    // these ones were built by hand.
    // one can't tag split under FRAG or everything breaks, because of those
    // big flat FRAGs....
    private static final String[] splitters5 = new String[] {"NN^FRAG", "NT^FRAG", "NR^FRAG", "VV^FRAG", "VV^VCD", "VV^VRD", "NP^NP", "VP^VP", "IP^ROOT", "IP^IP", "PU^NP", "PU^FRAG", "P^PP~VP", "P^PP~IP"};
    private static final String[] splitters6 = new String[] {"VV^VCD", "VV^VRD", "NP^NP", "VP^VP", "IP^ROOT", "IP^IP", "PU^NP", "P^PP~VP", "P^PP~IP"};
    private static final String[] splitters7 = new String[] {"NP^NP", "VP^VP", "IP^ROOT", "IP^IP", "PU^NP", "P^PP~VP", "P^PP~IP"};
    private static final String[] splitters8 = new String[] {"IP^ROOT", "IP^IP", "PU^NP", "P^PP~VP", "P^PP~IP"};
    private static final String[] splitters9 = new String[] {"VV^VCD", "VV^VRD", "NP^NP", "VP^VP", "IP^ROOT", "IP^IP", "P^PP~VP", "P^PP~IP"};
    private static final String[] splitters10 = new String[] {"NP^NP", "VP^VP", "IP^ROOT", "IP^IP", "P^PP~VP", "P^PP~IP"};


    public String[] splitters() {
      switch (selectiveSplitLevel) {
      case 1:
        return splitters1;
      case 2:
        return splitters2;
      case 3:
        return splitters3;
      case 4:
        return splitters4;
      case 5:
        return splitters5;
      case 6:
        return splitters6;
      case 7:
        return splitters7;
      case 8:
        return splitters8;
      case 9:
        return splitters9;
      case 10:
        return splitters10;
      default:
        return new String[0];
      }
    }
  ------------------ */

  @Override
  public String[] sisterSplitters() {
    return StringUtils.EMPTY_STRING_ARRAY;
  }

  /**
   * transformTree does all language-specific tree
   * transformations. Any parameterizations should be inside the
   * specific TreebankLangParserParams class.
   */
  @Override
  public Tree transformTree(Tree t, Tree root) {
    if (t == null || t.isLeaf()) {
      return t;
    }

    String parentStr;
    String grandParentStr;
    Tree parent;
    Tree grandParent;
    if (root == null || t.equals(root)) {
      parent = null;
      parentStr = "";
    } else {
      parent = t.parent(root);
      parentStr = parent.label().value();
    }
    if (parent == null || parent.equals(root)) {
      grandParent = null;
      grandParentStr = "";
    } else {
      grandParent = parent.parent(root);
      grandParentStr = grandParent.label().value();
    }

    String baseParentStr = ctlp.basicCategory(parentStr);
    String baseGrandParentStr = ctlp.basicCategory(grandParentStr);

    CoreLabel lab = (CoreLabel) t.label();
    String word = lab.word();
    String tag = lab.tag();
    String baseTag = ctlp.basicCategory(tag);
    String category = lab.value();
    String baseCategory = ctlp.basicCategory(category);

    if (t.isPreTerminal()) { // it's a POS tag
      List<String> leftAunts = listBasicCategories(SisterAnnotationStats.leftSisterLabels(parent, grandParent));
      List<String> rightAunts = listBasicCategories(SisterAnnotationStats.rightSisterLabels(parent, grandParent));

      // Chinese-specific punctuation splits
      if (chineseSplitPunct && baseTag.equals("PU")) {
        if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(word)) {
          tag = tag + "-DOU";
          // System.out.println("Punct: Split dou hao"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseCommaAcceptFilter().test(word)) {
          tag = tag + "-COMMA";
          // System.out.println("Punct: Split comma"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseColonAcceptFilter().test(word)) {
          tag = tag + "-COLON";
          // System.out.println("Punct: Split colon"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseQuoteMarkAcceptFilter().test(word)) {
          if (chineseSplitPunctLR) {
            if (ChineseTreebankLanguagePack.chineseLeftQuoteMarkAcceptFilter().test(word)) {
              tag += "-LQUOTE";
            } else {
              tag += "-RQUOTE";
            }
          } else {
            tag = tag + "-QUOTE";
          }
          // System.out.println("Punct: Split quote"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseEndSentenceAcceptFilter().test(word)) {
          tag = tag + "-ENDSENT";
          // System.out.println("Punct: Split end sent"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseParenthesisAcceptFilter().test(word)) {
          if (chineseSplitPunctLR) {
            if (ChineseTreebankLanguagePack.chineseLeftParenthesisAcceptFilter().test(word)) {
              tag += "-LPAREN";
            } else {
              tag += "-RPAREN";
            }
          } else {
            tag += "-PAREN";
            //printlnErr("Just used -PAREN annotation");
            //printlnErr(word);
            //throw new RuntimeException();
          }
          // System.out.println("Punct: Split paren"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseDashAcceptFilter().test(word)) {
          tag = tag + "-DASH";
          // System.out.println("Punct: Split dash"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseOtherAcceptFilter().test(word)) {
          tag = tag + "-OTHER";
        } else {
          printlnErr("Unknown punct (you should add it to CTLP): " + tag + " |" + word + "|");
        }
      } else if (chineseSplitDouHao) {   // only split DouHao
        if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(word) && baseTag.equals("PU")) {
          tag = tag + "-DOU";
        }
      }

      // Chinese-specific POS tag splits (non-punctuation)

      if (tagWordSize) {
        int l = word.length();
        tag += "-" + l + "CHARS";
      }

      if (mergeNNVV && baseTag.equals("NN")) {
        tag = "VV";
      }

      if ((chineseSelectiveTagPA || chineseVerySelectiveTagPA) && (baseTag.equals("CC") || baseTag.equals("P"))) {
        tag += "-" + baseParentStr;
      }
      if (chineseSelectiveTagPA && (baseTag.equals("VV"))) {
        tag += "-" + baseParentStr;
      }

      if (markMultiNtag && tag.startsWith("N")) {
        for (int i = 0; i < parent.numChildren(); i++) {
          if (parent.children()[i].label().value().startsWith("N") && parent.children()[i] != t) {
            tag += "=N";
            //System.out.println("Found multi=N rewrite");
          }
        }
      }

      if (markVVsisterIP && baseTag.equals("VV")) {
        boolean seenIP = false;
        for (int i = 0; i < parent.numChildren(); i++) {
          if (parent.children()[i].label().value().startsWith("IP")) {
            seenIP = true;
          }
        }
        if (seenIP) {
          tag += "-IP";
          //System.out.println("Found VV with IP sister"); // testing
        }
      }

      if (markPsisterIP && baseTag.equals("P")) {
        boolean seenIP = false;
        for (int i = 0; i < parent.numChildren(); i++) {
          if (parent.children()[i].label().value().startsWith("IP")) {
            seenIP = true;
          }
        }
        if (seenIP) {
          tag += "-IP";
        }
      }

      if (markADgrandchildOfIP && baseTag.equals("AD") && baseGrandParentStr.equals("IP")) {
        tag += "~IP";
        //System.out.println("Found AD with IP grandparent"); // testing
      }

      if (gpaAD && baseTag.equals("AD")) {
        tag += "~" + baseGrandParentStr;
        //System.out.println("Found AD with grandparent " + grandParentStr); // testing
      }

      if (markPostverbalP && leftAunts.contains("VV") && baseTag.equals("P")) {
        //System.out.println("Found post-verbal P");
        tag += "^=lVV";
      }

      // end Chinese-specific tag splits

      Label label = new CategoryWordTag(tag, word, tag);
      t.setLabel(label);
    } else {
      // it's a phrasal category
      Tree[] kids = t.children();

      // Chinese-specific category splits
      List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
      List<String> rightSis = listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));

      if (paRootDtr && baseParentStr.equals("ROOT")) {
        category += "^ROOT";
      }

      if (markIPsisterBA && baseCategory.equals("IP")) {
        if (leftSis.contains("BA")) {
          category += "=BA";
          //System.out.println("Found IP sister of BA");
        }
      }

      if (dominatesV && hasV(t.preTerminalYield())) {
        // mark categories containing a verb
        category += "-v";
      }

      if (markIPsisterVVorP && baseCategory.equals("IP")) {
        // todo: cdm: is just looking for "P" here selective enough??
        if (leftSis.contains("VV") || leftSis.contains("P")) {
          category += "=VVP";
        }
      }

      if (markIPsisDEC && baseCategory.equals("IP")) {
        if (rightSis.contains("DEC")) {
          category += "=DEC";
          //System.out.println("Found prenominal IP");
        }
      }

      if (baseCategory.equals("VP")) {
        // cdm 2008: this used to just check that it startsWith("VP"), but
        // I think that was bad because it also matched VPT verb compounds
        if (chineseSplitVP == 3) {
          boolean hasCC = false;
          boolean hasPU = false;
          boolean hasLexV = false;
          for (Tree kid : kids) {
            if (kid.label().value().startsWith("CC")) {
              hasCC = true;
            } else if (kid.label().value().startsWith("PU")) {
              hasPU = true;
            } else if (StringUtils.lookingAt(kid.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
              hasLexV = true;
            }
          }
          if (hasCC || (hasPU && ! hasLexV)) {
            category += "-CRD";
            //System.out.println("Found coordinate VP"); // testing
          } else if (hasLexV) {
            category += "-COMP";
            //System.out.println("Found complementing VP"); // testing
          } else {
            category += "-ADJT";
            //System.out.println("Found adjoining VP"); // testing
          }
        } else if (chineseSplitVP >= 1) {
          boolean hasBA = false;
          for (Tree kid : kids) {
            if (kid.label().value().startsWith("BA")) {
              hasBA = true;
            } else if (chineseSplitVP == 2 && tlp.basicCategory(kid.label().value()).equals("VP")) {
              for (Tree kidkid : kid.children()) {
                if (kidkid.label().value().startsWith("BA")) {
                  hasBA = true;
                }
              }
            }
          }
          if (hasBA) {
            category += "-BA";
          }
        }
      }

      if (markVPadjunct && baseParentStr.equals("VP")) {
        // cdm 2008: This used to use startsWith("VP") but changed to baseCat
        Tree[] sisters = parent.children();
        boolean hasVPsister = false;
        boolean hasCC = false;
        boolean hasPU = false;
        boolean hasLexV = false;
        for (Tree sister : sisters) {
          if (tlp.basicCategory(sister.label().value()).equals("VP")) {
            hasVPsister = true;
          }
          if (sister.label().value().startsWith("CC")) {
            hasCC = true;
          }
          if (sister.label().value().startsWith("PU")) {
            hasPU = true;
          }
          if (StringUtils.lookingAt(sister.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
            hasLexV = true;
          }
        }
        if (hasVPsister && !(hasCC || hasPU || hasLexV)) {
          category += "-VPADJ";
          //System.out.println("Found adjunct of VP"); // testing
        }
      }

      if (markNPmodNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
        if (rightSis.contains("NP")) {
          category += "=MODIFIERNP";
          //System.out.println("Found NP modifier of NP"); // testing
        }
      }

      if (markModifiedNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
        if (rightSis.isEmpty() && (leftSis.contains("ADJP") || leftSis.contains("NP") || leftSis.contains("DNP") || leftSis.contains("QP") || leftSis.contains("CP") || leftSis.contains("PP"))) {
          category += "=MODIFIEDNP";
          //System.out.println("Found modified NP"); // testing
        }
      }

      if (markNPconj && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
        if (rightSis.contains("CC") || rightSis.contains("PU") || leftSis.contains("CC") || leftSis.contains("PU")) {
          category += "=CONJ";
          //System.out.println("Found NP conjunct"); // testing
        }
      }

      if (markIPconj && baseCategory.equals("IP") && baseParentStr.equals("IP")) {
        Tree[] sisters = parent.children();
        boolean hasCommaSis = false;
        boolean hasIPSis = false;
        for (Tree sister : sisters) {
          if (ctlp.basicCategory(sister.label().value()).equals("PU") && ChineseTreebankLanguagePack.chineseCommaAcceptFilter().test(sister.children()[0].label().toString())) {
            hasCommaSis = true;
            //System.out.println("Found CommaSis"); // testing
          }
          if (ctlp.basicCategory(sister.label().value()).equals("IP") && sister != t) {
            hasIPSis = true;
          }
        }
        if (hasCommaSis && hasIPSis) {
          category += "-CONJ";
          //System.out.println("Found IP conjunct"); // testing
        }
      }

      if (unaryIP && baseCategory.equals("IP") && t.numChildren() == 1) {
        category += "-U";
        //System.out.println("Found unary IP"); //testing
      }
      if (unaryCP && baseCategory.equals("CP") && t.numChildren() == 1) {
        category += "-U";
        //System.out.println("Found unary CP"); //testing
      }

      if (splitBaseNP && baseCategory.equals("NP")) {
        if (t.isPrePreTerminal()) {
          category = category + "-B";
        }
      }

      //if (Test.verbose) printlnErr(baseCategory + " " + leftSis.toString()); //debugging

      if (markPostverbalPP && leftSis.contains("VV") && baseCategory.equals("PP")) {
        //System.out.println("Found post-verbal PP");
        category += "=lVV";
      }

      if ((markADgrandchildOfIP || gpaAD) && listBasicCategories(SisterAnnotationStats.kidLabels(t)).contains("AD")) {
        category += "^ADVP";
      }

      if (markCC) {
        // was: for (int i = 0; i < kids.length; i++) {
        // This second version takes an idea from Collins: don't count
        // marginal conjunctions which don't conjoin 2 things.
        for (int i = 1; i < kids.length - 1; i++) {
          String cat2 = kids[i].label().value();
          if (cat2.startsWith("CC")) {
            category += "-CC";
          }
        }
      }

      Label label = new CategoryWordTag(category, word, tag);
      t.setLabel(label);
    }
    return t;
  }
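
  /* To illustrate the annotations above (a sketch, assuming the default
   * flag settings): in a subtree (VP (VV 说) (IP ...)), markVVsisterIP
   * retags the verb as VV-IP, markIPsisterVVorP relabels the clause as
   * IP=VVP, and chineseSplitVP == 3 classifies the VP as complement-taking,
   * giving roughly:
   *
   *   (VP (VV 说) (IP ...))  ->  (VP-COMP (VV-IP 说) (IP=VVP ...))
   */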


  /**
   * Chinese: Split the dou hao (a punctuation mark separating
   * members of a list) from other punctuation.  Good, but subsumed by
   * chineseSplitPunct below.
   */
  public boolean chineseSplitDouHao = false;
  /**
   * Chinese: split Chinese punctuation several ways, along the lines
   * of English punctuation plus another category for the dou hao.  Good.
   */
  public boolean chineseSplitPunct = true;
  /**
   * Chinese: split left/right quote marks and parentheses (if
   * chineseSplitPunct is also true).  Only very marginal gains, but
   * seems positive.
   */
  public boolean chineseSplitPunctLR = false;

  /**
   * Chinese: mark VVs that are sister of IP (communication &
   * small-clause-taking verbs).  Good: gives 0.5%.
   */
  public boolean markVVsisterIP = true;

  /**
   * Chinese: mark P's that are sister of IP.  Negative effect.
   */
  public boolean markPsisterIP = true;

  /**
   * Chinese: mark IP's that are sister of VV or P.  These rarely
   * have punctuation. Small positive effect.
   */
  public boolean markIPsisterVVorP = true;


  /**
   * Chinese: mark ADs that are grandchild of IP.
   */
  public boolean markADgrandchildOfIP = false;
  /**
   * Grandparent annotate all AD.  Seems slightly negative.
   */
  public boolean gpaAD = true;

  // using tagPA on Chinese 100k is negative.

  public boolean chineseVerySelectiveTagPA = false;
  public boolean chineseSelectiveTagPA = false;

  /**
   * Chinese: mark IPs that are sister of BA.  These always have
   * overt NP.  Very slightly positive.
   */
  public boolean markIPsisterBA = true;

  /**
   * Chinese: mark phrases that are adjuncts of VP (these tend to be
   * locatives/temporals, and have a specific distribution).
   * Necessary even with chineseSplitVP==3 and parent annotation because
   * parent annotation happens with unsplit parent categories.
   * Slightly positive.
   */
  public boolean markVPadjunct = true;

  /**
   * Chinese: mark NP modifiers of NPs. Quite positive (0.5%)
   */
  public boolean markNPmodNP = true;

  /**
   * Chinese: mark left-modified NPs (rightmost NPs with a left-side
   * mod).  Slightly positive.
   */
  public boolean markModifiedNP = true;

  /**
   * Chinese: mark NPs that are conjuncts.  Negative on small set.
   */
  public boolean markNPconj = true;

  /**
   * Chinese: mark nominal tags that are part of multi-nominal
   * rewrites.  Doesn't seem any good.
   */
  public boolean markMultiNtag = false;

  /**
   * Chinese: mark IPs that are part of prenominal modifiers. Negative.
   */
  public boolean markIPsisDEC = true;

  /**
   * Chinese: mark IPs that are conjuncts (markIPconj), or IPs that
   * have adjuncts or subjects (markIPadjsubj).
   */
  public boolean markIPconj = false;
  public boolean markIPadjsubj = false;

  /**
   * Chinese VP splitting.  0 = none;
   * 1 = mark with -BA a VP that directly dominates a BA;
   * 2 = mark with -BA a VP that directly dominates a BA, or a VP that
   *     directly dominates a VP that directly dominates a BA;
   * 3 = split VPs into VP-COMP, VP-CRD, VP-ADJT.  (Negative effect.)
   */
  public int chineseSplitVP = 3;
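
  /* A sketch of the -BA modes on a toy subtree: with chineseSplitVP == 1,
   *   (VP (BA 把) (IP ...))  ->  (VP-BA (BA 把) (IP ...))
   * and with chineseSplitVP == 2 a VP is also marked when a VP child of it
   * directly dominates the BA:
   *   (VP (VP (BA 把) ...) ...)  ->  (VP-BA (VP-BA (BA 把) ...) ...)
   */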

  /** Chinese: if an IP has no subject (including no empty-category
   * subject), then it should only have an NP (adjunct) daughter if
   * it's a coordinate IP and the NP scopes over the conjunct
   * IPs. (sometimes this NP daughter is adjoined in an IP -> NP
   * IP_coord structure, sometimes the IP conjuncts are at the same
   * level as the NP).  In other cases NP adjuncts should be inside
   * VP.  So: an IP dominating neither a non-subject NP nor another IP
   * should have no NP daughters.  BUT this generalization breaks down
   * when you try to extend it to IPs ignoring their empty subjects.
   * So the simplest thing to do would be to mark non-subject dtrs of
   * IP....  but I think we need to leave the SBJ functional tagging
   * on categories to be consistent about this.
   *
   * Update: I tried retaining SBJ markers with
   * SbjRetainingTreeNormalizer but it works worse than using
   * markVPadjunct.
   */

  /**
   * Chinese: merge NN and VV.  A lark.
   */
  public boolean mergeNNVV = false;

  // XXXX upto in testing

  /**
   * Chinese: unary category marking
   */
  public boolean unaryIP = false;
  public boolean unaryCP = false;

  /**
   * Chinese: parent annotate daughter of root.  Meant only for
   * selectivesplit=false.
   */
  public boolean paRootDtr = false; // true

  /**
   * Chinese: mark P with a left aunt VV, and PP with a left sister
   * VV.  Note that it's necessary to mark both to thread the
   * context-marking.  Used to identify post-verbal P's, which are
   * rare.
   */
  public boolean markPostverbalP = false;
  public boolean markPostverbalPP = false;


  // Not used now
  // /** How selectively to split. */
  // public int selectiveSplitLevel = 1;

  /**
   * Mark base NPs.  Good.
   */
  public boolean splitBaseNP = false;

  /**
   * Annotate tags for number of characters contained.
   */
  public boolean tagWordSize = false;

  /**
   * Mark phrases which are conjunctions.
   * Appears negative, even with 200K words training data.
   */
  public boolean markCC = false;

  /**
   * Whether to retain the -TMP functional tag on various phrasal
   * categories.  On 80K words training, minutely helpful; on 200K
   * words, best option gives 0.6%.  Doing
   * splitNPTMP and splitPPTMP (but not splitXPTMP) is best.
   */
  public boolean splitNPTMP = false;
  public boolean splitPPTMP = false;
  public boolean splitXPTMP = false;
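
  /* A sketch of the intended effect (these flags are handed to the
   * CTBErrorCorrectingTreeNormalizer in treeReaderFactory() above): with
   * splitNPTMP set, a temporal NP such as (NP-TMP (NT 今天)) keeps its
   * -TMP tag through normalization rather than being stripped to plain NP.
   */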

  /**
   * Verbal distance -- mark whether symbol dominates a verb (V*).
   * Seems bad for Chinese.
   */
  public boolean dominatesV = false;


  /**
   * Parameters specific to creating a ChineseLexicon
   */
  public static final boolean DEFAULT_USE_GOOD_TURNING_UNKNOWN_WORD_MODEL = false;
  public boolean useGoodTuringUnknownWordModel = DEFAULT_USE_GOOD_TURNING_UNKNOWN_WORD_MODEL;
  public boolean useCharBasedUnknownWordModel = false;


  /**
   * Parameters for a ChineseCharacterBasedLexicon
   */
  public double lengthPenalty = 5.0;

  public boolean useUnknownCharacterModel = true;

  /**
   * penaltyType should be set as follows:
   * 0: no length penalty
   * 1: quadratic length penalty
   * 2: penalty for continuation chars only
   * TODO: make this an enum
   */
  public int penaltyType = 0;

  @Override
  public void display() {
    String chineseParams = "Using ChineseTreebankParserParams"
            + " chineseSplitDouHao=" + chineseSplitDouHao
            + " chineseSplitPunct=" + chineseSplitPunct
            + " chineseSplitPunctLR=" + chineseSplitPunctLR
            + " markVVsisterIP=" + markVVsisterIP
            + " markVPadjunct=" + markVPadjunct
            + " chineseSplitVP=" + chineseSplitVP
            + " mergeNNVV=" + mergeNNVV
            + " unaryIP=" + unaryIP
            + " unaryCP=" + unaryCP
            + " paRootDtr=" + paRootDtr
            + " markPsisterIP=" + markPsisterIP
            + " markIPsisterVVorP=" + markIPsisterVVorP
            + " markADgrandchildOfIP=" + markADgrandchildOfIP
            + " gpaAD=" + gpaAD
            + " markIPsisterBA=" + markIPsisterBA
            + " markNPmodNP=" + markNPmodNP
            + " markNPconj=" + markNPconj
            + " markMultiNtag=" + markMultiNtag
            + " markIPsisDEC=" + markIPsisDEC
            + " markIPconj=" + markIPconj
            + " markIPadjsubj=" + markIPadjsubj
            + " markPostverbalP=" + markPostverbalP
            + " markPostverbalPP=" + markPostverbalPP
            //      + " selSplitLevel=" + selectiveSplitLevel
            + " baseNP=" + splitBaseNP
            + " headFinder=" + (sunJurafskyHeadFinder ? "sunJurafsky" : (bikelHeadFinder ? "bikel" : "levy"))
            + " discardFrags=" + discardFrags
            + " dominatesV=" + dominatesV;
    printlnErr(chineseParams);
  }


  private List<String> listBasicCategories(List<String> l) {
    List<String> l1 = new ArrayList<>();
    for (String s : l) {
      l1.add(ctlp.basicCategory(s));
    }
    return l1;
  }

  // TODO: Rewrite this as general matching predicate
  private static boolean hasV(List<Label> tags) {
    for (Label tag : tags) {
      String str = tag.toString();
      if (str.startsWith("V")) {
        return true;
      }
    }
    return false;
  }

  /**
   * Set language-specific options according to flags.
   * This routine should process the option starting in args[i] (which
   * might potentially be several arguments long if it takes arguments).
   * It should return the index after the last index it consumed in
   * processing.  In particular, if it cannot process the current option,
   * the return value should be i.
   */
  @Override
  public int setOptionFlag(String[] args, int i) {
    // [CDM 2008: there are no generic options!] first, see if it's a generic option
    // int j = super.setOptionFlag(args, i);
    // if(i != j) return j;

    //lang. specific options
    // if (args[i].equalsIgnoreCase("-vSelSplitLevel") &&
    //            (i+1 < args.length)) {
    //   selectiveSplitLevel = Integer.parseInt(args[i+1]);
    //   i+=2;
    // } else
    if (args[i].equalsIgnoreCase("-paRootDtr")) {
      paRootDtr = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-unaryIP")) {
      unaryIP = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-unaryCP")) {
      unaryCP = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-markPostverbalP")) {
      markPostverbalP = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-markPostverbalPP")) {
      markPostverbalPP = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-baseNP")) {
      splitBaseNP = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-markVVsisterIP")) {
      markVVsisterIP = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-markPsisterIP")) {
      markPsisterIP = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-markIPsisterVVorP")) {
      markIPsisterVVorP = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-markIPsisterBA")) {
      markIPsisterBA = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-dominatesV")) {
      dominatesV = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-gpaAD")) {
      gpaAD = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-markVPadjunct")) {
      markVPadjunct = Boolean.valueOf(args[i+1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-markNPmodNP")) {
      markNPmodNP = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-markModifiedNP")) {
      markModifiedNP = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-nomarkModifiedNP")) {
      markModifiedNP = false;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-markNPconj")) {
      markNPconj = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-nomarkNPconj")) {
      markNPconj = false;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-chineseSplitPunct")) {
      chineseSplitPunct = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-chineseSplitPunctLR")) {
      chineseSplitPunct = true;
      chineseSplitPunctLR = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-chineseSelectiveTagPA")) {
      chineseSelectiveTagPA = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-chineseVerySelectiveTagPA")) {
      chineseVerySelectiveTagPA = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-markIPsisDEC")) {
      markIPsisDEC = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-chineseSplitVP")) {
      chineseSplitVP = Integer.parseInt(args[i+1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-tagWordSize")) {
      tagWordSize = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-vanilla")) {
      chineseSplitDouHao = false;
      chineseSplitPunct = false;
      chineseSplitPunctLR = false;
      markVVsisterIP = false;
      markPsisterIP = false;
      markIPsisterVVorP = false;
      markADgrandchildOfIP = false;
      gpaAD = false;
      markIPsisterBA = false;
      markVPadjunct = false;
      markNPmodNP = false;
      markModifiedNP = false;
      markNPconj = false;
      markMultiNtag = false;
      markIPsisDEC = false;
      markIPconj = false;
      markIPadjsubj = false;
      chineseSplitVP = 0;
      mergeNNVV = false;
      unaryIP = false;
      unaryCP = false;
      paRootDtr = false;
      markPostverbalP = false;
      markPostverbalPP = false;
      splitBaseNP = false;
      // selectiveSplitLevel = 0;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-acl03chinese")) {
      chineseSplitDouHao = false;
      chineseSplitPunct = true;
      chineseSplitPunctLR = true;
      markVVsisterIP = true;
      markPsisterIP = true;
      markIPsisterVVorP = true;
      markADgrandchildOfIP = false;
      gpaAD = true;
      markIPsisterBA = false;
      markVPadjunct = true;
      markNPmodNP = true;
      markModifiedNP = true;
      markNPconj = true;
      markMultiNtag = false;
      markIPsisDEC = true;
      markIPconj = false;
      markIPadjsubj = false;
      chineseSplitVP = 3;
      mergeNNVV = false;
      unaryIP = true;
      unaryCP = true;
      paRootDtr = true;
      markPostverbalP = false;
      markPostverbalPP = false;
      splitBaseNP = false;
      // selectiveSplitLevel = 0;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-chineseFactored")) {
      chineseSplitDouHao = false;
      chineseSplitPunct = true;
      chineseSplitPunctLR = true;
      markVVsisterIP = true;
      markPsisterIP = true;
      markIPsisterVVorP = true;
      markADgrandchildOfIP = false;
      gpaAD = true;
      markIPsisterBA = true;
      markVPadjunct = true;
      markNPmodNP = true;
      markModifiedNP = true;
      markNPconj = true;
      markMultiNtag = false;
      markIPsisDEC = true;
      markIPconj = false;
      markIPadjsubj = false;
      chineseSplitVP = 3;
      mergeNNVV = false;
      unaryIP = true;
      unaryCP = true;
      paRootDtr = true;
      markPostverbalP = false;
      markPostverbalPP = false;
      splitBaseNP = false;
      // selectiveSplitLevel = 0;
      chineseVerySelectiveTagPA = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-chinesePCFG")) {
      chineseSplitDouHao = false;
      chineseSplitPunct = true;
      chineseSplitPunctLR = true;
      markVVsisterIP = true;
      markPsisterIP = false;
      markIPsisterVVorP = true;
      markADgrandchildOfIP = false;
      gpaAD = false;
      markIPsisterBA = true;
      markVPadjunct = true;
      markNPmodNP = true;
      markModifiedNP = true;
      markNPconj = false;
      markMultiNtag = false;
      markIPsisDEC = false;
      markIPconj = false;
      markIPadjsubj = false;
      chineseSplitVP = 0;
      mergeNNVV = false;
      unaryIP = false;
      unaryCP = false;
      paRootDtr = false;
      markPostverbalP = false;
      markPostverbalPP = false;
      splitBaseNP = false;
      // selectiveSplitLevel = 0;
      chineseVerySelectiveTagPA = true;
      i += 1;
    } else if (args[i].equalsIgnoreCase("-sunHead")) {
      sunJurafskyHeadFinder = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-bikelHead")) {
      bikelHeadFinder = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-discardFrags")) {
      discardFrags = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-charLex")) {
      useCharacterBasedLexicon = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-charUnk")) {
      useCharBasedUnknownWordModel = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-rad")) {
      useUnknownCharacterModel = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-lengthPenalty") && (i + 1 < args.length)) {
      lengthPenalty = Double.parseDouble(args[i + 1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-penaltyType") && (i + 1 < args.length)) {
      penaltyType = Integer.parseInt(args[i + 1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-gtUnknown")) {
      useGoodTuringUnknownWordModel = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-maxentUnk")) {
      // useMaxentUnknownWordModel = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-tuneSigma")) {
      // ChineseMaxentLexicon.tuneSigma = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-trainCountThresh") && (i + 1 < args.length)) {
      // ChineseMaxentLexicon.trainCountThreshold = Integer.parseInt(args[i + 1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-markCC")) {
      markCC = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-segmentMarkov") || args[i].equalsIgnoreCase("-segmentWords")) {
      segment = true;
      segmentMarkov = true;
      segmenterClass = "edu.stanford.nlp.parser.lexparser.ChineseMarkovWordSegmenter";
      i++;
    } else if (args[i].equalsIgnoreCase("-segmentMaxMatch")) {
      segment = true;
      segmentMarkov = false;
      segmenterClass = "edu.stanford.nlp.parser.lexparser.MaxMatchSegmenter";
      i++;
    } else if (args[i].equalsIgnoreCase("-segmentDPMaxMatch")) {
      segment = true;
      segmentMarkov = false;
      segmenterClass = "edu.stanford.nlp.wordseg.MaxMatchSegmenter";
      i++;
    } else if (args[i].equalsIgnoreCase("-maxentLex")) {
      // useMaxentLexicon = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-fixUnkFunctionWords")) {
      // ChineseMaxentLexicon.fixUnkFunctionWords = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-similarWordSmoothing")) {
      useSimilarWordMap = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-maxentLexSeenTagsOnly")) {
      // useMaxentLexicon = true;
      // ChineseMaxentLexicon.seenTagsOnly = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-maxentLexFeatLevel") && (i + 1 < args.length)) {
      // ChineseMaxentLexicon.featureLevel = Integer.parseInt(args[i + 1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-maxentDepGrammarFeatLevel") && (i + 1 < args.length)) {
      depGramFeatureLevel = Integer.parseInt(args[i + 1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-maxentDepGrammar")) {
      // useMaxentDepGrammar = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-splitNPTMP")) {
      splitNPTMP = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-splitPPTMP")) {
      splitPPTMP = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-splitXPTMP")) {
      splitXPTMP = true;
      i++;
    } else if (args[i].equalsIgnoreCase("-segmenter")) {
      segment = true;
      segmentMarkov = false;
      segmenterClass = args[i + 1];
      i += 2;
    } else if (args[i].equalsIgnoreCase("-headFinder") && (i + 1 < args.length)) {
      try {
        headFinder = (HeadFinder) Class.forName(args[i + 1]).newInstance();
      } catch (Exception e) {
        log.info(e);
        log.info(this.getClass().getName() + ": Could not load head finder " + args[i + 1]);
        throw new RuntimeException(e);
      }
      i+=2;
    }

    return i;
  }
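
  /* A minimal sketch of the contract documented above (hypothetical caller,
   * not part of this class): each recognized flag advances the index past
   * itself and any arguments it consumed, and an unrecognized flag leaves
   * the index unchanged so the caller can handle it:
   *
   *   ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
   *   String[] flags = { "-chineseSplitVP", "2", "-unaryIP", "-unknown" };
   *   int i = 0;
   *   while (i < flags.length) {
   *     int j = ctpp.setOptionFlag(flags, i);
   *     if (j == i) {
   *       i++;  // not an option this class knows; skip or delegate
   *     } else {
   *       i = j;
   *     }
   *   }
   */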

  private int depGramFeatureLevel = 0;

  @Override
  public Extractor<DependencyGrammar> dependencyGrammarExtractor(final Options op, Index<String> wordIndex, Index<String> tagIndex) {
    /* ----------
    if (useMaxentDepGrammar) {
      return new Extractor() {
        public Object extract(Collection trees) {
          ChineseWordFeatureExtractor wfe = new ChineseWordFeatureExtractor(trees);
          ChineseWordFeatureExtractor wfe2 = new ChineseWordFeatureExtractor(trees);
          wfe.setFeatureLevel(2);
          wfe2.turnOffWordFeatures = true;
          wfe2.setFeatureLevel(depGramFeatureLevel);
          MaxentDependencyGrammar dg = new MaxentDependencyGrammar(op.tlpParams, wfe, wfe2, true, false, false);
          dg.train(trees);
          return dg;
        }

        public Object extract(Iterator iterator, Function f) {
          throw new UnsupportedOperationException();
        }
      };
    } else ------- */
    if (useSimilarWordMap) {
      return new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex) {
        @Override
        public MLEDependencyGrammar formResult() {
          wordIndex.addToIndex(Lexicon.UNKNOWN_WORD);
          ChineseSimWordAvgDepGrammar dg = new ChineseSimWordAvgDepGrammar(tlpParams, directional, useDistance, useCoarseDistance, op.trainOptions.basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex);
          if (lex == null) {
            throw new RuntimeException("Attempt to create ChineseSimWordAvgDepGrammar before Lexicon!!!");
          } else {
            dg.setLex(lex);
          }
          for (IntDependency dependency : dependencyCounter.keySet()) {
            dg.addRule(dependency, dependencyCounter.getCount(dependency));
          }
          return dg;
        }

     };
    } else {
      return new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
    }
  }

  /**
   * Return a default sentence for the language (for testing)
   */
  @Override
  public ArrayList<Word> defaultTestSentence() {
    return SentenceUtils.toUntaggedList("\u951f\u65a4\u62f7", "\u951f\u65a4\u62f7", "\u5b66\u6821", "\u951f\u65a4\u62f7", "\u5b66\u4e60", "\u951f\u65a4\u62f7");
  }


  private static final long serialVersionUID = 2;


  @Override
  public List<GrammaticalStructure> readGrammaticalStructureFromFile(String filename) {
    try {
      if (this.generateOriginalDependencies()) {
        return ChineseGrammaticalStructure.
            readCoNLLXGrammaticalStructureCollection(filename);
      } else {
        return UniversalChineseGrammaticalStructure.
            readCoNLLXGrammaticalStructureCollection(filename);
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  @Override
  public GrammaticalStructure getGrammaticalStructure(Tree t,
                                                      Predicate<TypedDependency> filter,
                                                      HeadFinder hf) {
    if (this.generateOriginalDependencies()) {
      return new ChineseGrammaticalStructure(t, filter, hf);
    } else {
      return new UniversalChineseGrammaticalStructure(t, filter, hf);
    }
  }

  @Override
  public boolean supportsBasicDependencies() {
    return true;
  }

  @Override
  public boolean generateOriginalDependencies() {
    return generateOriginalDependencies;
  }

  /**
   * For testing: loads a treebank and prints the trees.
   */
  public static void main(String[] args) {
    TreebankLangParserParams tlpp = new ChineseTreebankParserParams();
    System.out.println("Default encoding is: " +
                       tlpp.diskTreebank().encoding());

    if (args.length < 2) {
      printlnErr("Usage: edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams treesPath fileRange");
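      // For example (path and file range are illustrative only):
      //   java edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams \
      //     /path/to/ctb/bracketed 271-300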
    } else {
      Treebank m = tlpp.diskTreebank();
      m.loadPath(args[0], new NumberRangesFileFilter(args[1], false));

      for (Tree t : m ) {
        t.pennPrint(tlpp.pw());
      }
      System.out.println("There were " + m.size() + " trees.");
    }
  }

}