// edu.stanford.nlp.wordseg.ChineseSegmenterFeatureFactory (Maven / Gradle / Ivy artifact listing)
//
// Show more of this group. Show more artifacts with this name.
// Show all versions of stanford-corenlp. Show documentation.
// Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
// There is a newer version: 4.5.7
// Show newest version
package edu.stanford.nlp.wordseg;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.Serializable;


import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.FeatureFactory;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.sequences.Clique;
import edu.stanford.nlp.trees.international.pennchinese.RadicalMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PaddedList;

/**
 * A Chinese segmenter Feature Factory for GALE project. (modified from Sighan Bakeoff 2005.)
 * This is supposed to have all the good closed-track features from Sighan bakeoff 2005,
 * and some other "open-track" features
 *
 * This will also be used to do a character-based chunking!
 * 
 * c is Chinese character ("char").  c means current, n means next and p means previous.
 * 
 *
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * Feature Templates
Current position clique
useWord1 CONSTANT, cc, nc, pc, pc+cc, if (As|Msr|Pk|Hk) cc+nc, pc,nc 
 *
 * @author Huihsin Tseng
 * @author Pichuan Chang
 */

public class ChineseSegmenterFeatureFactory extends FeatureFactory implements Serializable {
  /**
   *
   */

  private static final long serialVersionUID = 3387166382968763350L;
  private static TagAffixDetector taDetector = null;

  private static Redwood.RedwoodChannels logger = Redwood.channels(ChineseSegmenterFeatureFactory.class);

  public void init(SeqClassifierFlags flags) {
    super.init(flags);
  }


  /**
   * Extracts all the features from the input data at a certain index.
   *
   * @param cInfo The complete data set as a List of WordInfo
   * @param loc  The index at which to extract features.
   */
  public Collection getCliqueFeatures(PaddedList cInfo, int loc, Clique clique) {
    Collection features = Generics.newHashSet();

    if (clique == cliqueC) {
      addAllInterningAndSuffixing(features, featuresC(cInfo, loc), "C");
    } else if (clique == cliqueCpC) {
      addAllInterningAndSuffixing(features, featuresCpC(cInfo, loc), "CpC");
      addAllInterningAndSuffixing(features, featuresCnC(cInfo, loc-1), "CnC");
    } 
    // else if (clique == cliqueCpCp2C) {
    //   addAllInterningAndSuffixing(features, featuresCpCp2C(cInfo, loc), "CpCp2C");
    // } else if (clique == cliqueCpCp2Cp3C) {
    //   addAllInterningAndSuffixing(features, featuresCpCp2Cp3C(cInfo, loc), "CpCp2Cp3C");
    // } else if (clique == cliqueCpCp2Cp3Cp4C) {
    //   addAllInterningAndSuffixing(features, featuresCpCp2Cp3Cp4C(cInfo, loc), "CpCp2Cp3Cp4C");
    // } else if (clique == cliqueCpCp2Cp3Cp4Cp5C) {
    //   addAllInterningAndSuffixing(features, featuresCpCp2Cp3Cp4Cp5C(cInfo, loc), "CpCp2Cp3Cp4Cp5C");
    // }

    return features;
  }



  private static Pattern patE = Pattern.compile("[a-z]");
  private static Pattern patEC = Pattern.compile("[A-Z]");
  private static String isEnglish(String Ep, String Ec) {
    String chp = Ep;
    String chc = Ec;
    Matcher mp = patE.matcher(chp);   // previous char is [a-z]
    Matcher mc = patE.matcher(chc);   //  current char is [a-z]
    Matcher mpC = patEC.matcher(chp); // previous char is [A-Z]
    Matcher mcC = patEC.matcher(chc); //  current char is [A-Z]
    if (mp.matches() && mcC.matches()){
      return "BND"; // [a-z][A-Z]
    } else if (mp.matches() && mc.matches()){
      return "ENG"; // [a-z][a-z]
    } else if (mpC.matches() && mcC.matches()){
      return "BCC"; // [A-Z][A-Z]
    } else if (mp.matches() && !mc.matches() && !mcC.matches()){
      return "e1";  // [a-z][^A-Za-z]
    } else if (mc.matches() && !mp.matches() && !mpC.matches()) {
      return "e2";  // [^A-Za-z][a-z]
    } else if (mpC.matches() && !mc.matches() && !mcC.matches()){
      return "e3";  // [A-Z][^A-Za-z]
    } else if (mcC.matches() && !mp.matches() && !mpC.matches()) {
      return "e4";  // [^A-Za-z][A-Z]
    } else {
      return "";
    }
  }//is English

  private static Pattern patP = Pattern.compile("[\u00b7\\-\\.]");
  private static String isEngPU(String Ep) {
    Matcher mp = patP.matcher(Ep);
    if (mp.matches()){
      return "1:EngPU";
    } else {
      return "";
    }
  }//is EnglishPU



  public Collection featuresC(PaddedList cInfo, int loc) {
    Collection features = new ArrayList<>();
    CoreLabel c = cInfo.get(loc);
    CoreLabel c1 = cInfo.get(loc + 1);
    CoreLabel c2 = cInfo.get(loc + 2);
    CoreLabel c3 = cInfo.get(loc + 3);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);
    String charc = c.get(CoreAnnotations.CharAnnotation.class);
    String charc1 = c1.get(CoreAnnotations.CharAnnotation.class);
    String charc2 = c2.get(CoreAnnotations.CharAnnotation.class);
    String charc3 = c3.get(CoreAnnotations.CharAnnotation.class);
    String charp = p.get(CoreAnnotations.CharAnnotation.class);
    String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);
    String charp3 = p3.get(CoreAnnotations.CharAnnotation.class);

    /**
     * N-gram features. N is upto 2.
     */
    if (flags.useWord1) {
      // features.add(charc +"c");
      // features.add(charc1+"c1");
      // features.add(charp +"p");
      // features.add(charp +charc  +"pc");
      // if(flags.useAs || flags.useMsr || flags.usePk || flags.useHk){ //msr, as
      //   features.add(charc +charc1 +"cc1");
      //   features.add(charp + charc1 +"pc1");
      // }

      features.add(charc +"::c");
      features.add(charc1+"::c1");
      features.add(charp +"::p");
      features.add(charp2 +"::p2");
      // trying to restore the features that Huishin described in SIGHAN 2005 paper
      features.add(charc +charc1  +"::cn");
      features.add(charp +charc  +"::pc");
      features.add(charp +charc1  +"::pn");
      features.add(charp2 +charp  +"::p2p");
      features.add(charp2 +charc  +"::p2c");
      features.add(charc2 +charc  +"::n2c");

      features.add("|word1");
    }

    return features;
  }

  private static CorpusDictionary outDict = null;

  public Collection featuresCpC(PaddedList cInfo, int loc) {
    Collection features = new ArrayList<>();
    CoreLabel c = cInfo.get(loc);
    CoreLabel c1 = cInfo.get(loc + 1);
    CoreLabel c2 = cInfo.get(loc + 2);
    CoreLabel c3 = cInfo.get(loc + 3);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);
    String charc = c.get(CoreAnnotations.CharAnnotation.class);
    if (charc == null) charc = "";
    String charc1 = c1.get(CoreAnnotations.CharAnnotation.class);
    if (charc1 == null) charc1 = "";
    String charc2 = c2.get(CoreAnnotations.CharAnnotation.class);
    if (charc2 == null) charc2 = "";
    String charc3 = c3.get(CoreAnnotations.CharAnnotation.class);
    if (charc3 == null) charc3 = "";
    String charp = p.get(CoreAnnotations.CharAnnotation.class);
    if (charp == null) charp = "";
    String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);
    if (charp2 == null) charp2 = "";
    String charp3 = p3.get(CoreAnnotations.CharAnnotation.class);
    if (charp3 == null) charp3 = "";


    /*
     * N-gram features. N is upto 2.
     */

    if (flags.useWord2) {
      // features.add(charc +"c");
      // features.add(charc1+"c1");
      // features.add(charp +"p");
      // features.add(charp +charc  +"pc");
      // if( flags.useMsr ){
      //   features.add(charc +charc1 +"cc1");
      //   features.add(charp + charc1 +"pc1");
      // }

      features.add(charc +"::c");
      features.add(charc1+"::c1");
      features.add(charp +"::p");
      features.add(charp2 +"::p2");
      // trying to restore the features that Huishin described in SIGHAN 2005 paper
      features.add(charc +charc1  +"::cn");
      features.add(charp +charc  +"::pc");
      features.add(charp +charc1  +"::pn");
      features.add(charp2 +charp  +"::p2p");
      features.add(charp2 +charc  +"::p2c");
      features.add(charc2 +charc  +"::n2c");

      features.add("|word2");
    }

    /*
      Radical N-gram features. N is upto 4.
      Smoothing method of N-gram, because there are too many characters in Chinese.
      (It works better than N-gram when they are used individually. less sparse)
    */

    char rcharc, rcharc1,rcharc2, rcharc3, rcharp, rcharp1,rcharp2,rcharp3;
    if (charc.length()==0) { rcharc='n'; } else { rcharc=RadicalMap.getRadical(charc.charAt(0));}
    if (charc1.length()==0) { rcharc1='n'; } else { rcharc1=RadicalMap.getRadical(charc1.charAt(0));}
    if (charc2.length()==0) { rcharc2='n'; } else { rcharc2=RadicalMap.getRadical(charc2.charAt(0));}
    if (charc3.length()==0) { rcharc3='n'; } else { rcharc3=RadicalMap.getRadical(charc3.charAt(0));}
    if (charp.length()==0) { rcharp='n'; } else { rcharp=RadicalMap.getRadical(charp.charAt(0));}
    if (charp2.length()==0) { rcharp2='n'; } else { rcharp2=RadicalMap.getRadical(charp2.charAt(0));}
    if (charp3.length()==0) { rcharp3='n'; } else { rcharp3=RadicalMap.getRadical(charp3.charAt(0));}

    if(flags.useRad2){
      features.add(rcharc+"rc");
      features.add(rcharc1+"rc1");
      features.add(rcharp+"rp");
      features.add(rcharp  +  rcharc+"rpc");
      features.add(rcharc +rcharc1 +"rcc1");
      features.add(rcharp +  rcharc  +rcharc1 +"rpcc1");
      features.add("|rad2");
    }

    /* non-word dictionary:SEEM bi-gram marked as non-word */
    if (flags.useDict2) {
      NonDict2 nd = new NonDict2(flags);
      features.add(nd.checkDic(charp+charc, flags)+"nondict");
      features.add("|useDict2");
    }

    if (flags.useOutDict2){
      if (outDict == null) {
        logger.info("reading "+flags.outDict2+" as a seen lexicon");
        outDict = new CorpusDictionary(flags.outDict2, true);
      }
      features.add(outDict.getW(charp+charc)+"outdict");       // -1 0
      features.add(outDict.getW(charc+charc1)+"outdict");      // 0 1
      features.add(outDict.getW(charp2+charp)+"outdict");      // -2 -1
      features.add(outDict.getW(charp2+charp+charc)+"outdict");      // -2 -1 0
      features.add(outDict.getW(charp3+charp2+charp)+"outdict");      // -3 -2 -1
      features.add(outDict.getW(charp+charc+charc1)+"outdict");      // -1 0 1
      features.add(outDict.getW(charc+charc1+charc2)+"outdict");      // 0 1 2
      features.add(outDict.getW(charp+charc+charc1+charc2)+"outdict");      // -1 0 1 2
    }

    /*
      (CTB/ASBC/HK/PK/MSR) POS information of each characters.
      If a character falls into some function categories,
      it is very likely there is a boundary.
      A lot of Chinese function words belong to single characters.
      This feature is also good for numbers and punctuations.
      DE* are grouped into DE.
    */
    if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2
        || flags.usePKChar2 || flags.useMSRChar2) {
      String[] tagsets;
      // the "useChPos" now only works for CTB and PK
      if (flags.useChPos) {
        if(flags.useCTBChar2) {
          tagsets = new String[]{"AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M",  "NN",  "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
        } else if (flags.usePKChar2) {
          //tagsets = new String[]{"r", "j", "t", "a", "nz", "l", "vn", "i", "m", "ns", "nr", "v", "n", "q", "Ng", "b", "d", "nt"};
          tagsets = new String[]{"2","3","4"};
        } else {
          throw new RuntimeException("only support settings for CTB and PK now.");
        }
      } else {
        //logger.info("Using Derived features");
        tagsets = new String[]{"2","3","4"};
      }

      if (taDetector == null) {
        taDetector = new TagAffixDetector(flags);
      }
      for (String tagset : tagsets) {
        features.add(taDetector.checkDic(tagset + "p", charp) + taDetector.checkDic(tagset + "i", charp) + taDetector.checkDic(tagset + "s", charc) + taDetector.checkInDic(charp) + taDetector.checkInDic(charc) + tagset + "prep-sufc");
        // features.add("|ctbchar2");  // Added a constant feature several times!!
      }
    }

    /*
      In error analysis, we found English words and numbers are often separated.
      Rule 1: isNumber feature: check if the current and previous char is a number.
      Rule 2: Disambiguation of time point and time duration.
      Rule 3: isEnglish feature: check if the current and previous character is an english letter.
      Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names.
      Most of PUs are a good indicator for word boundary, but - and .  is a strong indicator that there is no boundry within a previous , a follow char and it.
    */

    if (flags.useRule2) {
      /* Reduplication features */
      // previous character == current character
      if(charp.equals(charc)){ features.add("11");}
      // previous character == next character
      if(charp.equals(charc1)){ features.add("22");}

      // current character == next next character
      // fire only when usePk and useHk are both false.
      // Notice: this should be (almost) the same as the "22" feature, but we keep it for now.
      if( !flags.usePk && !flags.useHk) {
        if(charc.equals(charc2)){features.add("33");}
      }

      char cur1 = ' ';
      char cur2 = ' ';
      char cur =  ' ';
      char pre =  ' ';
      // actually their length must be either 0 or 1
      if (charc1.length() > 0) { cur1 = charc1.charAt(0); }
      if (charc2.length() > 0) { cur2 = charc2.charAt(0); }
      if (charc.length() > 0) { cur = charc.charAt(0); }
      if (charp.length() > 0) { pre = charp.charAt(0); }

      String prer= String.valueOf(rcharp); // the radical of previous character

      Pattern E = Pattern.compile("[a-zA-Z]");
      Pattern N = Pattern.compile("[0-9]");
      Matcher m = E.matcher(charp);
      Matcher ce = E.matcher(charc);
      Matcher pe = E.matcher(charp2);
      Matcher cn = N.matcher(charc);
      Matcher pn = N.matcher(charp2);


      // if current and previous characters are numbers...
      if (cur >= '0' && cur <= '9'&& pre >= '0' && pre <= '9'){
        if (cur == '9' && pre == '1' && cur1 == '9'&& cur2 >= '0' && cur2 <= '9'){ //199x
          features.add("YR");
        }else{
          features.add("2N");
        }

      // if current and previous characters are not both numbers
      // but previous char is a number
      // i.e. patterns like "1N" , "2A", etc
      } else if (pre >= '0' && pre <= '9'){
        features.add("1N");

      // if previous character is an English character
      } else if(m.matches()){
        features.add("E");

      // if the previous character contains no radical (and it exist)
      } else if(prer.equals(".") && charp.length() == 1){
        // fire only when usePk and useHk are both false. Not sure why. -pichuan
        if(!flags.useHk && !flags.usePk ){
          if(ce.matches()){
            features.add("PU+E");
          }
          if(pe.matches()){
            features.add("E+PU");
          }
          if(cn.matches()){
            features.add("PU+N");
          }
          if(pn.matches()){
            features.add("N+PU");
          }
        }
        features.add("PU");
      }

      String engType = isEnglish(charp, charc);
      String engPU = isEngPU(charp);
      if ( ! engType.equals(""))
        features.add(engType);
      if ( ! engPU.equals("") && ! engType.equals(""))
        features.add(engPU + engType);
    }//end of use rule


    // features using "Character.getType" information!
    String origS = c.get(CoreAnnotations.OriginalCharAnnotation.class);
    char origC = ' ';
    if (origS.length() > 0) { origC = origS.charAt(0); }
    int type = Character.getType(origC);
    switch (type) {
    case Character.UPPERCASE_LETTER: // A-Z and full-width A-Z
    case Character.LOWERCASE_LETTER: // a-z and full-width a-z
      features.add("CHARTYPE-LETTER");
      break;
    case Character.DECIMAL_DIGIT_NUMBER:
      features.add("CHARTYPE-DECIMAL_DIGIT_NUMBER");
      break;
    case Character.OTHER_LETTER: // mostly chinese chars
      features.add("CHARTYPE-OTHER_LETTER");
      break;
    default: // other types
      features.add("CHARTYPE-MISC");
    }

    return features;
  }


  public Collection featuresCnC(PaddedList cInfo, int loc) {
    Collection features = new ArrayList<>();
    CoreLabel c = cInfo.get(loc);
    CoreLabel c1 = cInfo.get(loc + 1);
    CoreLabel p = cInfo.get(loc - 1);
    String charc = c.get(CoreAnnotations.CharAnnotation.class);
    String charc1 = c1.get(CoreAnnotations.CharAnnotation.class);
    String charp = p.get(CoreAnnotations.CharAnnotation.class);


    if (flags.useWordn) {
      features.add(charc +"c");
      features.add(charc1+"c1");
      features.add(charp +"p");
      features.add(charp +charc  +"pc");

      if(flags.useAs || flags.useMsr||flags.usePk||flags.useHk){
        features.add(charc +charc1 +"cc1");
        features.add(charp + charc1 +"pc1");
      }
      features.add("|wordn");
    }
    return features;
  }//end of CnC


}//end of Class
// Feature templates (table rendered from the class Javadoc):
//   Current position clique
//   useWord1: CONSTANT, cc, nc, pc, pc+cc, if (As|Msr|Pk|Hk) cc+nc, pc,nc