edu.stanford.nlp.trees.international.pennchinese.ChineseUtils Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.trees.international.pennchinese;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.util.regex.Pattern;

/** This class contains a few String constants and
 *  static methods for dealing with Chinese text.
 *
 *  <p>Normalization is implemented twice: a code-point based version that
 *  handles full Unicode (including characters outside the Basic Multilingual
 *  Plane), and a version that only handles BMP characters.  The code-point
 *  version seemed to tickle some bugs in Sun's JDK 1.5 but works correctly
 *  with JDK 1.6+, and it is the one enabled by default.  The BMP-only version
 *  can be selected by editing the {@code ONLY_BMP} constant; it logs a
 *  warning message if it sees a high-surrogate character.
 *
 *  @author Christopher Manning
 */
public class ChineseUtils  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseUtils.class);

  /** Whether to only support BMP character normalization.
   *  If set to true, this is more limited, but avoids bugs in JDK 1.5.
   */
  private static final boolean ONLY_BMP = false;

  // These are good Unicode whitespace regexes for any language!
  public static final String ONEWHITE = "[\\s\\p{Zs}]";
  public static final String WHITE = ONEWHITE + "*";
  public static final String WHITEPLUS = ONEWHITE + "+";

  // Chinese numbers 1-10
  public static final String NUMBERS = "[\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]";

  /** Precompiled form of {@link #NUMBERS}, so isNumber() does not recompile the regex on every call. */
  private static final Pattern NUMBERS_PATTERN = Pattern.compile(NUMBERS);

  // List of characters similar to \u00B7 listed in the Unicode 5.0 manual
  public static final String MID_DOT_REGEX_STR = "[\u00B7\u0387\u2022\u2024\u2027\u2219\u22C5\u30FB]";


  // These are the constants for the normalize method

  /** Leave the characters in question unchanged. */
  public static final int LEAVE = 0;
  /** Map the characters in question to their ASCII (narrow) form. */
  public static final int ASCII = 1;
  /** Map mid dot characters to U+00B7.  Deliberately the same value as ASCII. */
  public static final int NORMALIZE = 1; // Unicode normalization moves to low
  /** Map the characters in question to their full width form. */
  public static final int FULLWIDTH = 2;
  /** Delete the characters in question (space characters and mid dots only). */
  public static final int DELETE = 3;
  /** Delete space characters unless both neighbors are ASCII-range characters (space characters only). */
  public static final int DELETE_EXCEPT_BETWEEN_ASCII = 4;
  /** The largest option value accepted by the up-front argument check in normalize. */
  public static final int MAX_LEGAL = 4;

  // private int[] puaChars = { 0xE005 };
  // private int[] uniChars = { 0x42B5 };


  // not instantiable
  private ChineseUtils() {}

  /** Says whether a char is a number character.
   *
   *  @param c The char to test
   *  @return true if c is one of the ten Chinese numerals in {@link #NUMBERS}
   *      or satisfies Character.isDigit()
   */
  public static boolean isNumber(char c) {
    return Character.isDigit(c) || NUMBERS_PATTERN.matcher(String.valueOf(c)).matches();
  }

  /** Normalizes a String with the default options: ASCII-range characters
   *  are mapped to full width, space characters are mapped to U+0020, and
   *  mid dot characters are left alone.
   *
   *  @param in The String to be normalized
   *  @return The normalized String
   */
  public static String normalize(String in) {
    return normalize(in, FULLWIDTH, ASCII);
  }

  /** Normalizes a String as for the four argument version, leaving
   *  mid dot characters alone.
   *
   *  @param in The String to be normalized
   *  @param ascii How to treat characters conceptually in the ASCII range
   *  @param spaceChar How to treat space characters
   *  @return The normalized String
   */
  public static String normalize(String in, int ascii, int spaceChar) {
    return normalize(in, ascii, spaceChar, LEAVE);
  }

  /** This will normalize a Unicode String in various ways.  This routine
   *  correctly handles characters outside the basic multilingual plane.
   *
   *  @param in The String to be normalized
   *  @param ascii For characters conceptually in the ASCII range of
   *      ! through ~ (U+0021 through U+007E or U+FF01 through U+FF5E),
   *      if this is ChineseUtils.LEAVE, then do nothing,
   *      if it is ASCII then map them from the Chinese Full Width range
   *      to ASCII values, and if it is FULLWIDTH then do the reverse.
   *  @param spaceChar For characters that satisfy Character.isSpaceChar(),
   *      if this is ChineseUtils.LEAVE, then do nothing,
   *      if it is ASCII then map them to the space character U+0020,
   *      if it is FULLWIDTH then map them to U+3000,
   *      if it is DELETE then remove them, and
   *      if it is DELETE_EXCEPT_BETWEEN_ASCII then remove them unless both
   *      the preceding and the following character are in the (narrow or
   *      full width) ASCII range described above.
   *  @param midDot For a set of 8 characters that are roughly middle dot characters,
   *      if this is ChineseUtils.LEAVE, then do nothing,
   *      if it is NORMALIZE then map them to the extended Latin character U+00B7,
   *      if it is FULLWIDTH then map them to U+30FB, and
   *      if it is DELETE then remove them.
   *  @return The in String normalized according to the other arguments.
   *  @throws IllegalArgumentException If ascii or spaceChar is outside the
   *      range 0 to MAX_LEGAL.  For a non-empty input it is also thrown if
   *      ascii is not one of LEAVE, ASCII, FULLWIDTH or if midDot is not one
   *      of LEAVE, NORMALIZE, FULLWIDTH, DELETE (these are only detected
   *      while processing characters).
   */
  public static String normalize(String in,
                                 int ascii,
                                 int spaceChar,
                                 int midDot) {
    if (ascii < 0 || ascii > MAX_LEGAL ||
        spaceChar < 0 || spaceChar > MAX_LEGAL) {
      throw new IllegalArgumentException("ChineseUtils: Unknown parameter option");
    }
    if (ONLY_BMP) {
      return normalizeBMP(in, ascii, spaceChar, midDot);
    } else {
      return normalizeUnicode(in, ascii, spaceChar, midDot);
    }
  }


  /** Char-by-char version of normalize.  Surrogate chars are passed
   *  through individually (after logging a warning for each high surrogate),
   *  so only BMP characters are actually normalized.
   */
  private static String normalizeBMP(String in, int ascii, int spaceChar, int midDot) {
    StringBuilder out = new StringBuilder();
    int len = in.length();
    for (int i = 0; i < len; i++) {
      char cp = in.charAt(i);
      if (Character.isHighSurrogate(cp)) {
        if (i + 1 < len) {
          log.warn("ChineseUtils.normalize warning: non-BMP codepoint U+" +
                  Integer.toHexString(Character.codePointAt(in, i)) + " in " + in);
        } else {
          log.warn("ChineseUtils.normalize warning: unmatched high surrogate character U+" +
                  Integer.toHexString(Character.codePointAt(in, i)) + " in " + in);
        }
      }
      Character.UnicodeBlock cub = Character.UnicodeBlock.of(cp);
      if (cub == Character.UnicodeBlock.PRIVATE_USE_AREA ||
              cub == Character.UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A ||
              cub == Character.UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B) {
        EncodingPrintWriter.err.println("ChineseUtils.normalize warning: private use area codepoint U+" + Integer.toHexString(cp) + " in " + in);
      }
      boolean delete = false;
      switch (ascii) {
        case LEAVE:
          break;
        case ASCII:
          if (cp >= '\uFF01' && cp <= '\uFF5E') {
            cp -= (0xFF00 - 0x0020);
          }
          break;
        case FULLWIDTH:
          if (cp >= '\u0021' && cp <= '\u007E') {
            cp += (0xFF00 - 0x0020);
          }
          break;
        default:
          throw new IllegalArgumentException("ChineseUtils: Unsupported parameter option: ascii=" + ascii);
      }
      switch (spaceChar) {
        case LEAVE:
          break;
        case ASCII:
          if (Character.isSpaceChar(cp)) {
            cp = ' ';
          }
          break;
        case FULLWIDTH:
          if (Character.isSpaceChar(cp)) {
            cp = '\u3000';
          }
          break;
        case DELETE:
          if (Character.isSpaceChar(cp)) {
            delete = true;
          }
          break;
        case DELETE_EXCEPT_BETWEEN_ASCII: {
          // Neighbors are taken from the raw input; 0 stands for "no neighbor" at either end.
          char cpp = 0;
          if (i > 0) { cpp = in.charAt(i - 1); }
          char cpn = 0;
          if (i < (len - 1)) { cpn = in.charAt(i + 1); }
          if (Character.isSpaceChar(cp) && ! (isAsciiLowHigh(cpp) && isAsciiLowHigh(cpn))) {
            delete = true;
          }
          break;
        }
        default:
          // Not reachable via normalize(), which range-checks spaceChar; kept for symmetry with the other switches.
          throw new IllegalArgumentException("ChineseUtils: Unsupported parameter option: spaceChar=" + spaceChar);
      }
      switch (midDot) {
        case LEAVE:
          break;
        case NORMALIZE:
          if (isMidDot(cp)) {
            cp = '\u00B7';
          }
          break;
        case FULLWIDTH:
          if (isMidDot(cp)) {
            cp = '\u30FB';
          }
          break;
        case DELETE:
          if (isMidDot(cp)) {
            delete = true;
          }
          break;
        default:
          throw new IllegalArgumentException("ChineseUtils: Unsupported parameter option: midDot=" + midDot);
      }
      if ( ! delete) {
        out.append(cp);
      }
    } // end for
    return out.toString();
  }

  /** Code point version of normalize.  Does it properly with codepoints,
   *  for non-BMP Unicode as well.
   */
  private static String normalizeUnicode(String in, int ascii, int spaceChar, int midDot) {
    StringBuilder out = new StringBuilder();
    int len = in.length();
    int cpp = 0; // previous codepoint (as normalized); 0 stands for "no neighbor"
    for (int offset = 0; offset < len; ) {
      final int original = in.codePointAt(offset);
      // Advance by the width of the code point actually read, not of its
      // (possibly rewritten) replacement, so the loop can never desynchronize.
      final int nextOffset = offset + Character.charCount(original);
      int cp = original;
      Character.UnicodeBlock cub = Character.UnicodeBlock.of(cp);
      if (cub == Character.UnicodeBlock.PRIVATE_USE_AREA ||
              cub == Character.UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A ||
              cub == Character.UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B) {
        EncodingPrintWriter.err.println("ChineseUtils.normalize warning: private use area codepoint U+" + Integer.toHexString(cp) + " in " + in);
      }
      boolean delete = false;
      switch (ascii) {
        case LEAVE:
          break;
        case ASCII:
          if (cp >= '\uFF01' && cp <= '\uFF5E') {
            cp -= (0xFF00 - 0x0020);
          }
          break;
        case FULLWIDTH:
          if (cp >= '\u0021' && cp <= '\u007E') {
            cp += (0xFF00 - 0x0020);
          }
          break;
        default:
          throw new IllegalArgumentException("ChineseUtils: Unsupported parameter option: ascii=" + ascii);
      }
      switch (spaceChar) {
        case LEAVE:
          break;
        case ASCII:
          if (Character.isSpaceChar(cp)) {
            cp = ' ';
          }
          break;
        case FULLWIDTH:
          if (Character.isSpaceChar(cp)) {
            cp = '\u3000';
          }
          break;
        case DELETE:
          if (Character.isSpaceChar(cp)) {
            delete = true;
          }
          break;
        case DELETE_EXCEPT_BETWEEN_ASCII: {
          int cpn = 0; // next codepoint; 0 stands for "no neighbor"
          if (nextOffset < len) {
            cpn = in.codePointAt(nextOffset);
          }
          if (Character.isSpaceChar(cp) && ! (isAsciiLowHigh(cpp) && isAsciiLowHigh(cpn))) {
            delete = true;
          }
          break;
        }
        default:
          // Not reachable via normalize(), which range-checks spaceChar; kept for symmetry with the other switches.
          throw new IllegalArgumentException("ChineseUtils: Unsupported parameter option: spaceChar=" + spaceChar);
      }
      switch (midDot) {
        case LEAVE:
          break;
        case NORMALIZE:
          if (isMidDot(cp)) {
            cp = '\u00B7';
          }
          break;
        case FULLWIDTH:
          if (isMidDot(cp)) {
            cp = '\u30FB';
          }
          break;
        case DELETE:
          if (isMidDot(cp)) {
            delete = true;
          }
          break;
        default:
          throw new IllegalArgumentException("ChineseUtils: Unsupported parameter option: midDot=" + midDot);
      }
      if ( ! delete) {
        out.appendCodePoint(cp);
      }
      cpp = cp;
      offset = nextOffset;
    } // end for
    return out.toString();
  }

  /** Says whether the codepoint is one of the 8 mid dot like characters of {@link #MID_DOT_REGEX_STR}. */
  private static boolean isMidDot(int cp) {
    return cp == '\u00B7' || cp == '\u0387' || cp == '\u2022' ||
            cp == '\u2024' || cp == '\u2027' || cp == '\u2219' ||
            cp == '\u22C5' || cp == '\u30FB';
  }

  /** Says whether the codepoint is in the ! through ~ range, either narrow
   *  (U+0021 through U+007E) or full width (U+FF01 through U+FF5E).
   */
  private static boolean isAsciiLowHigh(int cp) {
    return cp >= '\uFF01' && cp <= '\uFF5E' ||
        cp >= '\u0021' && cp <= '\u007E';
  }

  /** Mainly for testing.  Usage:
   *  {@code ChineseUtils ascii spaceChar midDot word*}
   *
   *  ascii, spaceChar and midDot are integer flags which are passed straight
   *  to the four argument normalize method (for example, 0 = leave,
   *  1 = ascii/normalize, 2 = fullwidth).
   *  The words listed are then normalized and sent to stdout.
   *  If no words are given, the program reads from and normalizes stdin.
   *  Input is assumed to be in UTF-8.
   *  If fewer than three arguments are given, a usage message is logged
   *  and nothing else is done.
   *
   *  @param args Command line arguments as above
   *  @throws IOException If any problems reading from stdin
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 3) {
      log.info("usage: ChineseUtils ascii space midDot word*");
      log.info("  First 3 args are int flags; a filter or maps args as words; assumes UTF-8");
      return;
    }
    int i = Integer.parseInt(args[0]);
    int j = Integer.parseInt(args[1]);
    int midDot = Integer.parseInt(args[2]);
    if (args.length > 3) {
      for (int k = 3; k < args.length; k++) {
        EncodingPrintWriter.out.println(normalize(args[k], i, j, midDot));
      }
    } else {
      BufferedReader r = IOUtils.readerFromStdin("UTF-8");
      for (String line; (line = r.readLine()) != null; ) {
        EncodingPrintWriter.out.println(normalize(line, i, j, midDot));
      }
    }
  }


  // year, month, day chars.  Sometime try adding \u53f7 and see if it helps...
  private static final Pattern dateChars = Pattern.compile("[\u5E74\u6708\u65E5]+");
  // year, month, day chars.  Adding \u53F7 and seeing if it helps...
  private static final Pattern dateCharsPlus = Pattern.compile("[\u5E74\u6708\u65E5\u53f7]+");

  // number chars (Chinese and Western).
  // You get U+25CB circle masquerading as zero in mt data - or even in Sighan 2003 ctb
  // add U+25EF for good measure (larger geometric circle)
  // private static final Pattern numberChars = Pattern.compile("[0-9０-９" +
  //      "一二三四五六七八九十" +
  //      "零〇百千万亿兩○◯〡-〩〸-〺]");
  private static final Pattern numberChars = Pattern.compile("[0-9\uff10-\uff19" +
        "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4E5D\u5341" +
        "\u96F6\u3007\u767E\u5343\u4E07\u4ebf\u5169\u25cb\u25ef\u3021-\u3029\u3038-\u303A]+");
  // A-Za-z, narrow and full width
  private static final Pattern letterChars = Pattern.compile("[A-Za-z\uFF21-\uFF3A\uFF41-\uFF5A]+");
  private static final Pattern periodChars = Pattern.compile("[\ufe52\u2027\uff0e.\u70B9]+");

  // two punctuation classes for Low and Ng style features.
  private static final Pattern separatingPuncChars = Pattern.compile("[]!\"(),;:<=>?\\[\\\\`{|}~^\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030" +
        "\uff3d\uff01\uff02\uff08\uff09\uff0c\uff1b\uff1a\uff1c\uff1d\uff1e\uff1f" +
        "\uff3b\uff3c\uff40\uff5b\uff5c\uff5d\uff5e\uff3e]+");
  private static final Pattern ambiguousPuncChars = Pattern.compile("[-#$%&'*+/@_\uff0d\uff03\uff04\uff05\uff06\uff07\uff0a\uff0b\uff0f\uff20\uff3f]+");
  private static final Pattern midDotPattern = Pattern.compile(ChineseUtils.MID_DOT_REGEX_STR + "+");


  /** Returns a one letter code for the character class ("shape") of the input.
   *  The checks are made in this order, and the first one that applies wins:
   *  "D" if augmentedDateChars is true and the input consists only of the
   *  augmented date characters; "o" if the input starts with the ordinal
   *  prefix character; "D" if it consists only of date characters; "N" only
   *  number characters; "L" only (narrow or full width) Latin letters;
   *  "P" only period characters; "S" only separating punctuation;
   *  "A" only ambiguous punctuation; "M" if useMidDotShape is true and it
   *  consists only of mid dot characters; and otherwise "C".
   *  An empty input matches none of the classes and so gets "C".
   *
   *  @param input The character sequence to classify
   *  @param augmentedDateChars Whether to also use the date character class
   *      that additionally includes U+53F7
   *  @param useMidDotShape Whether to return a separate "M" shape for mid dots
   *  @return The shape code, one of "D", "o", "N", "L", "P", "S", "A", "M", "C"
   */
  public static String shapeOf(CharSequence input,
                               boolean augmentedDateChars,
                               boolean useMidDotShape) {
    if (augmentedDateChars && dateCharsPlus.matcher(input).matches()) {
      return "D";
    }
    // Guard the length: charAt(0) on an empty sequence would throw.
    if (input.length() > 0 && input.charAt(0) == '第') {
      return "o"; // detect those Chinese ordinals!
    }
    if (dateChars.matcher(input).matches()) {
      return "D";
    }
    if (numberChars.matcher(input).matches()) {
      return "N";
    }
    if (letterChars.matcher(input).matches()) {
      return "L";
    }
    if (periodChars.matcher(input).matches()) {
      return "P";
    }
    if (separatingPuncChars.matcher(input).matches()) {
      return "S";
    }
    if (ambiguousPuncChars.matcher(input).matches()) {
      return "A";
    }
    if (useMidDotShape && midDotPattern.matcher(input).matches()) {
      return "M";
    }
    return "C";
  }

}