/*
 * edu.stanford.nlp.process.ChineseDocumentToSentenceProcessor
 * Source listing from the stanford-corenlp artifact (Maven / Gradle / Ivy).
 *
 * Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
 *
 * Note: this listing is not from the newest release; a newer version (4.5.7) exists.
 */
package edu.stanford.nlp.process;

import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Serializable;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

import static edu.stanford.nlp.trees.international.pennchinese.ChineseUtils.WHITEPLUS;

/**
 * Convert a Chinese Document into a List of sentence Strings.
 *
 * @author Pi-Chuan Chang
 */
public class ChineseDocumentToSentenceProcessor implements Serializable  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ChineseDocumentToSentenceProcessor.class);

  // todo: This class is a mess. We should try to get it out of core

  private static final long serialVersionUID = 4054964767812217460L;

  private static final Set fullStopsSet = Generics.newHashSet(Arrays.asList(new Character[]{'。', '！', '？', '!', '?'}));
  // not \uff0e . (too often separates English first/last name, etc.)

  private static final Set rightMarkSet = Generics.newHashSet(Arrays.asList(new Character[]{'”', '’', '》', '』', '〉', '」', '＞', '＇', '）', '\'', '"', ')', ']', '>'}));

  // private final String normalizationTableFile;

  private static final String encoding = "UTF-8";
  private final List> normalizationTable;


  /**
   * Creates a processor without a normalization table file.
   * Delegates to the single-argument constructor with a {@code null} file name.
   */
  public ChineseDocumentToSentenceProcessor() {
    this((String) null);
  }

  /**
   * Finds two whitespace-separated, whitespace-free tokens in a line:
   * group 1 is the string to replace, group 2 is its replacement.
   */
  private static final Pattern PAIR_PATTERN = Pattern.compile("([^\\s]+)\\s+([^\\s]+)");

  /**
   * Creates a processor, optionally loading a normalization table.
   * Every line of the file in which {@code PAIR_PATTERN} finds a pair contributes
   * one table entry; lines without a pair are reported through the logger and skipped.
   *
   * @param normalizationTableFile A file listing character pairs for
   *     normalization.  Currently the normalization table must be in UTF-8.
   *     If this parameter is {@code null}, no table is loaded, which is the
   *     behavior of the zero-argument constructor.
   */
  public ChineseDocumentToSentenceProcessor(String normalizationTableFile) {
    List<Pair<String, String>> table = null;
    if (normalizationTableFile != null) {
      table = new ArrayList<>();
      File tableFile = new File(normalizationTableFile);
      for (String line : ObjectBank.getLineIterator(tableFile, encoding)) {
        Matcher m = PAIR_PATTERN.matcher(line);
        if ( ! m.find()) {
          log.info("Didn't match: "+line);
          continue;
        }
        table.add(new Pair<>(m.group(1), m.group(2)));
      }
    }
    normalizationTable = table;
  }
  /*
  public ChineseDocumentToSentenceProcessor(String normalizationTableFile, String encoding) {
    log.info("WARNING: ChineseDocumentToSentenceProcessor ignores normalizationTableFile argument!");
    log.info("WARNING: ChineseDocumentToSentenceProcessor ignores encoding argument!");
    // encoding is never read locally
    this.encoding = encoding;
  }
  */


  /**
   * Normalizes a string in two steps: first with {@link ChineseUtils#normalize(String)},
   * then with this processor's own normalization table (a no-op when no table was loaded).
   * <p>
   * This should now become disused, and other people should call
   * ChineseUtils directly!  CDM June 2006.
   *
   * @param in the text to normalize
   * @return the normalized text
   */
  public String normalization(String in) {
    return normalize(ChineseUtils.normalize(in));
  }

  /** Matches a run of whitespace, as defined by {@code ChineseUtils.WHITEPLUS}. */
  private static final Pattern WHITEPLUS_PATTERN = Pattern.compile(WHITEPLUS);
  /** Matches a run of whitespace at the start of the input. */
  private static final Pattern START_WHITEPLUS_PATTERN = Pattern.compile('^' + WHITEPLUS);
  /** Matches a run of whitespace at the end of the input. */
  private static final Pattern END_WHITEPLUS_PATTERN = Pattern.compile(WHITEPLUS + '$');

  /**
   * Applies the normalization table to a string.
   * Each whitespace run is first collapsed to a single space, then every table
   * entry is applied in table order, replacing all literal occurrences of the
   * entry's first string with its second string.
   *
   * @param inputString the text to normalize
   * @return the normalized text, or {@code inputString} unchanged when no table was loaded
   */
  private String normalize(String inputString) {
    if (normalizationTable == null) {
      return inputString;
    }

    String result = WHITEPLUS_PATTERN.matcher(inputString).replaceAll(" ");
    for (Pair<String, String> entry : normalizationTable) {
      // String.replace treats both arguments literally, so a replacement containing
      // '$' or '\' is inserted as written instead of being read as a group reference
      // or escape by Matcher.replaceAll.
      result = result.replace(entry.first(), entry.second());
    }
    return result;
  }


  /** usage: java ChineseDocumentToSentenceProcessor [-segmentIBM]
   *  -file filename [-encoding encoding]
   *  <p>
   *  The -segmentIBM option is for IBM GALE-specific splitting of an
   *  XML element into sentences. In that mode the input is read as
   *  whitespace-separated tokens, markup tokens are echoed to standard output,
   *  and the text between them is split with {@code fromPlainText(text, true)}.
   *  Two further properties are read in that mode: {@code alwaysAddS} and
   *  {@code parseInside} (a regex alternation of tag names; when non-empty, only
   *  text following a matching tag is split).
   *  Without -segmentIBM the input is treated as HTML and the resulting
   *  sentences are printed one per line to standard error.
   *
   *  @param args command-line arguments, parsed by {@code StringUtils.argsToProperties}
   *  @throws Exception if reading, splitting or writing fails
   */
  public static void main(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    boolean alwaysAddS = props.containsKey("alwaysAddS");
    if (! props.containsKey("file")) {
      log.info("usage: java ChineseDocumentToSentenceProcessor [-segmentIBM] -file filename [-encoding encoding]");
      return;
    }
    if (props.containsKey("encoding")) {
      log.info("WARNING: for now the default encoding is "+encoding+". It's not changeable for now");
    }
    String input = IOUtils.slurpFileNoExceptions(props.getProperty("file"), encoding);

    if (props.containsKey("segmentIBM")) {
      // eolIsSignificant == true, so line ends arrive as "\n" tokens
      Tokenizer<Word> tok = WhitespaceTokenizer.
        newWordWhitespaceTokenizer(new StringReader(input), true);
      String parseInside = props.getProperty("parseInside");
      if (parseInside == null) parseInside = "";

      PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);
      StringBuilder buff = new StringBuilder();      // pending text between markup
      StringBuilder sgmlbuff = new StringBuilder();  // pending pieces of the current tag
      String lastSgml = "";                          // most recently printed tag

      // a token that is a complete tag
      Pattern wholeTag = Pattern.compile("<.*>");
      // a token that opens a tag (optionally preceded by a U+FEFF byte-order mark)
      Pattern tagStart = Pattern.compile("\uFEFF?<[\\p{Alpha}]+");
      // a token that closes a tag opened by an earlier token
      Pattern tagEnd = Pattern.compile("[A-Za-z0-9=\"]+>");
      // a tag whose name is one of the parseInside alternatives
      Pattern parseInsideTag = Pattern.compile("<(?:" + parseInside + ")[ >]");

      boolean inSGML = false;
      int splitItems = 0;
      int numAdded = 0;
      while (tok.hasNext()) {
        String s = tok.next().word();
        if (tagStart.matcher(s).matches()) {
          inSGML = true;
          sgmlbuff.append(s).append(" ");
        } else if (wholeTag.matcher(s).matches()
                   || (inSGML && tagEnd.matcher(s).matches())
                   || "\n".equals(s)) {
          inSGML = false;
          if (buff.toString().trim().length() > 0) {
            boolean processIt = parseInside.equals("") || parseInsideTag.matcher(lastSgml).find();
            if (processIt) {
              List<String> sents = fromPlainText(buff.toString(), true);
              if (alwaysAddS || sents.size() > 1) {
                // NOTE(review): the empty prints around each sentence and the otherwise
                // unused counter i look like sentence markup that was lost from this
                // listing; confirm against the upstream source before relying on this output.
                int i = 1;
                for (String str : sents) {
                  pw.print("");
                  pw.print(str);
                  pw.println("");
                  i++;
                }
                if (sents.size() > 1) {
                  splitItems++;
                  numAdded += sents.size() - 1;
                }
              } else if (sents.size() == 1) {
                pw.print(sents.get(0));
              }
            } else {
              pw.print(buff);
            }
            buff = new StringBuilder();
          }
          sgmlbuff.append(s);
          pw.print(sgmlbuff);
          lastSgml = sgmlbuff.toString();
          sgmlbuff = new StringBuilder();
        } else if (inSGML) {
          sgmlbuff.append(s).append(" ");
        } else {
          buff.append(s).append(" ");
        }
      } // end while (tok.hasNext())
      // NOTE(review): anything still held in buff / sgmlbuff here (input that does not
      // end with a tag or newline token) is never printed; confirm whether that is intended.
      pw.flush();
      pw.close();
      log.info("Split " + splitItems + " segments, adding " +
                         numAdded + " sentences.");
    } else {
      List<String> sent = fromHTML(input);
      PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.err, encoding), true);

      for (String a : sent) {
        pw.println(a);
      }
    }
  }


  /**
   * Strip off HTML tags before processing.
   * Only the simplest tag stripping is implemented: the document is run through
   * {@code MyHTMLParser}, and every text chunk it yields is split into sentences
   * with {@link #fromPlainText(String)}.
   *
   * @param inputString Chinese document text which contains HTML tags
   * @return a List of sentence strings, in document order
   * @throws IOException if parsing or sentence splitting fails
   */
  public static List<String> fromHTML(String inputString) throws IOException {
    List<String> ans = new ArrayList<>();
    MyHTMLParser parser = new MyHTMLParser();
    List<String> chunks = parser.parse(inputString);
    for (String chunk : chunks) {
      ans.addAll(fromPlainText(chunk));
    }
    return ans;
  }


  /**
   * Splits unsegmented Chinese text into sentences.
   * Equivalent to {@code fromPlainText(contentString, false)}.
   *
   * @param contentString Chinese document text
   * @return a List of sentence strings
   * @throws IOException declared for callers; propagated from the two-argument overload
   */
  public static List<String> fromPlainText(String contentString) throws IOException {
    return fromPlainText(contentString, false);
  }

  /**
   * Splits Chinese text into sentences.
   * <p>
   * A sentence ends at a character from {@code fullStopsSet}. Any closing marks
   * from {@code rightMarkSet}, whitespace and further full stops that follow are
   * kept with that sentence; the first other character starts the next sentence.
   * Each sentence is passed through {@code removeWhitespace} and dropped if
   * nothing remains.
   *
   * @param contentString Chinese document text
   * @param segmented whether the text is already word-segmented. If so, a full stop
   *     only ends a sentence when it is the first character or is preceded by a
   *     space character (standalone punctuation -- cf. URLs).
   * @return a List of sentence strings
   * @throws IOException declared for callers
   */
  public static List<String> fromPlainText(String contentString, boolean segmented) throws IOException {
    // NOTE(review): the meaning of LEAVE / FULLWIDTH / ASCII is defined in ChineseUtils, not here.
    contentString = ChineseUtils.normalize(contentString,
                                           segmented ? ChineseUtils.LEAVE : ChineseUtils.FULLWIDTH,
                                           ChineseUtils.ASCII);

    List<String> sentenceList = new ArrayList<>();
    StringBuilder sentence = new StringBuilder();
    boolean sentenceEnd = false;
    int lastCh = -1;

    for (int idx = 0, len = contentString.length(); idx < len; idx++) {
      char c = contentString.charAt(idx);

      if ( ! sentenceEnd) {
        sentence.append(c);
        // in segmented text require a standalone punctuation mark -- cf. URLs
        if (fullStopsSet.contains(c)
            && ( ! segmented || lastCh == -1 || Character.isSpaceChar(lastCh))) {
          sentenceEnd = true;
        }
      } else if (rightMarkSet.contains(c) || isRegexWhitespace(c) || fullStopsSet.contains(c)) {
        // trailing closers, whitespace and repeated full stops stay with the ended sentence
        sentence.append(c);
      } else {
        // any other character closes the pending sentence and opens the next one
        addCleanedSentence(sentenceList, sentence, segmented);
        sentence.setLength(0);
        sentence.append(c);
        sentenceEnd = false;
      }
      lastCh = c;
    }

    addCleanedSentence(sentenceList, sentence, segmented);
    return sentenceList;
  }

  /** Appends the whitespace-cleaned content of {@code sentence} to {@code sentences} unless it is empty. */
  private static void addCleanedSentence(List<String> sentences, StringBuilder sentence, boolean segmented) {
    String cleaned = removeWhitespace(sentence.toString(), segmented);
    if ( ! cleaned.isEmpty()) {
      sentences.add(cleaned);
    }
  }

  /** Tests for the characters matched by the regex class {@code \s} in its default mode. */
  private static boolean isRegexWhitespace(char c) {
    return c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c == '\f' || c == '\r';
  }

  /**
   * Cleans whitespace from a sentence string, where whitespace is whatever the
   * {@code WHITEPLUS}-based patterns match.
   * Leading and trailing whitespace is always removed; when the text is not
   * segmented, all remaining whitespace inside the string is removed as well.
   *
   * @param str the sentence text
   * @param segmented whether interior whitespace must be preserved
   * @return the cleaned string; an empty input is returned as is
   */
  private static String removeWhitespace(String str, boolean segmented) {
    if (str.isEmpty()) {
      return str;
    }
    String withoutLeading = START_WHITEPLUS_PATTERN.matcher(str).replaceAll("");
    String trimmed = END_WHITEPLUS_PATTERN.matcher(withoutLeading).replaceAll("");
    return segmented
        ? trimmed
        : WHITEPLUS_PATTERN.matcher(trimmed).replaceAll("");
  }



  /**
   * Callback for the Swing HTML parser that collects the text of a document.
   * Each non-blank run of text reported by the parser becomes one entry of the
   * list returned by {@code parse}; text inside a TITLE element is additionally
   * remembered as the document title.
   */
  static class MyHTMLParser extends HTMLEditorKit.ParserCallback {

    /** Text received but not yet emitted; allocated by {@link #parse(String)}. */
    protected StringBuffer textBuffer;
    /** Text chunks collected so far; allocated by {@link #parse(String)}. */
    protected List<String> sentences;
    /** Text of the most recent chunk seen inside a TITLE element, initially empty. */
    protected String title;
    /** True between a TITLE start tag and its end tag. */
    protected boolean isTitle;
    /** True between a BODY start tag and its end tag. */
    protected boolean isBody;
    /** True between a SCRIPT start tag and its end tag. */
    protected boolean isScript;
    /** Whether the most recent start tag breaks flow; recorded but not consulted in this class. */
    protected boolean isBreak;

    public MyHTMLParser() {
      super();
      title = "";
      isTitle = false;
      isBody = false;
      isScript = false;
      isBreak = false;
    }


    /**
     * Receives a run of text from the parser.
     * The text is appended to the buffer; once the buffer, with no-break spaces
     * removed and trimmed, is non-empty, it is emitted as one chunk and the buffer
     * is reset. Text is collected regardless of the body/script flags.
     */
    @Override
    public void handleText(char[] data, int pos) {
      if (data.length == 0) return;

      if (isTitle) {
        title = new String(data);
      }

      textBuffer.append(data);
      String text = textBuffer.toString().replace("\u00a0", "").trim();
      if (text.isEmpty()) {
        // nothing printable yet: keep buffering
        return;
      }

      sentences.add(text);
      textBuffer = new StringBuffer(500);
    }

    /**
     * Sets the matching flag when a TITLE, BODY or SCRIPT element starts, and
     * records whether the tag breaks flow.
     */
    @Override
    public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrSet, int pos) {
      if (tag == HTML.Tag.TITLE) {
        isTitle = true;
      } else if (tag == HTML.Tag.BODY) {
        isBody = true;
      } else if (tag == HTML.Tag.SCRIPT) {
        isScript = true;
      }

      isBreak = tag.breaksFlow();
    }

    /**
     * Clears the matching flag when a TITLE, BODY or SCRIPT element ends.
     */
    @Override
    public void handleEndTag(HTML.Tag tag, int pos) {
      if (tag == HTML.Tag.TITLE) {
        isTitle = false;
      } else if (tag == HTML.Tag.BODY) {
        isBody = false;
      } else if (tag == HTML.Tag.SCRIPT) {
        isScript = false;
      }
    }

    /**
     * Reads the content at the URL and parses it.
     *
     * @throws IOException if reading or parsing fails
     */
    public List<String> parse(URL url) throws IOException {
      return parse(IOUtils.slurpURL(url));
    }

    /**
     * Reads all text from the reader and parses it.
     *
     * @throws IOException if reading or parsing fails
     */
    public List<String> parse(Reader r) throws IOException {
      return parse(IOUtils.slurpReader(r));
    }

    /**
     * The parse method that actually does the work.
     * It first gets rid of singleton tags by rewriting every "/&gt;" to "&gt;",
     * and rewrites every "&lt;?" to "&lt;", before running the HTML parser.
     * Each call starts with a fresh buffer and a fresh result list.
     *
     * @param text the HTML document
     * @return the collected text chunks, in document order
     * @throws IOException if parsing fails
     */
    public List<String> parse(String text) throws IOException {
      String cleaned = text.replace("/>", ">").replace("<?", "<");
      StringReader r = new StringReader(cleaned);
      textBuffer = new StringBuffer(200);
      sentences = new ArrayList<>();
      new ParserDelegator().parse(r, this, true);
      return sentences;
    }

    /** Returns the title text recorded so far (empty if none was seen). */
    public String title() {
      return title;
    }

  }


} // end class ChineseDocumentToSentenceProcessor