org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import org.apache.lucene.analysis.cn.smart.CharType;
import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType;

import java.util.List;

/**
 * Finds the optimal segmentation of a sentence into Chinese words
 *
 */
public class HHMMSegmenter {

//  private static WordDictionary wordDict = WordDictionary.getInstance();

	private WordDictionary wordDict;
	private BigramDictionary bigramDict;
	public HHMMSegmenter(WordDictionary wordDict, BigramDictionary bigramDict) {
		this.wordDict = wordDict;
		this.bigramDict = bigramDict;
	}
  /**
   * Create the {@link SegGraph} for a sentence.
   * 
   * @param sentence input sentence, without start and end markers
   * @return {@link SegGraph} corresponding to the input sentence.
   */
  @SuppressWarnings("fallthrough")
  private SegGraph createSegGraph(String sentence) {
    int i = 0, j;
    int length = sentence.length();
    int foundIndex;
    int[] charTypeArray = getCharTypes(sentence);
    StringBuilder wordBuf = new StringBuilder();
    SegToken token;
    int frequency = 0; // the number of times word appears.
    boolean hasFullWidth;
    int wordType;
    char[] charArray;

    SegGraph segGraph = new SegGraph();
    while (i < length) {
      hasFullWidth = false;
      switch (charTypeArray[i]) {
        case CharType.SPACE_LIKE:
          i++;
          break;
        case CharType.HANZI:
          j = i + 1;
          wordBuf.delete(0, wordBuf.length());
          // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, 
          // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will 
          // cause word division.
          wordBuf.append(sentence.charAt(i));
          charArray = new char[] { sentence.charAt(i) };
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
              frequency);
          segGraph.addToken(token);

          foundIndex = wordDict.getPrefixMatch(charArray);
          while (j <= length && foundIndex != -1) {
            if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
              // It is the phrase we are looking for; In other words, we have found a phrase SegToken
              // from i to j.  It is not a monosyllabic word (single word).
              frequency = wordDict.getFrequency(charArray);
              token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                  frequency);
              segGraph.addToken(token);
            }

            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
              j++;

            if (j < length && charTypeArray[j] == CharType.HANZI) {
              wordBuf.append(sentence.charAt(j));
              charArray = new char[wordBuf.length()];
              wordBuf.getChars(0, charArray.length, charArray, 0);
              // idArray has been found (foundWordIndex!=-1) as a prefix before.  
              // Therefore, idArray after it has been lengthened can only appear after foundWordIndex.  
              // So start searching after foundWordIndex.
              foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
              j++;
            } else {
              break;
            }
          }
          i++;
          break;
        case CharType.FULLWIDTH_LETTER:
          hasFullWidth = true; /* intentional fallthrough */
        case CharType.LETTER:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
              hasFullWidth = true;
            j++;
          }
          // Found a Token from i to j. Type is LETTER char string.
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case CharType.FULLWIDTH_DIGIT:
          hasFullWidth = true; /* intentional fallthrough */
        case CharType.DIGIT:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
              hasFullWidth = true;
            j++;
          }
          // Found a Token from i to j. Type is NUMBER char string.
          charArray = Utility.NUMBER_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case CharType.DELIMITER:
          j = i + 1;
          // No need to search the weight for the punctuation.  Picking the highest frequency will work.
          frequency = Utility.MAX_FREQUENCE;
          charArray = new char[] { sentence.charAt(i) };
          token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        default:
          j = i + 1;
          // Treat the unrecognized char symbol as unknown string.
          // For example, any symbol not in GB2312 is treated as one of these.
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.STRING, frequency);
          segGraph.addToken(token);
          i = j;
          break;
      }
    }

    // Add two more Tokens: "beginning xx beginning"
    charArray = Utility.START_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
    segGraph.addToken(token);

    // "end xx end"
    charArray = Utility.END_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
        frequency);
    segGraph.addToken(token);

    return segGraph;
  }

  /**
   * Get the character types for every character in a sentence.
   * 
   * @see Utility#getCharType(char)
   * @param sentence input sentence
   * @return array of character types corresponding to character positions in the sentence
   */
  private static int[] getCharTypes(String sentence) {
    int length = sentence.length();
    int[] charTypeArray = new int[length];
    // the type of each character by position
    for (int i = 0; i < length; i++) {
      charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
    }

    return charTypeArray;
  }

  /**
   * Return a list of {@link SegToken} representing the best segmentation of a sentence
   * @param sentence input sentence
   * @return best segmentation as a {@link List}
   */
  public List process(String sentence) {
    SegGraph segGraph = createSegGraph(sentence);
    BiSegGraph biSegGraph = new BiSegGraph(segGraph, bigramDict);
    List shortPath = biSegGraph.getShortPath();
    return shortPath;
  }
}