org.apache.lucene.analysis.ko.Viterbi Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analysis-nori Show documentation
Apache Lucene (module: nori)
There is a newer version: 9.12.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ko;

import java.io.IOException;
import java.util.EnumMap;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
import org.apache.lucene.analysis.ko.dict.KoMorphData;
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.morph.ConnectionCosts;
import org.apache.lucene.analysis.morph.Dictionary;
import org.apache.lucene.analysis.morph.GraphvizFormatter;
import org.apache.lucene.analysis.morph.MorphData;
import org.apache.lucene.analysis.morph.TokenInfoFST;
import org.apache.lucene.analysis.morph.TokenType;
import org.apache.lucene.util.fst.FST;

/** {@link org.apache.lucene.analysis.morph.Viterbi} subclass for Korean morphological analysis. */
final class Viterbi
    extends org.apache.lucene.analysis.morph.Viterbi<
        Token, org.apache.lucene.analysis.morph.Viterbi.Position> {

  private final EnumMap> dictionaryMap =
      new EnumMap<>(TokenType.class);

  private final UnknownDictionary unkDictionary;
  private final CharacterDefinition characterDefinition;

  private final boolean discardPunctuation;
  private final KoreanTokenizer.DecompoundMode mode;
  private final boolean outputUnknownUnigrams;

  private GraphvizFormatter dotOut;

  Viterbi(
      TokenInfoFST fst,
      FST.BytesReader fstReader,
      TokenInfoDictionary dictionary,
      TokenInfoFST userFST,
      FST.BytesReader userFSTReader,
      UserDictionary userDictionary,
      ConnectionCosts costs,
      UnknownDictionary unkDictionary,
      CharacterDefinition characterDefinition,
      boolean discardPunctuation,
      KoreanTokenizer.DecompoundMode mode,
      boolean outputUnknownUnigrams) {
    super(
        fst, fstReader, dictionary, userFST, userFSTReader, userDictionary, costs, Position.class);
    this.unkDictionary = unkDictionary;
    this.characterDefinition = characterDefinition;
    this.discardPunctuation = discardPunctuation;
    this.mode = mode;
    this.outputUnknownUnigrams = outputUnknownUnigrams;
    this.enableSpacePenaltyFactor = true;
    this.outputLongestUserEntryOnly = true;
    dictionaryMap.put(TokenType.KNOWN, dictionary);
    dictionaryMap.put(TokenType.UNKNOWN, unkDictionary);
    dictionaryMap.put(TokenType.USER, userDictionary);
  }

  @Override
  protected int processUnknownWord(boolean anyMatches, Position posData) throws IOException {
    final char firstCharacter = (char) buffer.get(pos);
    if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {

      // Find unknown match:
      int characterId = characterDefinition.getCharacterClass(firstCharacter);
      // NOTE: copied from UnknownDictionary.lookup:
      int unknownWordLength;
      if (!characterDefinition.isGroup(firstCharacter)) {
        unknownWordLength = 1;
      } else {
        // Extract unknown word. Characters with the same script are considered to be part of
        // unknown word
        unknownWordLength = 1;
        Character.UnicodeScript scriptCode = Character.UnicodeScript.of(firstCharacter);
        final boolean isPunct = isPunctuation(firstCharacter);
        final boolean isDigit = Character.isDigit(firstCharacter);
        for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
          int next = buffer.get(posAhead);
          if (next == -1) {
            break;
          }
          char ch = (char) next;
          int chType = Character.getType(ch);
          Character.UnicodeScript sc = Character.UnicodeScript.of(next);
          boolean sameScript =
              isSameScript(scriptCode, sc)
                  // Non-spacing marks inherit the script of their base character,
                  // following recommendations from UTR #24.
                  || chType == Character.NON_SPACING_MARK;

          if (sameScript
              // split on punctuation
              && isPunctuation(ch, chType) == isPunct
              // split on digit
              && Character.isDigit(ch) == isDigit
              && characterDefinition.isGroup(ch)) {
            unknownWordLength++;
          } else {
            break;
          }
          // Update the script code and character class if the original script
          // is Inherited or Common.
          if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
            scriptCode = sc;
            characterId = characterDefinition.getCharacterClass(ch);
          }
        }
      }

      unkDictionary.lookupWordIds(
          characterId, wordIdRef); // characters in input text are supposed to be the same
      if (VERBOSE) {
        System.out.println(
            "    UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
      }
      for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
        add(
            unkDictionary.getMorphAttributes(),
            posData,
            pos,
            pos + unknownWordLength,
            wordIdRef.ints[wordIdRef.offset + ofs],
            TokenType.UNKNOWN,
            false);
      }
    }
    // TODO: should return meaningful value?
    return 0;
  }

  void setGraphvizFormatter(GraphvizFormatter dotOut) {
    this.dotOut = dotOut;
  }

  @Override
  protected void backtrace(Position endPosData, int fromIDX) {
    final int endPos = endPosData.getPos();

    if (endPos == lastBackTracePos) {
      return;
    }

    if (VERBOSE) {
      System.out.println(
          "\n  backtrace: endPos="
              + endPos
              + " pos="
              + pos
              + "; "
              + (pos - lastBackTracePos)
              + " characters; last="
              + lastBackTracePos
              + " cost="
              + endPosData.getCost(fromIDX));
    }

    final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

    if (dotOut != null) {
      dotOut.onBacktrace(
          this::getDict, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
    }

    int pos = endPos;
    int bestIDX = fromIDX;

    // TODO: sort of silly to make Token instances here; the
    // back trace has all info needed to generate the
    // token.  So, we could just directly set the attrs,
    // from the backtrace, in incrementToken w/o ever
    // creating Token; we'd have to defer calling freeBefore
    // until after the backtrace was fully "consumed" by
    // incrementToken.

    while (pos > lastBackTracePos) {
      // System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
      final Position posData = positions.get(pos);
      assert bestIDX < posData.getCount();

      int backPos = posData.getBackPos(bestIDX);
      int backWordPos = posData.getBackWordPos(bestIDX);
      assert backPos >= lastBackTracePos
          : "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
      // the length of the word without the whitespaces at the beginning.
      int length = pos - backWordPos;
      TokenType backType = posData.getBackType(bestIDX);
      int backID = posData.getBackID(bestIDX);
      int nextBestIDX = posData.getBackIndex(bestIDX);
      // the start of the word after the whitespace at the beginning.
      final int fragmentOffset = backWordPos - lastBackTracePos;
      assert fragmentOffset >= 0;

      final Dictionary dict = getDict(backType);

      if (outputUnknownUnigrams && backType == TokenType.UNKNOWN) {
        // outputUnknownUnigrams converts unknown word into unigrams:
        for (int i = length - 1; i >= 0; i--) {
          int charLen = 1;
          if (i > 0 && Character.isLowSurrogate(fragment[fragmentOffset + i])) {
            i--;
            charLen = 2;
          }
          final DictionaryToken token =
              new DictionaryToken(
                  TokenType.UNKNOWN,
                  unkDictionary.getMorphAttributes(),
                  CharacterDefinition.NGRAM,
                  fragment,
                  fragmentOffset + i,
                  charLen,
                  backWordPos + i,
                  backWordPos + i + charLen);
          pending.add(token);
          if (VERBOSE) {
            System.out.println("    add token=" + pending.get(pending.size() - 1));
          }
        }
      } else {
        final DictionaryToken token =
            new DictionaryToken(
                backType,
                dict.getMorphAttributes(),
                backID,
                fragment,
                fragmentOffset,
                length,
                backWordPos,
                backWordPos + length);
        if (token.getPOSType() == POS.Type.MORPHEME
            || mode == KoreanTokenizer.DecompoundMode.NONE) {
          if (shouldFilterToken(token) == false) {
            pending.add(token);
            if (VERBOSE) {
              System.out.println("    add token=" + pending.get(pending.size() - 1));
            }
          }
        } else {
          KoMorphData.Morpheme[] morphemes = token.getMorphemes();
          if (morphemes == null) {
            pending.add(token);
            if (VERBOSE) {
              System.out.println("    add token=" + pending.get(pending.size() - 1));
            }
          } else {
            int endOffset = backWordPos + length;
            int posLen = 0;
            // decompose the compound
            for (int i = morphemes.length - 1; i >= 0; i--) {
              final KoMorphData.Morpheme morpheme = morphemes[i];
              final Token compoundToken;
              if (token.getPOSType() == POS.Type.COMPOUND) {
                assert endOffset - morpheme.surfaceForm().length() >= 0;
                compoundToken =
                    new DecompoundToken(
                        morpheme.posTag(),
                        morpheme.surfaceForm(),
                        endOffset - morpheme.surfaceForm().length(),
                        endOffset,
                        backType);
              } else {
                compoundToken =
                    new DecompoundToken(
                        morpheme.posTag(),
                        morpheme.surfaceForm(),
                        token.getStartOffset(),
                        token.getEndOffset(),
                        backType);
              }
              if (i == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
                compoundToken.setPositionIncrement(0);
              }
              ++posLen;
              endOffset -= morpheme.surfaceForm().length();
              pending.add(compoundToken);
              if (VERBOSE) {
                System.out.println("    add token=" + pending.get(pending.size() - 1));
              }
            }
            if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
              token.setPositionLength(Math.max(1, posLen));
              pending.add(token);
              if (VERBOSE) {
                System.out.println("    add token=" + pending.get(pending.size() - 1));
              }
            }
          }
        }
      }
      if (discardPunctuation == false && backWordPos != backPos) {
        // Add a token for whitespaces between terms
        int offset = backPos - lastBackTracePos;
        int len = backWordPos - backPos;
        // System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " +
        // backPos);
        unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
        DictionaryToken spaceToken =
            new DictionaryToken(
                TokenType.UNKNOWN,
                unkDictionary.getMorphAttributes(),
                wordIdRef.ints[wordIdRef.offset],
                fragment,
                offset,
                len,
                backPos,
                backPos + len);
        pending.add(spaceToken);
      }

      pos = backPos;
      bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;

    if (VERBOSE) {
      System.out.println("  freeBefore pos=" + endPos);
    }
    // Notify the circular buffers that we are done with
    // these positions:
    buffer.freeBefore(endPos);
    positions.freeBefore(endPos);
  }

  /** Returns the space penalty associated with the provided {@link POS.Tag}. */
  @Override
  protected int computeSpacePenalty(MorphData morphData, int wordID, int numSpaces) {
    final POS.Tag leftPOS = ((KoMorphData) morphData).getLeftPOS(wordID);
    int spacePenalty = 0;
    if (numSpaces > 0) {
      // TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
      switch (leftPOS) {
        case EP:
        case EF:
        case EC:
        case ETN:
        case ETM:
        case JKS:
        case JKC:
        case JKG:
        case JKO:
        case JKB:
        case JKV:
        case JKQ:
        case JX:
        case JC:
        case VCP:
        case XSA:
        case XSN:
        case XSV:
          spacePenalty = 3000;
          break;
        case IC:
        case MAG:
        case MAJ:
        case MM:
        case NA:
        case NNB:
        case NNBC:
        case NNG:
        case NNP:
        case NP:
        case NR:
        case SC:
        case SE:
        case SF:
        case SH:
        case SL:
        case SN:
        case SP:
        case SSC:
        case SSO:
        case SY:
        case UNA:
        case UNKNOWN:
        case VA:
        case VCN:
        case VSV:
        case VV:
        case VX:
        case XPN:
        case XR:
        default:
          break;
      }
    }
    return spacePenalty;
  }

  Dictionary getDict(TokenType type) {
    return dictionaryMap.get(type);
  }

  private boolean shouldFilterToken(Token token) {
    return discardPunctuation && isPunctuation(token.getSurfaceForm()[token.getOffset()]);
  }

  private static boolean isPunctuation(char ch) {
    return isPunctuation(ch, Character.getType(ch));
  }

  private static boolean isPunctuation(char ch, int cid) {
    // special case for Hangul Letter Araea (interpunct)
    if (ch == 0x318D) {
      return true;
    }
    switch (cid) {
      case Character.SPACE_SEPARATOR:
      case Character.LINE_SEPARATOR:
      case Character.PARAGRAPH_SEPARATOR:
      case Character.CONTROL:
      case Character.FORMAT:
      case Character.DASH_PUNCTUATION:
      case Character.START_PUNCTUATION:
      case Character.END_PUNCTUATION:
      case Character.CONNECTOR_PUNCTUATION:
      case Character.OTHER_PUNCTUATION:
      case Character.MATH_SYMBOL:
      case Character.CURRENCY_SYMBOL:
      case Character.MODIFIER_SYMBOL:
      case Character.OTHER_SYMBOL:
      case Character.INITIAL_QUOTE_PUNCTUATION:
      case Character.FINAL_QUOTE_PUNCTUATION:
        return true;
      default:
        return false;
    }
  }

  private static boolean isCommonOrInherited(Character.UnicodeScript script) {
    return script == Character.UnicodeScript.INHERITED || script == Character.UnicodeScript.COMMON;
  }

  /** Determine if two scripts are compatible. */
  private static boolean isSameScript(
      Character.UnicodeScript scriptOne, Character.UnicodeScript scriptTwo) {
    return scriptOne == scriptTwo
        || isCommonOrInherited(scriptOne)
        || isCommonOrInherited(scriptTwo);
  }
}