// opennlp.tools.tokenize.DictionaryDetokenizer Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of opennlp-tools Show documentation
// There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.tokenize;

import java.util.HashSet;
import java.util.Set;

/**
 * A rule based detokenizer. Simple rules which indicate in which direction a token should be
 * moved are looked up in a {@link DetokenizationDictionary} object.
 *
 * <p>Tokens for which the dictionary returns no operation are left untouched.
 *
 * @see Detokenizer
 * @see DetokenizationDictionary
 */
public class DictionaryDetokenizer implements Detokenizer {

  private final DetokenizationDictionary dict;

  /**
   * Creates a detokenizer backed by the given dictionary.
   *
   * @param dict the dictionary that is queried once per token for its operation
   */
  public DictionaryDetokenizer(DetokenizationDictionary dict) {
    this.dict = dict;
  }

  /**
   * Determines, for every token, the merge operation the dictionary prescribes.
   *
   * <p>A token mapped to {@code RIGHT_LEFT_MATCHING} alternates: its first occurrence is merged
   * to the right, the next occurrence of the same token string is merged to the left, and the
   * cycle then starts over.
   *
   * @param tokens the tokens to inspect
   * @return an array of the same length as {@code tokens}, holding one operation per token
   * @throws IllegalStateException if the dictionary returns an operation this class
   *     does not handle
   */
  public DetokenizationOperation[] detokenize(String[] tokens) {

    DetokenizationOperation[] operations = new DetokenizationOperation[tokens.length];

    // Tokens of the RIGHT_LEFT_MATCHING kind that are currently "open",
    // i.e. seen once and still waiting for their counterpart.
    Set<String> matchingTokens = new HashSet<>();

    for (int i = 0; i < tokens.length; i++) {
      DetokenizationDictionary.Operation dictOperation = dict.getOperation(tokens[i]);

      if (dictOperation == null) {
        // Token is not covered by the dictionary
        operations[i] = Detokenizer.DetokenizationOperation.NO_OPERATION;
      }
      else if (DetokenizationDictionary.Operation.MOVE_LEFT.equals(dictOperation)) {
        operations[i] = Detokenizer.DetokenizationOperation.MERGE_TO_LEFT;
      }
      else if (DetokenizationDictionary.Operation.MOVE_RIGHT.equals(dictOperation)) {
        operations[i] = Detokenizer.DetokenizationOperation.MERGE_TO_RIGHT;
      }
      else if (DetokenizationDictionary.Operation.MOVE_BOTH.equals(dictOperation)) {
        operations[i] = Detokenizer.DetokenizationOperation.MERGE_BOTH;
      }
      else if (DetokenizationDictionary.Operation.RIGHT_LEFT_MATCHING.equals(dictOperation)) {

        // remove() returns true only if the token was open: in that case this is the
        // closing occurrence, which moves left and clears the occurrence flag.
        if (matchingTokens.remove(tokens[i])) {
          operations[i] = Detokenizer.DetokenizationOperation.MERGE_TO_LEFT;
        }
        else {
          // First time this token is seen, move it to the right and remember it
          operations[i] = Detokenizer.DetokenizationOperation.MERGE_TO_RIGHT;
          matchingTokens.add(tokens[i]);
        }
      }
      else {
        throw new IllegalStateException("Unknown operation: " + dictOperation);
      }
    }

    return operations;
  }

  /**
   * Joins the tokens into a single string according to the operations computed by
   * {@link #detokenize(String[])}.
   *
   * <p>Two neighbouring tokens are separated by a single space unless they are merged; where
   * they are merged, {@code splitMarker} (if not {@code null}) is inserted between them instead.
   * Nothing is appended after the last token.
   *
   * @param tokens the tokens to join
   * @param splitMarker text inserted at every position where two tokens were merged,
   *     or {@code null} to insert nothing there
   * @return the detokenized string
   * @throws IllegalArgumentException if the number of computed operations differs from the
   *     number of tokens; the implementation in this class always returns a matching array,
   *     so this guards against an overriding {@code detokenize(String[])}
   */
  public String detokenize(String[] tokens, String splitMarker) {

    DetokenizationOperation[] operations = detokenize(tokens);

    if (tokens.length != operations.length) {
      throw new IllegalArgumentException("tokens and operations array must have same length: tokens=" +
          tokens.length + ", operations=" + operations.length + "!");
    }

    StringBuilder untokenizedString = new StringBuilder();

    for (int i = 0; i < tokens.length; i++) {

      untokenizedString.append(tokens[i]);

      // The last token is never followed by a space or a split marker
      if (i + 1 == operations.length) {
        continue;
      }

      if (isJoinedToNext(operations, i)) {
        if (splitMarker != null) {
          untokenizedString.append(splitMarker);
        }
      }
      else {
        untokenizedString.append(' ');
      }
    }

    return untokenizedString.toString();
  }

  /**
   * Tells whether the token at {@code index} is written directly adjacent to its successor.
   * Callers must ensure that {@code index + 1} is a valid position in {@code operations}.
   */
  private static boolean isJoinedToNext(DetokenizationOperation[] operations, int index) {
    // The next token pulls itself towards this one ...
    if (operations[index + 1].equals(DetokenizationOperation.MERGE_TO_LEFT)
        || operations[index + 1].equals(DetokenizationOperation.MERGE_BOTH)) {
      return true;
    }

    // ... or this token pushes itself towards the next one
    return operations[index].equals(DetokenizationOperation.MERGE_TO_RIGHT)
        || operations[index].equals(DetokenizationOperation.MERGE_BOTH);
  }
}