opennlp.tools.tokenize.TokenSample Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package opennlp.tools.tokenize;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import opennlp.tools.tokenize.Detokenizer.DetokenizationOperation;
import opennlp.tools.util.Span;

/**
 * A {@link TokenSample} is text with token spans.
 */
public class TokenSample {

  public static final String DEFAULT_SEPARATOR_CHARS = "";

  private final String separatorChars = DEFAULT_SEPARATOR_CHARS;

  private final String text;

  private final List tokenSpans;

  /**
   * Initializes the current instance.
   *
   * @param text the text which contains the tokens.
   * @param tokenSpans the spans which mark the begin and end of the tokens.
   */
  public TokenSample(String text, Span tokenSpans[]) {

    if (text == null)
      throw new IllegalArgumentException("text must not be null!");

    if (tokenSpans == null)
      throw new IllegalArgumentException("tokenSpans must not be null! ");

    this.text = text;
    this.tokenSpans = Collections.unmodifiableList(new ArrayList(Arrays.asList(tokenSpans)));

    for (Span tokenSpan : tokenSpans) {
      if (tokenSpan.getStart() < 0 || tokenSpan.getStart() > text.length() ||
          tokenSpan.getEnd() > text.length() || tokenSpan.getEnd() < 0) {
        throw new IllegalArgumentException("Span " + tokenSpan.toString() +
            " is out of bounds, text length: " + text.length() + "!");
      }
    }
  }

  public TokenSample(Detokenizer detokenizer, String tokens[]) {

    StringBuilder sentence = new StringBuilder();

    DetokenizationOperation[] operations = detokenizer.detokenize(tokens);

    List mergedTokenSpans = new ArrayList();

    for (int i = 0; i < operations.length; i++) {

      boolean isSeparateFromPreviousToken = i > 0 &&
          !isMergeToRight(operations[i - 1]) &&
          !isMergeToLeft(operations[i]);

      if (isSeparateFromPreviousToken) {
        sentence.append(' ');
      }

      int beginIndex = sentence.length();
      sentence.append(tokens[i]);
      mergedTokenSpans.add(new Span(beginIndex, sentence.length()));
    }

    text = sentence.toString();
    tokenSpans = Collections.unmodifiableList(mergedTokenSpans);
  }

  private boolean isMergeToRight(DetokenizationOperation operation) {
    return DetokenizationOperation.MERGE_TO_RIGHT.equals(operation)
        || DetokenizationOperation.MERGE_BOTH.equals(operation);
  }

  private boolean isMergeToLeft(DetokenizationOperation operation) {
    return DetokenizationOperation.MERGE_TO_LEFT.equals(operation)
        || DetokenizationOperation.MERGE_BOTH.equals(operation);
  }

  /**
   * Retrieves the text.
   */
  public String getText() {
    return text;
  }

  /**
   * Retrieves the token spans.
   */
  public Span[] getTokenSpans() {
    return tokenSpans.toArray(new Span[tokenSpans.size()]);
  }

  @Override
  public String toString() {

    StringBuilder sentence = new StringBuilder();

    int lastEndIndex = -1;
    for (Span token : tokenSpans) {

      if (lastEndIndex != -1) {

        // If there are no chars between last token
        // and this token insert the separator chars
        // otherwise insert a space

        String separator;
        if (lastEndIndex == token.getStart())
          separator = separatorChars;
        else
          separator = " ";

        sentence.append(separator);
      }

      sentence.append(token.getCoveredText(text));

      lastEndIndex = token.getEnd();
    }

    return sentence.toString();
  }

  private static void addToken(StringBuilder sample, List tokenSpans, String token, boolean isNextMerged) {

    int tokenSpanStart = sample.length();
    sample.append(token);
    int tokenSpanEnd = sample.length();

    tokenSpans.add(new Span(tokenSpanStart, tokenSpanEnd));

    if (!isNextMerged)
        sample.append(" ");
  }

  public static TokenSample parse(String sampleString, String separatorChars) {

    if (sampleString == null) {
        throw new IllegalArgumentException("sampleString must not be null!");
    }
    if (separatorChars == null) {
        throw new IllegalArgumentException("separatorChars must not be null!");
    }

    Span whitespaceTokenSpans[] = WhitespaceTokenizer.INSTANCE.tokenizePos(sampleString);

    // Pre-allocate 20% for newly created tokens
    List realTokenSpans = new ArrayList((int) (whitespaceTokenSpans.length * 1.2d));

    StringBuilder untaggedSampleString = new StringBuilder();

    for (Span whiteSpaceTokenSpan : whitespaceTokenSpans) {
      String whitespaceToken = whiteSpaceTokenSpan.getCoveredText(sampleString).toString();

      boolean wasTokenReplaced = false;

      int tokStart = 0;
      int tokEnd;
      while ((tokEnd = whitespaceToken.indexOf(separatorChars, tokStart)) > -1) {

        String token = whitespaceToken.substring(tokStart, tokEnd);

        addToken(untaggedSampleString, realTokenSpans, token, true);

        tokStart = tokEnd + separatorChars.length();
        wasTokenReplaced = true;
      }

      if (wasTokenReplaced) {
        // If the token contains the split chars at least once
        // a span for the last token must still be added
        String token = whitespaceToken.substring(tokStart);

        addToken(untaggedSampleString, realTokenSpans, token, false);
      }
      else {
        // If it does not contain the split chars at lest once
        // just copy the original token span

        addToken(untaggedSampleString, realTokenSpans, whitespaceToken, false);
      }
    }

    return new TokenSample(untaggedSampleString.toString(), realTokenSpans.toArray(
        new Span[realTokenSpans.size()]));
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    } else if (obj instanceof TokenSample) {
      TokenSample a = (TokenSample) obj;

      return getText().equals(a.getText())
          && Arrays.equals(getTokenSpans(), a.getTokenSpans());
    } else {
      return false;
    }
  }
}