All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.tokenize.TokenSample Maven / Gradle / Ivy

There is a newer version: 2.5.3
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package opennlp.tools.tokenize;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import opennlp.tools.tokenize.Detokenizer.DetokenizationOperation;
import opennlp.tools.util.Span;

/**
 * A {@link TokenSample} is text with token spans.
 */
public class TokenSample {

  /**
   * The marker inserted between two tokens that are adjacent in the text
   * (no whitespace between them) when a sample is serialized, and searched
   * for by {@link #parse(String, String)} to split such tokens apart.
   *
   * <p>NOTE(review): this constant was {@code "<SPLIT>"} in upstream OpenNLP;
   * it had been garbled to the empty string here (the angle-bracket token was
   * most likely stripped as an HTML tag). An empty separator makes
   * {@link #toString()} lose the boundary between adjacent tokens and makes
   * {@code parse(s, "")} loop forever, so the original value is restored.
   */
  public static final String DEFAULT_SEPARATOR_CHARS = "<SPLIT>";

  // Separator written by toString() between tokens with no chars between them.
  private final String separatorChars = DEFAULT_SEPARATOR_CHARS;

  // The sample text; never null.
  private final String text;

  // Unmodifiable list of token spans over text.
  private final List<Span> tokenSpans;

  /**
   * Initializes the current instance.
   *
   * @param text the text which contains the tokens.
   * @param tokenSpans the spans which mark the begin and end of the tokens.
   *
   * @throws IllegalArgumentException if {@code text} or {@code tokenSpans}
   *     is null, or if any span lies outside the bounds of {@code text}.
   */
  public TokenSample(String text, Span[] tokenSpans) {

    if (text == null)
      throw new IllegalArgumentException("text must not be null!");

    if (tokenSpans == null)
      throw new IllegalArgumentException("tokenSpans must not be null! ");

    // Validate before publishing any state.
    for (Span tokenSpan : tokenSpans) {
      if (tokenSpan.getStart() < 0 || tokenSpan.getStart() > text.length() ||
          tokenSpan.getEnd() > text.length() || tokenSpan.getEnd() < 0) {
        throw new IllegalArgumentException("Span " + tokenSpan.toString() +
            " is out of bounds!");
      }
    }

    this.text = text;
    // Defensive copy so later mutation of the caller's array has no effect.
    this.tokenSpans =
        Collections.unmodifiableList(new ArrayList<Span>(Arrays.asList(tokenSpans)));
  }

  /**
   * Initializes the current instance by detokenizing the given tokens into
   * a text. A single space is inserted between two tokens unless the
   * {@link Detokenizer} says they should be merged.
   *
   * @param detokenizer used to decide where spaces go between tokens.
   * @param tokens the tokens to assemble into a sample.
   */
  public TokenSample(Detokenizer detokenizer, String[] tokens) {

    StringBuilder sentence = new StringBuilder();

    DetokenizationOperation[] operations = detokenizer.detokenize(tokens);

    List<Span> mergedTokenSpans = new ArrayList<Span>();

    for (int i = 0; i < operations.length; i++) {

      // A space is needed unless the previous token merges to the right
      // or this token merges to the left.
      boolean isSeparateFromPreviousToken = i > 0 &&
          !DetokenizationOperation.MERGE_TO_RIGHT.equals(operations[i - 1]) &&
          !DetokenizationOperation.MERGE_TO_LEFT.equals(operations[i]);

      if (isSeparateFromPreviousToken) {
        sentence.append(' ');
      }

      int beginIndex = sentence.length();
      sentence.append(tokens[i]);
      mergedTokenSpans.add(new Span(beginIndex, sentence.length()));
    }

    text = sentence.toString();
    tokenSpans = Collections.unmodifiableList(mergedTokenSpans);
  }

  /**
   * Retrieves the text.
   */
  public String getText() {
    return text;
  }

  /**
   * Retrieves the token spans.
   *
   * @return a fresh array copy of the token spans; callers may modify it freely.
   */
  public Span[] getTokenSpans() {
    return tokenSpans.toArray(new Span[tokenSpans.size()]);
  }

  /**
   * Serializes this sample: tokens separated by a space where whitespace
   * existed in the text, and by {@link #DEFAULT_SEPARATOR_CHARS} where two
   * tokens were adjacent. {@link #parse(String, String)} inverts this format.
   */
  @Override
  public String toString() {

    StringBuilder sentence = new StringBuilder();

    int lastEndIndex = -1;
    for (Span token : tokenSpans) {

      if (lastEndIndex != -1) {

        // If there are no chars between last token
        // and this token insert the separator chars
        // otherwise insert a space
        String separator;
        if (lastEndIndex == token.getStart())
          separator = separatorChars;
        else
          separator = " ";

        sentence.append(separator);
      }

      sentence.append(token.getCoveredText(text));

      lastEndIndex = token.getEnd();
    }

    return sentence.toString();
  }

  // Appends token to sample, records its span, and appends a trailing space
  // unless the next token should be merged directly onto this one.
  private static void addToken(StringBuilder sample, List<Span> tokenSpans,
      String token, boolean isNextMerged) {

    int tokenSpanStart = sample.length();
    sample.append(token);
    int tokenSpanEnd = sample.length();

    tokenSpans.add(new Span(tokenSpanStart, tokenSpanEnd));

    if (!isNextMerged)
        sample.append(" ");
  }

  /**
   * Parses a sample string in the format produced by {@link #toString()}:
   * whitespace separates tokens, and {@code separatorChars} marks a split
   * between two tokens that are adjacent in the reconstructed text.
   *
   * @param sampleString the serialized sample.
   * @param separatorChars the marker for adjacent-token splits; must be
   *     non-empty (an empty marker would match at every position).
   *
   * @return the parsed {@link TokenSample}.
   *
   * @throws IllegalArgumentException if an argument is null or
   *     {@code separatorChars} is empty.
   */
  public static TokenSample parse(String sampleString, String separatorChars) {

    if (sampleString == null || separatorChars == null)
        throw new IllegalArgumentException("arguments must not be null!");

    // indexOf("") always matches without advancing; fail fast instead of
    // looping forever below.
    if (separatorChars.length() == 0)
        throw new IllegalArgumentException("separatorChars must not be empty!");

    Span[] whitespaceTokenSpans = WhitespaceTokenizer.INSTANCE.tokenizePos(sampleString);

    // Pre-allocate 20% for newly created tokens
    List<Span> realTokenSpans =
        new ArrayList<Span>((int) (whitespaceTokenSpans.length * 1.2d));

    StringBuilder untaggedSampleString = new StringBuilder();

    for (Span whiteSpaceTokenSpan : whitespaceTokenSpans) {
      String whitespaceToken = whiteSpaceTokenSpan.getCoveredText(sampleString).toString();

      boolean wasTokenReplaced = false;

      int tokStart = 0;
      int tokEnd;
      while ((tokEnd = whitespaceToken.indexOf(separatorChars, tokStart)) > -1) {

        String token = whitespaceToken.substring(tokStart, tokEnd);

        addToken(untaggedSampleString, realTokenSpans, token, true);

        tokStart = tokEnd + separatorChars.length();
        wasTokenReplaced = true;
      }

      if (wasTokenReplaced) {
        // If the token contains the split chars at least once
        // a span for the last token must still be added
        String token = whitespaceToken.substring(tokStart);

        addToken(untaggedSampleString, realTokenSpans, token, false);
      }
      else {
        // If it does not contain the split chars at least once
        // just copy the original token span
        addToken(untaggedSampleString, realTokenSpans, whitespaceToken, false);
      }
    }

    return new TokenSample(untaggedSampleString.toString(),
        realTokenSpans.toArray(new Span[realTokenSpans.size()]));
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy