All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.chunker.ChunkSample Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.chunker;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

import opennlp.tools.util.Span;

/**
 * Class for holding chunks for a single unit of text.
 */
public class ChunkSample implements Serializable {

  private final List sentence;
  private final List tags;
  private final List preds;

  /**
   * Initializes the current instance.
   *
   * @param sentence
   *          training sentence
   * @param tags
   *          POS Tags for the sentence
   * @param preds
   *          Chunk tags in B-* I-* notation
   */
  public ChunkSample(String[] sentence, String[] tags, String[] preds) {

    validateArguments(sentence.length, tags.length, preds.length);

    this.sentence = Collections.unmodifiableList(new ArrayList<>(Arrays.asList(sentence)));
    this.tags = Collections.unmodifiableList(new ArrayList<>(Arrays.asList(tags)));
    this.preds = Collections.unmodifiableList(new ArrayList<>(Arrays.asList(preds)));
  }

  /**
   * Initializes the current instance.
   *
   * @param sentence
   *          training sentence
   * @param tags
   *          POS Tags for the sentence
   * @param preds
   *          Chunk tags in B-* I-* notation
   */
  public ChunkSample(List sentence, List tags, List preds) {

    validateArguments(sentence.size(), tags.size(), preds.size());

    this.sentence = Collections.unmodifiableList(new ArrayList<>(sentence));
    this.tags = Collections.unmodifiableList(new ArrayList<>(tags));
    this.preds = Collections.unmodifiableList(new ArrayList<>(preds));
  }

  /** Gets the training sentence */
  public String[] getSentence() {
    return sentence.toArray(new String[sentence.size()]);
  }

  /** Gets the POS Tags for the sentence */
  public String[] getTags() {
    return tags.toArray(new String[tags.size()]);
  }

  /** Gets the Chunk tags in B-* I-* notation */
  public String[] getPreds() {
    return preds.toArray(new String[preds.size()]);
  }

  /** Gets the phrases as an array of spans */
  public Span[] getPhrasesAsSpanList() {
    return phrasesAsSpanList(getSentence(), getTags(), getPreds());
  }

  /**
   * Static method to create arrays of spans of phrases
   *
   * @param aSentence
   *          training sentence
   * @param aTags
   *          POS Tags for the sentence
   * @param aPreds
   *          Chunk tags in B-* I-* notation
   *
   * @return the phrases as an array of spans
   */
  public static Span[] phrasesAsSpanList(String[] aSentence, String[] aTags,
      String[] aPreds) {

    validateArguments(aSentence.length, aTags.length, aPreds.length);

    // initialize with the list maximum size
    List phrases = new ArrayList<>(aSentence.length);
    String startTag = "";
    int startIndex = 0;
    boolean foundPhrase = false;

    for (int ci = 0, cn = aPreds.length; ci < cn; ci++) {
      String pred = aPreds[ci];
      if (pred.startsWith("B-")
          || !pred.equals("I-" + startTag) && !pred.equals("O")) { // start
        if (foundPhrase) { // handle the last
          phrases.add(new Span(startIndex, ci, startTag));
        }
        startIndex = ci;
        startTag = pred.substring(2);
        foundPhrase = true;
      } else if (pred.equals("I-" + startTag)) { // middle
        // do nothing
      } else if (foundPhrase) { // end
        phrases.add(new Span(startIndex, ci, startTag));
        foundPhrase = false;
        startTag = "";
      }
    }
    if (foundPhrase) { // leftover
      phrases.add(new Span(startIndex, aPreds.length, startTag));
    }

    return phrases.toArray(new Span[phrases.size()]);
  }

  private static void validateArguments(int sentenceSize, int tagsSize, int predsSize)
      throws IllegalArgumentException {
    if (sentenceSize != tagsSize || tagsSize != predsSize)
      throw new IllegalArgumentException(
          "All arrays must have the same length: " +
              "sentenceSize: " + sentenceSize +
              ", tagsSize: " + tagsSize +
              ", predsSize: " + predsSize + "!");
  }

  /**
   * Creates a nice to read string for the phrases formatted as following: 
* * [NP Rockwell_NNP ] [VP said_VBD ] [NP the_DT agreement_NN ] [VP calls_VBZ ] [SBAR for_IN ] * [NP it_PRP ] [VP to_TO supply_VB ] [NP 200_CD additional_JJ so-called_JJ shipsets_NNS ] * [PP for_IN ] [NP the_DT planes_NNS ] ._. * * * @return a nice to read string representation of the chunk phases */ public String nicePrint() { Span[] spans = getPhrasesAsSpanList(); StringBuilder result = new StringBuilder(" "); for (int tokenIndex = 0; tokenIndex < sentence.size(); tokenIndex++) { for (Span span : spans) { if (span.getStart() == tokenIndex) { result.append("[").append(span.getType()).append(" "); } if (span.getEnd() == tokenIndex) { result.append("]").append(' '); } } result.append(sentence.get(tokenIndex)).append("_").append(tags.get(tokenIndex)).append(' '); } if (sentence.size() > 1) result.setLength(result.length() - 1); for (Span span : spans) { if (span.getEnd() == sentence.size()) { result.append(']'); } } return result.toString(); } @Override public String toString() { StringBuilder chunkString = new StringBuilder(); for (int ci = 0; ci < preds.size(); ci++) { chunkString.append(sentence.get(ci)).append(" ").append(tags.get(ci)) .append(" ").append(preds.get(ci)).append("\n"); } return chunkString.toString(); } @Override public int hashCode() { return Objects.hash(Arrays.hashCode(getSentence()), Arrays.hashCode(getTags()), Arrays.hashCode(getPreds())); } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj instanceof ChunkSample) { ChunkSample a = (ChunkSample) obj; return Arrays.equals(getSentence(), a.getSentence()) && Arrays.equals(getTags(), a.getTags()) && Arrays.equals(getPreds(), a.getPreds()); } return false; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy