All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.formats.ad.ADPOSSampleStream Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.ad;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Leaf;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Node;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.TreeElement;
import opennlp.tools.postag.POSSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;

/**
 * Note: Do not use this class, internal use only!
 */
public class ADPOSSampleStream implements ObjectStream {

  private final ObjectStream adSentenceStream;
  private boolean expandME;
  private boolean isIncludeFeatures;

  /**
   * Creates a new {@link POSSample} stream from a line stream, i.e.
   * {@link ObjectStream}<{@link String}>, that could be a
   * {@link PlainTextByLineStream} object.
   *
   * @param lineStream
   *          a stream of lines as {@link String}
   * @param expandME
   *          if true will expand the multiword expressions, each word of the
   *          expression will have the POS Tag that was attributed to the
   *          expression plus the prefix B- or I- (CONLL convention)
   * @param includeFeatures
   *          if true will combine the POS Tag with the feature tags
   */
  public ADPOSSampleStream(ObjectStream lineStream, boolean expandME,
      boolean includeFeatures) {
    this.adSentenceStream = new ADSentenceStream(lineStream);
    this.expandME = expandME;
    this.isIncludeFeatures = includeFeatures;
  }

  /**
   * Creates a new {@link POSSample} stream from a {@link InputStream}
   *
   * @param in
   *          the Corpus {@link InputStream}
   * @param charsetName
   *          the charset of the Arvores Deitadas Corpus
   * @param expandME
   *          if true will expand the multiword expressions, each word of the
   *          expression will have the POS Tag that was attributed to the
   *          expression plus the prefix B- or I- (CONLL convention)
   * @param includeFeatures
   *          if true will combine the POS Tag with the feature tags
   */
  public ADPOSSampleStream(InputStreamFactory in, String charsetName,
      boolean expandME, boolean includeFeatures) throws IOException {

    try {
      this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(in, charsetName));
      this.expandME = expandME;
      this.isIncludeFeatures = includeFeatures;
    } catch (UnsupportedEncodingException e) {
      // UTF-8 is available on all JVMs, will never happen
      throw new IllegalStateException(e);
    }
  }

  public POSSample read() throws IOException {
    Sentence paragraph;
    while ((paragraph = this.adSentenceStream.read()) != null) {
      Node root = paragraph.getRoot();
      List sentence = new ArrayList<>();
      List tags = new ArrayList<>();
      process(root, sentence, tags);

      return new POSSample(sentence, tags);
    }
    return null;
  }

  private void process(Node node, List sentence, List tags) {
    if (node != null) {
      for (TreeElement element : node.getElements()) {
        if (element.isLeaf()) {
          processLeaf((Leaf) element, sentence, tags);
        } else {
          process((Node) element, sentence, tags);
        }
      }
    }
  }

  private void processLeaf(Leaf leaf, List sentence, List tags) {
    if (leaf != null) {
      String lexeme = leaf.getLexeme();
      String tag = leaf.getFunctionalTag();

      if (tag == null) {
        tag = leaf.getLexeme();
      }

      if (isIncludeFeatures && leaf.getMorphologicalTag() != null) {
        tag += " " + leaf.getMorphologicalTag();
      }
      tag = tag.replaceAll("\\s+", "=");

      if (tag == null)
        tag = lexeme;

      if (expandME && lexeme.contains("_")) {
        StringTokenizer tokenizer = new StringTokenizer(lexeme, "_");

        if (tokenizer.countTokens() > 0) {
          List toks = new ArrayList<>(tokenizer.countTokens());
          List tagsWithCont = new ArrayList<>(
              tokenizer.countTokens());
          toks.add(tokenizer.nextToken());
          tagsWithCont.add("B-" + tag);
          while (tokenizer.hasMoreTokens()) {
            toks.add(tokenizer.nextToken());
            tagsWithCont.add("I-" + tag);
          }

          sentence.addAll(toks);
          tags.addAll(tagsWithCont);
        } else {
          sentence.add(lexeme);
          tags.add(tag);
        }

      } else {
        sentence.add(lexeme);
        tags.add(tag);
      }
    }

  }

  public void reset() throws IOException, UnsupportedOperationException {
    adSentenceStream.reset();
  }

  public void close() throws IOException {
    adSentenceStream.close();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy