opennlp.tools.formats.ad.ADSentenceSampleStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.ad;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.lang.Factory;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;

/**
 * Note: Do not use this class, internal use only!
 */
public class ADSentenceSampleStream implements ObjectStream {

  private final ObjectStream adSentenceStream;

  private int text = -1;
  private int para = -1;
  private boolean isSameText;
  private boolean isSamePara;
  private Sentence sent;
  private boolean isIncludeTitles = true;
  private boolean isTitle;

  private final char[] ptEosCharacters;

  /**
   * Creates a new {@link SentenceSample} stream from a line stream, i.e.
   * {@link ObjectStream}<{@link String}>, that could be a
   * {@link PlainTextByLineStream} object.
   *
   * @param lineStream
   *          a stream of lines as {@link String}
   * @param includeHeadlines
   *          if true will output the sentences marked as news headlines
   */
  public ADSentenceSampleStream(ObjectStream lineStream, boolean includeHeadlines) {
    this.adSentenceStream = new ADSentenceStream(lineStream);
    ptEosCharacters = Factory.ptEosCharacters;
    Arrays.sort(ptEosCharacters);
    this.isIncludeTitles = includeHeadlines;
  }

  /**
   * Creates a new {@link SentenceSample} stream from a {@link FileInputStream}
   *
   * @param in
   *          input stream from the corpus
   * @param charsetName
   *          the charset to use while reading the corpus
   * @param includeHeadlines
   *          if true will output the sentences marked as news headlines
   */
  public ADSentenceSampleStream(InputStreamFactory in, String charsetName,
      boolean includeHeadlines) throws IOException {
    try {
      this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
          in, charsetName));
    } catch (UnsupportedEncodingException e) {
      // UTF-8 is available on all JVMs, will never happen
      throw new IllegalStateException(e);
    }
    ptEosCharacters = Factory.ptEosCharacters;
    Arrays.sort(ptEosCharacters);
    this.isIncludeTitles = includeHeadlines;
  }

  // The Arvores Deitadas Corpus has information about texts and paragraphs.
  public SentenceSample read() throws IOException {

    if (sent == null) {
      sent = this.adSentenceStream.read();
      updateMeta();
      if (sent == null) {
        return null;
      }
    }

    StringBuilder document = new StringBuilder();
    List sentences = new ArrayList<>();
    do {
      do {
        if (!isTitle || (isTitle && isIncludeTitles)) {
          if (hasPunctuation(sent.getText())) {
            int start = document.length();
            document.append(sent.getText());
            sentences.add(new Span(start, document.length()));
            document.append(" ");
          }

        }
        sent = this.adSentenceStream.read();
        updateMeta();
      } while (isSamePara);
      // break; // got one paragraph!
    } while (isSameText);

    String doc;
    if (document.length() > 0) {
      doc = document.substring(0, document.length() - 1);
    } else {
      doc = document.toString();
    }

    return new SentenceSample(doc,
        sentences.toArray(new Span[sentences.size()]));
  }

  private boolean hasPunctuation(String text) {
    text = text.trim();
    if (text.length() > 0) {
      char lastChar = text.charAt(text.length() - 1);
      if (Arrays.binarySearch(ptEosCharacters, lastChar) >= 0) {
        return true;
      }
    }
    return false;
  }

  // there are some different types of metadata depending on the corpus.
  // todo: merge this patterns
  private Pattern meta1 = Pattern
      .compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*");

  private void updateMeta() {
    if (this.sent != null) {
      String meta = this.sent.getMetadata();
      Matcher m = meta1.matcher(meta);
      int currentText;
      int currentPara;
      if (m.matches()) {
        currentText = Integer.parseInt(m.group(1));
        currentPara = Integer.parseInt(m.group(2));
      } else {
        throw new RuntimeException("Invalid metadata: " + meta);
      }
      isSamePara = isSameText = false;
      if (currentText == text)
        isSameText = true;

      if (isSameText && currentPara == para)
        isSamePara = true;

      isTitle = meta.contains("title");

      text = currentText;
      para = currentPara;

    } else {
      this.isSamePara = this.isSameText = false;
    }
  }

  public void reset() throws IOException, UnsupportedOperationException {
    adSentenceStream.reset();
  }

  public void close() throws IOException {
    adSentenceStream.close();
  }
}