opennlp.tools.formats.LeipzigDoccatSampleStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.PlainTextByLineStream;

/**
 * Stream filter to produce document samples out of a Leipzig sentences.txt file.
 * In the Leipzig corpus the encoding of the various sentences.txt files is defined by
 * the language. The language must be specified to produce the category tags and is used
 * to determine the correct input encoding.
 * 
 * The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
 * by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
 * exactly the same tokenization during testing and training.
 */
public class LeipzigDoccatSampleStream extends
    FilterObjectStream<String, DocumentSample> {

  /**
   * Lookup table from the language code to the name of the character encoding
   * used to decode the input for that language. Filled once when the class is
   * initialized and only read afterwards.
   */
  private static final Map<String, String> ENCODING_MAP = new HashMap<String, String>();

  static {
    ENCODING_MAP.put("cat", "ISO-8859-1");
    ENCODING_MAP.put("de", "ISO-8859-1");
    ENCODING_MAP.put("dk", "ISO-8859-1");
    ENCODING_MAP.put("ee", "ISO-8859-4");
    ENCODING_MAP.put("en", "ISO-8859-1");
    ENCODING_MAP.put("fi", "ISO-8859-1");
    ENCODING_MAP.put("fr", "ISO-8859-1");
    ENCODING_MAP.put("it", "ISO-8859-1");
    ENCODING_MAP.put("jp", "UTF-8");
    ENCODING_MAP.put("kr", "UTF-8");
    ENCODING_MAP.put("nl", "ISO-8859-1");
    ENCODING_MAP.put("no", "ISO-8859-1");
    ENCODING_MAP.put("se", "ISO-8859-1");
    ENCODING_MAP.put("sorb", "ISO-8859-2");
    ENCODING_MAP.put("tr", "ISO-8859-9");
  }

  private final String language;
  private final int sentencesPerDocument;

  /**
   * Creates a new LeipzigDoccatSampleStream with the specified parameters.
   *
   * @param language the language code of the input (for example "en" or "de"); it selects
   *     the input encoding and is used as the category of every produced {@link DocumentSample}
   * @param sentencesPerDocument the maximum number of sentences which are grouped into one
   *     {@link DocumentSample}; with a value of zero or less {@link #read()} never
   *     produces a sample
   * @param in the InputStream pointing to the contents of the sentences.txt input file
   *
   * @throws NullPointerException if language is null
   * @throws IOException if no encoding is known for the language
   */
  LeipzigDoccatSampleStream(String language, int sentencesPerDocument,
      InputStream in) throws IOException {
    super(new PlainTextByLineStream(in, mapLanguageToEncoding(language)));
    this.language = language;
    this.sentencesPerDocument = sentencesPerDocument;
  }

  /**
   * Maps the language to the file encoding, if the encoding
   * cannot be specified an IOException is thrown.
   *
   * @param language the language code to look up, must not be null
   *
   * @return the name of the encoding registered for the language, never null
   *
   * @throws NullPointerException if language is null
   * @throws IOException if no encoding is registered for the language
   */
  private static String mapLanguageToEncoding(String language) throws IOException {

    if (language == null) {
      throw new NullPointerException("language parameter must not be null!");
    }

    String encoding = ENCODING_MAP.get(language);

    if (encoding == null) {
      throw new IOException("Encoding for language " + language + " is not specified!");
    }

    return encoding;
  }

  /**
   * Reads up to sentencesPerDocument lines from the underlying line stream and
   * joins their tokens, separated by single spaces, into the text of one sample
   * labeled with the language.
   *
   * @return the next sample, or null if no text could be collected, which is the
   *     case when the underlying stream has no more lines
   *
   * @throws IOException if a line contains no tokens, or if reading the
   *     underlying stream fails
   */
  public DocumentSample read() throws IOException {

    int count = 0;

    StringBuilder sampleText = new StringBuilder();

    String line;
    while (count < sentencesPerDocument && (line = samples.read()) != null) {

      String[] tokens = SimpleTokenizer.INSTANCE.tokenize(line);

      if (tokens.length == 0) {
        throw new IOException("Empty lines are not allowed!");
      }

      // Always skip first token, that is the sentence number!
      for (int i = 1; i < tokens.length; i++) {
        sampleText.append(tokens[i]);
        sampleText.append(' ');
      }

      count++;
    }

    if (sampleText.length() > 0) {
      return new DocumentSample(language, sampleText.toString());
    }

    // Nothing was collected, signal the end of the stream to the caller.
    return null;
  }
}