All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.formats.ConllXPOSSampleStream Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.postag.POSSample;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ParagraphStream;
import opennlp.tools.util.PlainTextByLineStream;

/**
 * Parses the data from the CONLL 06 shared task into POS Samples.
 * 

* More information about the data format can be found here:
* http://www.cnts.ua.ac.be/conll2006/ *

* Note: Do not use this class, internal use only! */ public class ConllXPOSSampleStream extends FilterObjectStream { public ConllXPOSSampleStream(ObjectStream lineStream) { super(new ParagraphStream(lineStream)); } public ConllXPOSSampleStream(InputStreamFactory in, Charset charset) throws IOException { super(new ParagraphStream(new PlainTextByLineStream(in, charset))); } public POSSample read() throws IOException { // The CONLL-X data has a word per line and each line is tab separated // in the following format: // ID, FORM, LEMMA, CPOSTAG, POSTAG, ... (max 10 fields) // One paragraph contains a whole sentence and, the token // and tag will be read from the FORM and POSTAG field. String paragraph = samples.read(); POSSample sample = null; if (paragraph != null) { // paragraph get lines BufferedReader reader = new BufferedReader(new StringReader(paragraph)); List tokens = new ArrayList<>(100); List tags = new ArrayList<>(100); String line; while ((line = reader.readLine()) != null) { final int minNumberOfFields = 5; String[] parts = line.split("\t"); if (parts.length >= minNumberOfFields) { tokens.add(parts[1]); tags.add(parts[4]); } else { throw new InvalidFormatException("Every non-empty line must have at least " + minNumberOfFields + " fields: '" + line + "'!"); } } // just skip empty samples and read next sample if (tokens.size() == 0) sample = read(); sample = new POSSample(tokens.toArray(new String[tokens.size()]), tags.toArray(new String[tags.size()])); } return sample; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy