opennlp.tools.formats.ad.ADChunkSampleStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.3
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.ad;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.chunker.ChunkSample;
import opennlp.tools.formats.ad.ADParagraphStream.Paragraph;
import opennlp.tools.formats.ad.ADParagraphStream.ParagraphParser.Leaf;
import opennlp.tools.formats.ad.ADParagraphStream.ParagraphParser.Node;
import opennlp.tools.formats.ad.ADParagraphStream.ParagraphParser.TreeElement;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;

/**
 * Parser for the Floresta Sintá(c)tica Árvores Deitadas corpus; its output is
 * used for training the Portuguese Chunker.
 * 
 * The heuristics used to extract chunks were based on the paper 'A Machine
 * Learning Approach to Portuguese Clause Identification' (Eraldo Fernandes,
 * Cicero Santos and Ruy Milidiú).

 * 

 * <p>Data can be found on this web site:<br>
 * http://www.linguateca.pt/floresta/corpus.html
 * <p>
 * Information about the format:<br>
 * Susana Afonso.
 * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica"
 * .<br>
 * 12 de Fevereiro de 2006.
 * http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf
 * <p>
 * Detailed info about the NER tagset:
 * http://beta.visl.sdu.dk/visl/pt/info/portsymbol.html#semtags_names
 * <p>
 * Note: Do not use this class, internal use only!
 */
public class ADChunkSampleStream implements ObjectStream<ChunkSample> {

	/** Chunk tag given to tokens that do not belong to any chunk. */
	private static final String OUTSIDE = "O";

	/** Source of parsed corpus paragraphs; each one is turned into at most one sample. */
	private final ObjectStream<Paragraph> adSentenceStream;

	/** Inclusive lower bound of the sample window; a negative value disables it. */
	private int start = -1;

	/** Exclusive upper bound of the sample window; a negative value disables it. */
	private int end = -1;

	/** Position counter compared against {@link #start} and {@link #end}. */
	private int index = 0;

	/**
	 * Creates a new {@link ChunkSample} stream from a line stream, i.e.
	 * {@link ObjectStream}&lt;{@link String}&gt;, that could be a
	 * {@link PlainTextByLineStream} object.
	 * 
	 * @param lineStream
	 *          a stream of lines as {@link String}
	 */
	public ADChunkSampleStream(ObjectStream<String> lineStream) {
		this.adSentenceStream = new ADParagraphStream(lineStream);
	}

	/**
	 * Creates a new {@link ChunkSample} stream from a {@link InputStream}
	 * 
	 * @param in
	 *          the Corpus {@link InputStream}
	 * @param charsetName
	 *          the charset of the Arvores Deitadas Corpus
	 * @throws IllegalStateException
	 *           if <code>charsetName</code> is not supported by the JVM
	 */
	public ADChunkSampleStream(InputStream in, String charsetName) {

		try {
			this.adSentenceStream = new ADParagraphStream(new PlainTextByLineStream(
					in, charsetName));
		} catch (UnsupportedEncodingException e) {
			// The charset is supplied by the caller, so this can happen; the
			// exception type is kept for backward compatibility.
			throw new IllegalStateException("Unsupported charset: " + charsetName, e);
		}
	}

	/**
	 * Reads paragraphs from the underlying stream until one yields a non-empty
	 * sentence inside the configured window and returns it as a sample.
	 * 
	 * While skipping towards <code>start</code> every paragraph advances the
	 * position counter; afterwards only paragraphs that produced a sample do.
	 * 
	 * @return the next sample, or <code>null</code> when the underlying stream
	 *         is exhausted or the <code>end</code> position has been reached
	 * @throws IOException
	 *           if reading from the underlying stream fails
	 */
	public ChunkSample read() throws IOException {

		Paragraph paragraph;
		while ((paragraph = this.adSentenceStream.read()) != null) {

			if (end > -1 && index >= end) {
				// window exhausted
				return null;
			}

			if (start > -1 && index < start) {
				// still before the window: skip this paragraph
				index++;
				continue;
			}

			List<String> sentence = new ArrayList<String>();
			List<String> tags = new ArrayList<String>();
			List<String> target = new ArrayList<String>();

			processRoot(paragraph.getRoot(), sentence, tags, target);

			// paragraphs without tokens produce no sample and are not counted
			if (!sentence.isEmpty()) {
				index++;
				return new ChunkSample(sentence, tags, target);
			}
		}
		return null;
	}

	/**
	 * Walks the direct children of the root node. Leaves attached directly to
	 * the root start outside of any phrase; inner nodes are handled by
	 * {@link #processNode}. A <code>null</code> root contributes nothing.
	 */
	private void processRoot(Node root, List<String> sentence, List<String> tags,
			List<String> target) {
		if (root == null) {
			return;
		}
		for (TreeElement element : root.getElements()) {
			if (element.isLeaf()) {
				processLeaf((Leaf) element, false, OUTSIDE, sentence, tags, target);
			} else {
				processNode((Node) element, sentence, tags, target);
			}
		}
	}

	/**
	 * Recursively walks a node. Its leaves receive the chunk tag derived from
	 * the node's syntactic tag; a leaf continues a chunk ("I-") only when the
	 * element directly before it in the same node is also a leaf.
	 */
	private void processNode(Node node, List<String> sentence, List<String> tags,
			List<String> target) {
		// getChunkTag never returns null
		String phraseTag = getChunkTag(node.getSyntacticTag());
		boolean insidePhrase = !OUTSIDE.equals(phraseTag);

		TreeElement[] elements = node.getElements();
		for (int i = 0; i < elements.length; i++) {
			if (elements[i].isLeaf()) {
				boolean isIntermediate = insidePhrase && i > 0
						&& elements[i - 1].isLeaf();
				processLeaf((Leaf) elements[i], isIntermediate, phraseTag, sentence,
						tags, target);
			} else {
				processNode((Node) elements[i], sentence, tags, target);
			}
		}
	}

	/**
	 * Appends one token to the three parallel lists: its lexeme, its tag and
	 * its chunk tag.
	 * 
	 * @param leaf
	 *          the token
	 * @param isIntermediate
	 *          <code>true</code> to emit an "I-" tag instead of "B-"
	 * @param phraseTag
	 *          phrase tag inherited from the parent, never <code>null</code>
	 */
	private void processLeaf(Leaf leaf, boolean isIntermediate, String phraseTag,
			List<String> sentence, List<String> tags, List<String> target) {
		String syntacticTag = leaf.getSyntacticTag();

		// A leaf outside of any phrase can still open a chunk of its own:
		// finite verbs become VP and nouns become NP.
		if (syntacticTag != null && OUTSIDE.equals(phraseTag)) {
			if (syntacticTag.endsWith("v-fin")) {
				phraseTag = "VP";
			} else if (syntacticTag.endsWith(":n")) {
				phraseTag = "NP";
			}
		}

		String chunkTag;
		if (OUTSIDE.equals(phraseTag)) {
			chunkTag = phraseTag;
		} else if (isIntermediate) {
			chunkTag = "I-" + phraseTag;
		} else {
			chunkTag = "B-" + phraseTag;
		}

		sentence.add(leaf.getLexeme());
		if (syntacticTag == null) {
			// no tag available: the lexeme itself is used as the tag
			tags.add(leaf.getLexeme());
		} else {
			tags.add(getMorphologicalTag(syntacticTag));
		}
		target.add(chunkTag);
	}

	/**
	 * Returns the part of the tag after the last ':' (the whole tag if it
	 * contains none).
	 */
	private String getMorphologicalTag(String tag) {
		return tag.substring(tag.lastIndexOf(":") + 1);
	}

	/**
	 * Maps a node's syntactic tag to a chunk phrase tag. The part after the
	 * last ':' is upper-cased when it is one of np, ap, advp, vp or pp;
	 * anything else, including a <code>null</code> tag, maps to "O".
	 * 
	 * @return the phrase tag, never <code>null</code>
	 */
	private String getChunkTag(String tag) {
		if (tag == null) {
			// defensive: a node without a tag is treated as outside of any phrase
			return OUTSIDE;
		}

		String phraseTag = tag.substring(tag.lastIndexOf(":") + 1);

		if (phraseTag.equals("np") || phraseTag.equals("ap")
				|| phraseTag.equals("advp") || phraseTag.equals("vp")
				|| phraseTag.equals("pp")) {
			return phraseTag.toUpperCase();
		}
		return OUTSIDE;
	}

	/**
	 * Sets the position before which paragraphs are skipped.
	 * 
	 * @param aStart
	 *          the first position to return; a negative value disables skipping
	 */
	public void setStart(int aStart) {
		this.start = aStart;
	}

	/**
	 * Sets the position at which reading stops.
	 * 
	 * @param aEnd
	 *          the position at which {@link #read()} starts returning
	 *          <code>null</code>; a negative value disables the limit
	 */
	public void setEnd(int aEnd) {
		this.end = aEnd;
	}

	/**
	 * Resets the underlying stream and the position counter, so that the same
	 * start/end window is applied again on the next pass.
	 */
	public void reset() throws IOException, UnsupportedOperationException {
		adSentenceStream.reset();
		// without this the start/end window would be misapplied after a reset
		index = 0;
	}

	/**
	 * Closes the underlying stream.
	 */
	public void close() throws IOException {
		adSentenceStream.close();
	}

}