All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.formats.ad.ADChunkSampleStream Maven / Gradle / Ivy

There is a newer version: 2.5.3
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.ad;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.chunker.ChunkSample;
import opennlp.tools.formats.ad.ADParagraphStream.Paragraph;
import opennlp.tools.formats.ad.ADParagraphStream.ParagraphParser.Leaf;
import opennlp.tools.formats.ad.ADParagraphStream.ParagraphParser.Node;
import opennlp.tools.formats.ad.ADParagraphStream.ParagraphParser.TreeElement;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;

/**
 * Parser for Floresta Sita(c)tica Arvores Deitadas corpus, output to for the
 * Portuguese Chunker training.
 * 

* The heuristic to extract chunks where based o paper 'A Machine Learning * Approach to Portuguese Clause Identification', (Eraldo Fernandes, Cicero * Santos and Ruy Milidiú).
*

* Data can be found on this web site:
* http://www.linguateca.pt/floresta/corpus.html *

* Information about the format:
* Susana Afonso. * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica" * .
* 12 de Fevereiro de 2006. * http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf *

* Detailed info about the NER tagset: * http://beta.visl.sdu.dk/visl/pt/info/portsymbol.html#semtags_names *

* Note: Do not use this class, internal use only! */ public class ADChunkSampleStream implements ObjectStream { private final ObjectStream adSentenceStream; private int start = -1; private int end = -1; private int index = 0; /** * Creates a new {@link NameSample} stream from a line stream, i.e. * {@link ObjectStream}< {@link String}>, that could be a * {@link PlainTextByLineStream} object. * * @param lineStream * a stream of lines as {@link String} */ public ADChunkSampleStream(ObjectStream lineStream) { this.adSentenceStream = new ADParagraphStream(lineStream); } /** * Creates a new {@link NameSample} stream from a {@link InputStream} * * @param in * the Corpus {@link InputStream} * @param charsetName * the charset of the Arvores Deitadas Corpus */ public ADChunkSampleStream(InputStream in, String charsetName) { try { this.adSentenceStream = new ADParagraphStream(new PlainTextByLineStream( in, charsetName)); } catch (UnsupportedEncodingException e) { // UTF-8 is available on all JVMs, will never happen throw new IllegalStateException(e); } } public ChunkSample read() throws IOException { Paragraph paragraph; while ((paragraph = this.adSentenceStream.read()) != null) { if (end > -1 && index >= end) { // leave return null; } if (start > -1 && index < start) { index++; // skip this one } else { Node root = paragraph.getRoot(); List sentence = new ArrayList(); List tags = new ArrayList(); List target = new ArrayList(); processRoot(root, sentence, tags, target); if (sentence.size() > 0) { index++; return new ChunkSample(sentence, tags, target); } } } return null; } private void processRoot(Node root, List sentence, List tags, List target) { if (root != null) { TreeElement[] elements = root.getElements(); for (int i = 0; i < elements.length; i++) { if (elements[i].isLeaf()) { processLeaf((Leaf) elements[i], false, "O", sentence, tags, target); } else { processNode((Node) elements[i], sentence, tags, target); } } } } private void processNode(Node node, List sentence, List tags, List target) { String phraseTag = getChunkTag(node.getSyntacticTag()); TreeElement[] elements = node.getElements(); for (int i = 0; i < elements.length; i++) { if (elements[i].isLeaf()) { boolean isIntermediate = false; if ( i > 0 && elements[i - 1].isLeaf() && phraseTag != null && !phraseTag.equals("O")) { isIntermediate = true; } processLeaf((Leaf) elements[i], isIntermediate, phraseTag, sentence, tags, target); } else { processNode((Node) elements[i], sentence, tags, target); } } } private void processLeaf(Leaf leaf, boolean isIntermediate, String phraseTag, List sentence, List tags, List target) { String chunkTag; if (leaf.getSyntacticTag() != null && phraseTag.equals("O")) { if(leaf.getSyntacticTag().endsWith("v-fin")) { phraseTag = "VP"; } else if(leaf.getSyntacticTag().endsWith(":n")) { phraseTag = "NP"; } } if (!phraseTag.equals("O")) { if (isIntermediate) { chunkTag = "I-" + phraseTag; } else { chunkTag = "B-" + phraseTag; } } else { chunkTag = phraseTag; } sentence.add(leaf.getLexeme()); if (leaf.getSyntacticTag() == null) { tags.add(leaf.getLexeme()); } else { tags.add(getMorphologicalTag(leaf.getSyntacticTag())); } target.add(chunkTag); } private String getMorphologicalTag(String tag) { return tag.substring(tag.lastIndexOf(":") + 1); } private String getChunkTag(String tag) { String phraseTag = tag.substring(tag.lastIndexOf(":") + 1); if (phraseTag.equals("np") || phraseTag.equals("ap") || phraseTag.equals("advp") || phraseTag.equals("vp") || phraseTag.equals("pp")) { phraseTag = phraseTag.toUpperCase(); } else { phraseTag = "O"; } return phraseTag; } public void setStart(int aStart) { this.start = aStart; } public void setEnd(int aEnd) { this.end = aEnd; } public void reset() throws IOException, UnsupportedOperationException { adSentenceStream.reset(); } public void close() throws IOException { adSentenceStream.close(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy