All downloads are free. The search and download features use the official Maven repository.

opennlp.tools.formats.ad.ADParagraphStream Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.ad;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.formats.ad.ADParagraphStream.ParagraphParser.Node;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;

/**
 * Stream filter which merges text lines into paragraphs, following the Árvores
 * Deitadas syntax.
 * <p>
 * Information about the format:<br>
 * Susana Afonso.
 * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica".<br>
 * 12 de Fevereiro de 2006.
 * http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf
 * <p>
* Note: Do not use this class, internal use only! */ public class ADParagraphStream extends FilterObjectStream { public static class Paragraph { private String text; private Node root; public String getText() { return text; } public void setText(String text) { this.text = text; } public Node getRoot() { return root; } public void setRoot(Node root) { this.root = root; } } /** * Parses a sample of AD corpus. A sentence in AD corpus is represented by a * Tree. In this class we declare some types to represent that tree. */ public static class ParagraphParser { private Pattern rootPattern = Pattern.compile("^[^:=]+:[^(\\s]+$"); private Pattern nodePattern = Pattern .compile("^([=-]*)([^:=]+:[^\\(\\s]+)(\\(([^\\)]+)\\))?\\s*$"); private Pattern leafPattern = Pattern .compile("^([=-]*)([^:=]+:[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)"); private Pattern bizarreLeafPattern = Pattern .compile("^([=-]*)([^:=]+=[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)"); private Pattern punctuationPattern = Pattern.compile("^(=*)(\\W+)$"); /** * Parse the paragraph */ public Paragraph parse(String paragraphString) { BufferedReader reader = new BufferedReader(new StringReader( paragraphString)); Paragraph sentence = new Paragraph(); Node root = new Node(); try { // first line is String line = reader.readLine(); if (line.startsWith(" nodeStack = new Stack(); // we get the complete line root.setSyntacticTag("ROOT"); root.setLevel(0); nodeStack.add(root); // now we have to take care of the lastLevel. Every time it raises, we // will add the // leaf to the node at the top. If it decreases, we remove the top. 
//line = reader.readLine(); while (line.length() != 0 && line.startsWith("") == false) { TreeElement element = this.getElement(line); if(element != null) { // remove elements at same level or higher while (!nodeStack.isEmpty() && element.getLevel() > 0 && element.getLevel() <= nodeStack.peek().getLevel()) { nodeStack.pop(); } if( element.isLeaf() ) { if (nodeStack.isEmpty()) { root.addElement(element); } else { // look for the node with the correct level Node peek = nodeStack.peek(); if (element.level == 0) { // add to the root nodeStack.firstElement().addElement(element); } else { Node parent = null; int index = nodeStack.size() - 1; while(parent == null) { if(peek.getLevel() < element.getLevel()) { parent = peek; } else { index--; if(index > -1) { peek = nodeStack.get(index); } else { parent = nodeStack.firstElement(); } } } parent.addElement(element); } } } else { if (!nodeStack.isEmpty()) { nodeStack.peek().addElement(element); } nodeStack.push((Node) element); } } line = reader.readLine(); } } catch (Exception e) { System.err.println(paragraphString); e.printStackTrace(); return sentence; } // second line should be SOURCE sentence.setRoot(root); return sentence; } /** * Parse a tree element from a AD line * * @param line * the AD line * @return the tree element */ public TreeElement getElement(String line) { // try node Matcher nodeMatcher = nodePattern.matcher(line); if (nodeMatcher.matches()) { int level = nodeMatcher.group(1).length(); String syntacticTag = nodeMatcher.group(2); String morphologicalTag = nodeMatcher.group(3); Node node = new Node(); node.setLevel(level); node.setSyntacticTag(syntacticTag); node.setMorphologicalTag(morphologicalTag); return node; } Matcher leafMatcher = leafPattern.matcher(line); if (leafMatcher.matches()) { int level = leafMatcher.group(1).length(); String syntacticTag = leafMatcher.group(2); String lemma = leafMatcher.group(3); String morphologicalTag = leafMatcher.group(4); String lexeme = leafMatcher.group(5); Leaf leaf 
= new Leaf(); leaf.setLevel(level); leaf.setSyntacticTag(syntacticTag); leaf.setMorphologicalTag(morphologicalTag); leaf.setLexeme(lexeme); if (lemma != null) { if (lemma.length() > 2) { lemma = lemma.substring(1, lemma.length() - 1); } leaf.setLemma(lemma); } return leaf; } Matcher punctuationMatcher = punctuationPattern.matcher(line); if (punctuationMatcher.matches()) { int level = punctuationMatcher.group(1).length(); String lexeme = punctuationMatcher.group(2); Leaf leaf = new Leaf(); leaf.setLevel(level); leaf.setLexeme(lexeme); return leaf; } // process the bizarre cases if(line.equals("_") || line.startsWith(" elems = new ArrayList(); public void addElement(TreeElement element) { elems.add(element); }; public TreeElement[] getElements() { return elems.toArray(new TreeElement[elems.size()]); } @Override public String toString() { StringBuffer sb = new StringBuffer(); // print itself and its children for (int i = 0; i < this.getLevel(); i++) { sb.append("="); } sb.append(this.getSyntacticTag()); if (this.getMorphologicalTag() != null) { sb.append(this.getMorphologicalTag()); } sb.append("\n"); for (TreeElement element : elems) { sb.append(element.toString()); } return sb.toString(); } } /** Represents the AD leaf */ public class Leaf extends TreeElement { private String word; private String lemma; public boolean isLeaf() {return true;} public void setLexeme(String lexeme) { this.word = lexeme; } public String getLexeme() { return word; } @Override public String toString() { StringBuffer sb = new StringBuffer(); // print itself and its children for (int i = 0; i < this.getLevel(); i++) { sb.append("="); } if (this.getSyntacticTag() != null) { sb.append(this.getSyntacticTag() + "(" + this.getMorphologicalTag() + ") "); } sb.append(this.word + "\n"); return sb.toString(); } public void setLemma(String lemma) { this.lemma = lemma; } public String getLemma() { return lemma; } } } /** * The start paragraph pattern */ private static final Pattern start = 
Pattern.compile("]*>"); /** * The end paragraph pattern */ private static final Pattern end = Pattern.compile(""); private ParagraphParser parser; public ADParagraphStream(ObjectStream lineStream) { super(lineStream); parser = new ParagraphParser(); } public Paragraph read() throws IOException { StringBuilder paragraph = new StringBuilder(); boolean paragraphStarted = false; while (true) { String line = samples.read(); if (line != null) { if (start.matcher(line).matches()) { paragraphStarted = true; } if (paragraphStarted) { paragraph.append(line).append('\n'); } if (end.matcher(line).matches()) { paragraphStarted = false; } if (!paragraphStarted && paragraph.length() > 0) { return parser.parse(paragraph.toString()); } } else { // handle end of file if (paragraphStarted) { if (paragraph.length() > 0) { return parser.parse(paragraph.toString()); } } else { return null; } } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy