opennlp.tools.formats.ad.ADParagraphStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.ad;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.formats.ad.ADParagraphStream.ParagraphParser.Node;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;

/**
 * Stream filter which merges text lines into paragraphs, following the Árvores
 * Deitadas syntax.
 * <p>
 * Information about the format:<br>
 * Susana Afonso.
 * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica".<br>
 * 12 de Fevereiro de 2006.<br>
 * http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf
 * <p>
 * Note: Do not use this class, internal use only!
 */
public class ADParagraphStream extends
    FilterObjectStream {

  /**
   * Simple mutable holder pairing a paragraph's text with the root node of
   * its tree. Both values are {@code null} until the matching setter is called.
   */
  public static class Paragraph {

    private Node root;
    private String text;

    /**
     * @return the text stored by {@link #setText(String)}, or {@code null} if
     *         none was stored
     */
    public String getText() {
      return this.text;
    }

    /**
     * @param text the text to store for this paragraph
     */
    public void setText(String text) {
      this.text = text;
    }

    /**
     * @return the root node stored by {@link #setRoot(Node)}, or {@code null}
     *         if none was stored
     */
    public Node getRoot() {
      return this.root;
    }

    /**
     * @param root the root node of this paragraph's tree
     */
    public void setRoot(Node root) {
      this.root = root;
    }

  }

  /**
   * Parses a sample of AD corpus. A sentence in AD corpus is represented by a
   * Tree. In this class we declare some types to represent that tree.
   */
  public static class ParagraphParser {

    // The patterns below are compiled once for the class instead of once per
    // parser instance; compiled Pattern objects are immutable and can be shared.

    /**
     * Matches a whole line made of a tag pair separated by a colon, with no
     * leading level markers and no parenthesised part. Not referenced by the
     * code visible in this section.
     */
    private static final Pattern rootPattern = Pattern.compile("^[^:=]+:[^(\\s]+$");

    /**
     * Matches a node line: group 1 is the leading run of {@code =} / {@code -}
     * markers, group 2 the colon-separated tag pair, group 3 the optional
     * parenthesised part including its parentheses (group 4 is its content).
     */
    private static final Pattern nodePattern = Pattern
        .compile("^([=-]*)([^:=]+:[^\\(\\s]+)(\\(([^\\)]+)\\))?\\s*$");

    /**
     * Matches a leaf line: group 1 is the leading marker run, group 2 the
     * colon-separated tag pair, group 3 an optional quoted string inside the
     * parentheses, group 4 the optional remainder inside the parentheses and
     * group 5 the text that follows the closing parenthesis.
     */
    private static final Pattern leafPattern = Pattern
        .compile("^([=-]*)([^:=]+:[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)");

    /**
     * Same shape as {@link #leafPattern}, except that the tag pair in group 2
     * is separated by {@code =} instead of a colon.
     */
    private static final Pattern bizarreLeafPattern = Pattern
        .compile("^([=-]*)([^:=]+=[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)");

    /**
     * Matches a line consisting only of non-word characters after an optional
     * run of {@code =} markers (group 1: markers, group 2: the characters).
     */
    private static final Pattern punctuationPattern = Pattern.compile("^(=*)(\\W+)$");

    /**
     * Parses the textual form of one paragraph into a {@link Paragraph}.
     * <p>
     * Lines are read one at a time, converted into tree elements with
     * {@link #getElement(String)} and attached below a synthetic node tagged
     * "ROOT" at level 0, using a stack of the currently open nodes.
     * <p>
     * Any exception thrown while parsing is caught: the input and the stack
     * trace are printed to {@code System.err} and the paragraph is returned
     * without a root.
     * <p>
     * NOTE(review): in this copy of the file the statement following the first
     * {@code readLine()} is truncated (unterminated string literal, no visible
     * declaration of {@code nodeStack}) and the loop condition contains an
     * empty string literal. Both look like extraction damage; confirm against
     * the upstream source before relying on this method.
     *
     * @param paragraphString
     *          the raw paragraph text, one tree element per line
     * @return the paragraph; its root is set only when parsing finished
     *         without an exception
     */
    public Paragraph parse(String paragraphString) {
      BufferedReader reader = new BufferedReader(new StringReader(
          paragraphString));
      Paragraph sentence = new Paragraph();
      Node root = new Node();
      try {
        // read the first line of the paragraph
        String line = reader.readLine();
        // NOTE(review): the next line appears truncated in this copy - TODO confirm
        if (line.startsWith(" nodeStack = new Stack();
        // we get the complete line

        // the synthetic root sits at the bottom of the stack for the whole parse
        root.setSyntacticTag("ROOT");
        root.setLevel(0);
        nodeStack.add(root);
        // now we have to take care of the lastLevel. Every time it raises, we
        // will add the
        // leaf to the node at the top. If it decreases, we remove the top.
        //line = reader.readLine();
        // NOTE(review): startsWith("") is true for every string, so as written
        // this condition is always false; the literal was presumably emptied
        // during extraction - TODO confirm
        while (line.length() != 0 && line.startsWith("") == false) {
          TreeElement element = this.getElement(line);
          
          if(element != null) {
            // remove elements at same level or higher
            // (only done for elements whose own level is greater than 0)
            while (!nodeStack.isEmpty()
                && element.getLevel() > 0 && element.getLevel() <= nodeStack.peek().getLevel()) {
              nodeStack.pop();
            }
            if( element.isLeaf() ) {
              if (nodeStack.isEmpty()) {
                root.addElement(element);
  						} else {
  							// look for the node with the correct level
  							Node peek = nodeStack.peek();
  							if (element.level == 0) { // add to the root
  								nodeStack.firstElement().addElement(element);
  							} else {
  								// walk from the top of the stack towards the bottom until a
  								// node with a smaller level is found; if none is found the
  								// bottom element of the stack becomes the parent
  								Node parent = null;
  								int index = nodeStack.size() - 1;
  								while(parent == null) {
  									if(peek.getLevel() < element.getLevel()) {
  										parent = peek;
  									} else {
  										index--;
  										if(index > -1) {
  											peek = nodeStack.get(index);
  										} else {
  											parent = nodeStack.firstElement();
  										}
  									}
  								}
  								parent.addElement(element);
  							}
              }
            } else {
              // a non-leaf becomes a child of the current top of the stack and
              // is then pushed so that following elements can attach to it
              if (!nodeStack.isEmpty()) {
                nodeStack.peek().addElement(element);
              }
              nodeStack.push((Node) element);
            }
          }
          line = reader.readLine();
        }

      } catch (Exception e) {
        // best effort: report the offending input and return the paragraph
        // without a root
        System.err.println(paragraphString);
        e.printStackTrace();
        return sentence;
      }
      // second line should be SOURCE
      sentence.setRoot(root);
      return sentence;
    }

    /**
     * Parse a tree element from a AD line.
     * <p>
     * The line is tried, in this order, against the node pattern, the leaf
     * pattern and the punctuation pattern; the first one that matches the
     * whole line decides which element is built. In each case the element's
     * level is the length of the leading marker run captured by group 1.
     * <p>
     * NOTE(review): the end of this method (the handling of the "bizarre
     * cases") is truncated in this copy of the file, so what happens when none
     * of the three patterns matches cannot be determined from here.
     * 
     * @param line
     *          the AD line
     * @return the tree element
     */
    public TreeElement getElement(String line) {
      // try node
      Matcher nodeMatcher = nodePattern.matcher(line);
      if (nodeMatcher.matches()) {
        int level = nodeMatcher.group(1).length();
        String syntacticTag = nodeMatcher.group(2);
        // group 3 is the optional parenthesised part, parentheses included;
        // it is null when the line has none
        String morphologicalTag = nodeMatcher.group(3);
        Node node = new Node();
        node.setLevel(level);
        node.setSyntacticTag(syntacticTag);
        node.setMorphologicalTag(morphologicalTag);
        return node;
      }

      // try leaf
      Matcher leafMatcher = leafPattern.matcher(line);
      if (leafMatcher.matches()) {
        int level = leafMatcher.group(1).length();
        String syntacticTag = leafMatcher.group(2);
        String lemma = leafMatcher.group(3);
        String morphologicalTag = leafMatcher.group(4);
        String lexeme = leafMatcher.group(5);
        Leaf leaf = new Leaf();
        leaf.setLevel(level);
        leaf.setSyntacticTag(syntacticTag);
        leaf.setMorphologicalTag(morphologicalTag);
        leaf.setLexeme(lexeme);
        if (lemma != null) {
          // the captured lemma still carries its opening and closing quote
          // characters; drop them when there is something in between
          if (lemma.length() > 2) {
            lemma = lemma.substring(1, lemma.length() - 1);
          }
          leaf.setLemma(lemma);
        }

        return leaf;
      }

      // try punctuation: such a leaf only gets a level and a lexeme, no tags
      Matcher punctuationMatcher = punctuationPattern.matcher(line);
      if (punctuationMatcher.matches()) {
        int level = punctuationMatcher.group(1).length();
        String lexeme = punctuationMatcher.group(2);
        Leaf leaf = new Leaf();
        leaf.setLevel(level);
        leaf.setLexeme(lexeme);
        return leaf;
      }

      // process the bizarre cases
      // NOTE(review): the next line is truncated and fused with a field
      // declaration of the following class - TODO restore from upstream
      if(line.equals("_") || line.startsWith(" elems = new ArrayList();

      /**
       * Adds the given element to the end of this element's child list.
       *
       * @param element
       *          the child to add
       */
      public void addElement(TreeElement element) {
        elems.add(element);
      }

      /**
       * Returns the children added so far, in insertion order.
       *
       * @return a newly allocated array holding the child elements
       */
      public TreeElement[] getElements() {
        TreeElement[] children = new TreeElement[elems.size()];
        return elems.toArray(children);
      }

      /**
       * Renders this element on one line (one {@code =} per level, then the
       * syntactic tag, then the morphological tag when present) followed by
       * the rendering of every child in order.
       */
      @Override
      public String toString() {
        StringBuilder out = new StringBuilder();

        // indentation markers for this element's own line
        int written = 0;
        while (written < getLevel()) {
          out.append("=");
          written++;
        }

        out.append(getSyntacticTag());
        if (getMorphologicalTag() != null) {
          out.append(getMorphologicalTag());
        }
        out.append("\n");

        // children follow, each one producing its own line(s)
        for (TreeElement child : elems) {
          out.append(child.toString());
        }
        return out.toString();
      }
    }

    /**
     * Represents the AD leaf: a tree element that carries a lexeme and,
     * optionally, a lemma.
     */
    public class Leaf extends TreeElement {

      private String word;
      private String lemma;

      /**
       * @return always {@code true}
       */
      public boolean isLeaf() {
        return true;
      }

      /**
       * @param lexeme
       *          the lexeme of this leaf
       */
      public void setLexeme(String lexeme) {
        this.word = lexeme;
      }

      /**
       * @return the lexeme, or {@code null} if none was set
       */
      public String getLexeme() {
        return this.word;
      }

      /**
       * @param lemma
       *          the lemma of this leaf
       */
      public void setLemma(String lemma) {
        this.lemma = lemma;
      }

      /**
       * @return the lemma, or {@code null} if none was set
       */
      public String getLemma() {
        return this.lemma;
      }

      /**
       * Renders this leaf on a single line: one {@code =} per level, then
       * {@code tag(morphology) } when a syntactic tag is present, then the
       * lexeme and a line break.
       */
      @Override
      public String toString() {
        StringBuilder out = new StringBuilder();

        int written = 0;
        while (written < getLevel()) {
          out.append("=");
          written++;
        }

        if (getSyntacticTag() != null) {
          out.append(getSyntacticTag())
              .append("(")
              .append(getMorphologicalTag())
              .append(") ");
        }

        out.append(this.word).append("\n");
        return out.toString();
      }
    }

  }
  
  /**
   * The start paragraph pattern. A line must match it completely for a
   * paragraph to begin.
   * <p>
   * NOTE(review): as written the expression only matches a run of {@code ]}
   * followed by {@code >}; the beginning of the literal looks like it was
   * lost when the file was extracted - TODO confirm against upstream.
   */
  private static final Pattern start = Pattern.compile("]*>");

  /**
   * The end paragraph pattern. A line must match it completely for the
   * current paragraph to end.
   * <p>
   * NOTE(review): as written the expression is empty and therefore only
   * matches an empty line; the literal was presumably emptied during
   * extraction - TODO confirm against upstream.
   */
  private static final Pattern end = Pattern.compile("");

  /** Parser applied to the text of every paragraph collected by {@code read()}. */
  private ParagraphParser parser;

  /**
   * Creates a paragraph stream on top of a stream of lines.
   *
   * @param lineStream
   *          the underlying stream, handed to the superclass constructor
   */
  public ADParagraphStream(ObjectStream lineStream) {
    super(lineStream);
    this.parser = new ParagraphParser();
  }

  /**
   * Reads lines from the underlying stream until one paragraph is complete
   * and returns its parsed form.
   * <p>
   * Collection begins with a line that fully matches the start pattern and
   * stops after a line that fully matches the end pattern; both delimiter
   * lines are part of the text handed to the parser, each line terminated by
   * a line feed. When the input ends inside a paragraph, the lines collected
   * so far are parsed and returned.
   *
   * @return the next paragraph, or {@code null} when the input is exhausted
   *         and no paragraph is open
   * @throws IOException
   *           if reading from the underlying stream fails
   */
  public Paragraph read() throws IOException {

    StringBuilder collected = new StringBuilder();
    boolean insideParagraph = false;

    for (;;) {
      String line = samples.read();

      // end of input: flush an open paragraph, otherwise signal exhaustion
      if (line == null) {
        if (!insideParagraph) {
          return null;
        }
        if (collected.length() > 0) {
          return parser.parse(collected.toString());
        }
        continue;
      }

      if (start.matcher(line).matches()) {
        insideParagraph = true;
      }

      if (insideParagraph) {
        collected.append(line).append('\n');
      }

      if (end.matcher(line).matches()) {
        insideParagraph = false;
      }

      // a closed paragraph with content is ready to be parsed
      if (!insideParagraph && collected.length() > 0) {
        return parser.parse(collected.toString());
      }
    }
  }
}