edu.emory.mathcs.nlp.common.constituent.CTReader Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2014, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.mathcs.nlp.common.constituent;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import java.util.StringTokenizer;
import edu.emory.mathcs.nlp.common.constant.StringConst;
/**
* Constituent tree reader.
* @see CTTree
* @author Jinho D. Choi ({@code [email protected]})
*/
public class CTReader
{
    /** Line-counting reader over the current input; {@code null} until {@link #open(InputStream, String)} succeeds. */
    private LineNumberReader f_reader;
    /** Tokens of the most recently read line that have not been consumed yet (FIFO order). */
    private Deque<String> d_tokens;

    /**
     * Creates a reader with no input attached.
     * {@link #open(InputStream)} must be called before any tree is requested;
     * otherwise the token buffer is still {@code null} and reading fails with a {@link NullPointerException}.
     */
    public CTReader() {}

    /** @param in internally wrapped by {@code new LineNumberReader(new InputStreamReader(new BufferedInputStream(in), "UTF-8"))}. */
    public CTReader(InputStream in)
    {
        open(in);
    }

    /**
     * Opens the input stream using the UTF-8 charset.
     * @param in internally wrapped by {@code new LineNumberReader(new InputStreamReader(new BufferedInputStream(in), "UTF-8"))}.
     */
    public void open(InputStream in)
    {
        open(in, "UTF-8");
    }

    /**
     * Opens the input stream with the specific charset and resets the token buffer.
     * If the charset is not supported, the stack trace is printed and the previous
     * reader and token buffer (if any) are left untouched.
     * @param in the input stream to read constituent trees from.
     * @param charsetName the name of the charset used to decode the stream.
     */
    public void open(InputStream in, String charsetName)
    {
        try
        {
            f_reader = new LineNumberReader(new InputStreamReader(new BufferedInputStream(in), charsetName));
            d_tokens = new ArrayDeque<>();
        }
        catch (UnsupportedEncodingException e) {e.printStackTrace();}
    }

    /** Closes the current reader; does nothing if no reader has been opened. */
    public void close()
    {
        if (f_reader != null)
        {
            try
            {
                f_reader.close();
            }
            catch (IOException e) {e.printStackTrace();}
        }
    }

    /** @return a list of all constituent trees in the input stream, read until {@link #nextTree()} returns {@code null}. */
    public List<CTTree> getTreeList()
    {
        List<CTTree> trees = new ArrayList<>();
        CTTree tree;

        while ((tree = nextTree()) != null)
        {
            trees.add(tree);
        }

        return trees;
    }

    /**
     * @return the next tree if exists; otherwise, {@code null}.
     * Returns {@code null} if the next tree is incomplete or erroneous
     * (an error message with the line number is printed to {@code System.err}).
     * NOTE(review): the original documentation states that antecedents of all co-indexed
     * empty categories are linked automatically; this method only calls {@code new CTTree(root)},
     * so any such linking must happen inside that constructor - confirm against {@link CTTree}.
     */
    public CTTree nextTree()
    {
        String token = nextToken();

        if (token == null)
            return null;

        if (!token.equals(StringConst.LRB))
        {
            System.err.println("Error: \""+token+"\" found, \"(\" expected - line "+f_reader.getLineNumber());
            return null;
        }

        // the opening bracket consumed above counts as the first open bracket
        int nBrackets = 1, startLine = f_reader.getLineNumber();
        CTNode root = new CTNode(CTTagEn.TOP, null);
        CTNode curr = root, node;
        String tags;

        while ((token = nextToken()) != null)
        {
            // an explicit top-level label is skipped because the root already carries that tag
            if (nBrackets == 1 && token.equals(CTTagEn.TOP))
                continue;

            if (token.equals(StringConst.LRB))
            {
                tags = nextToken();

                // the input ended right after "(": the tree is incomplete, report it as a mismatch
                if (tags == null)
                    break;

                node = new CTNode(tags);
                curr.addChild(node);
                curr = node;
                nBrackets++;
            }
            else if (token.equals(StringConst.RRB))
            {
                curr = curr.getParent();
                nBrackets--;
            }
            else
            {
                curr.setWordForm(token);
            }

            // every opened bracket has been closed: the tree is complete
            if (nBrackets == 0)
                return new CTTree(root);
        }

        System.err.println("Error: brackets mismatch - starting line "+startLine);
        return null;
    }

    /**
     * @return the next tree after skipping the specific number of trees if exists; otherwise, {@code null}.
     * Also returns {@code null} as soon as any skipped tree cannot be read, or if {@code skip} is negative.
     * @param skip the number of trees to skip.
     */
    public CTTree nextTree(int skip)
    {
        CTTree tree = null;

        // reads skip+1 trees in total and keeps only the last one
        for (int i=0; i<=skip; i++)
        {
            tree = nextTree();
            if (tree == null) return null;
        }

        return tree;
    }

    /**
     * Called by {@link #nextTree()}.
     * Refills the token buffer one line at a time, skipping blank lines iteratively
     * so that a long run of blank lines cannot exhaust the call stack.
     * @return the next token (a bracket or a run of non-delimiter characters);
     * {@code null} at the end of the stream or if reading fails (the stack trace is printed).
     */
    private String nextToken()
    {
        while (d_tokens.isEmpty())
        {
            String line;

            try
            {
                line = f_reader.readLine();
            }
            catch (IOException e)
            {
                e.printStackTrace();
                return null;
            }

            if (line == null)
                return null;

            // brackets are returned as tokens of their own; whitespace delimiters trim to empty strings and are dropped
            StringTokenizer tok = new StringTokenizer(line.trim(), "() \t\n\r\f", true);

            while (tok.hasMoreTokens())
            {
                String str = tok.nextToken().trim();
                if (!str.isEmpty()) d_tokens.add(str);
            }
        }

        return d_tokens.pop();
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy