edu.berkeley.nlp.tokenizer.PTBLineLexer Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!

/**
 * 
 */
package edu.berkeley.nlp.tokenizer;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import edu.berkeley.nlp.util.IOUtils;
import edu.berkeley.nlp.util.Iterators;
import edu.berkeley.nlp.util.StringUtils;

/**
 * Similar to PTBLexer. However, instead of reading from a Reader this class is given a line
 * and returns a list of tokenized Strings.
 * @author petrov
 *
 */
public class PTBLineLexer extends PTBLexer implements LineTokenizer {

	/**
	 * Creates a line lexer that is not attached to any input stream: the
	 * superclass constructor is invoked with a {@code null}
	 * {@link java.io.Reader}.
	 */
	public PTBLineLexer(){
		// NOTE(review): the explicit cast presumably selects the Reader overload of the
		// PTBLexer constructor (a bare null could be ambiguous) — confirm against PTBLexer.
		super((java.io.Reader)null);
	}

	/**
	 * Tokenizes a single line by running a {@code PTBTokenizer} over it and
	 * converting every element it produces to its string form.
	 *
	 * @param line the text to tokenize
	 * @return a new, mutable list holding the {@code toString()} value of each
	 *         element returned by the tokenizer, in the order it produced them
	 */
	public List<String> tokenize(String line) {
	    // PTBTokenizer is used as declared in the original code; its own type
	    // parameters (if any) are not visible from this file.
	    PTBTokenizer toker = new PTBTokenizer(new StringReader(line), true);
	    // Read-only view of the tokenizer output: only toString() is needed.
	    List<?> elems = toker.tokenize();
	    // Presize the result, since exactly one string is added per element.
	    List<String> toks = new ArrayList<String>(elems.size());
	    for (Object o : elems) {
	        toks.add(o.toString());
	    }
	    return toks;
	}
	
	// Start of tokenizeLine(String): the visible part allocates an empty result
	// list and stores a character copy of the line into yy_buffer.
	// NOTE(review): yy_buffer is not declared in this file; presumably it is the
	// scanner buffer inherited from PTBLexer — confirm.
	public List tokenizeLine(String line) throws IOException{
		LinkedList tokenized = new LinkedList();
		int nEl = line.length();
		char[] array = line.toCharArray();
		yy_buffer = line.toCharArray();//new char[nEl+1];
		// NOTE(review): the text is truncated on the next line — everything between
		// "i" and "tokenizeLine =" (the rest of this method and the beginning of the
		// following one) is missing from this copy and must be restored from the
		// original source before this file can compile.
		//for(int i=0;i tokenizeLine = tokenizer.tokenizeLine(line);
				// Drop a trailing null element, if present, before printing the tokens.
				if (tokenizeLine.get(tokenizeLine.size() - 1) == null) tokenizeLine.remove(tokenizeLine.size() - 1);
				System.out.println(StringUtils.join(tokenizeLine));
			}
		}
		catch (IOException e)
		{
			// TODO Auto-generated catch block
			// The I/O failure is rethrown unchecked, keeping the original exception as its cause.
			throw new RuntimeException(e);
			
		}
	}


}