All downloads are free. The search and download functionality uses the official Maven repository.

edu.berkeley.nlp.tokenizer.PTBLineLexer Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
/**
 * 
 */
package edu.berkeley.nlp.tokenizer;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import edu.berkeley.nlp.util.IOUtils;
import edu.berkeley.nlp.util.Iterators;
import edu.berkeley.nlp.util.StringUtils;

/**
 * Similar to PTBLexer, but instead of reading from a Reader, this class is given a single line
 * and returns a list of the token Strings found in it.
 * @author petrov
 *
 */
public class PTBLineLexer extends PTBLexer implements LineTokenizer {

	/**
	 * Creates a line lexer that is not attached to any input reader: the
	 * superclass constructor is invoked with a {@code null} {@link java.io.Reader}.
	 */
	public PTBLineLexer(){
		// The cast gives the null argument a static type of java.io.Reader;
		// presumably this picks one of several PTBLexer constructors -- TODO confirm.
		super((java.io.Reader)null);
	}

	/**
	 * Tokenizes a single line of text.
	 * <p>
	 * The line is wrapped in a {@link StringReader} and handed to a new
	 * {@code PTBTokenizer} (constructed with {@code true} as its second argument);
	 * every element that tokenizer yields is then converted with {@code toString()}.
	 *
	 * @param line the text to tokenize
	 * @return a new list holding the string form of each token, in the order produced
	 */
	public List tokenize(String line) {
		StringReader lineReader = new StringReader(line);
		List rawTokens = new PTBTokenizer(lineReader, true).tokenize();

		List<String> tokenStrings = new ArrayList<String>();
		for (Object rawToken : rawTokens) {
			tokenStrings.add(rawToken.toString());
		}
		return tokenStrings;
	}
	
	public List tokenizeLine(String line) throws IOException{
		LinkedList tokenized = new LinkedList();
		int nEl = line.length();
		char[] array = line.toCharArray();
		yy_buffer = line.toCharArray();//new char[nEl+1];
		//for(int i=0;i tokenizeLine = tokenizer.tokenizeLine(line);
				if (tokenizeLine.get(tokenizeLine.size() - 1) == null) tokenizeLine.remove(tokenizeLine.size() - 1);
				System.out.println(StringUtils.join(tokenizeLine));
			}
		}
		catch (IOException e)
		{
			// TODO Auto-generated catch block
			throw new RuntimeException(e);
			
		}
	}


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy