jvnsegmenter.WordDataReader Maven / Gradle / Ivy

Go to download
/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with  JVnTextPro-v.2.0); if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */
package jvnsegmenter;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import jvntextpro.data.DataReader;
import jvntextpro.data.Sentence;

// TODO: Auto-generated Javadoc
/**
 * The Class WordDataReader.
 */
public class WordDataReader extends DataReader{
	
	/** The is train reading. */
	protected boolean isTrainReading = false;
	
	/** The tags. */
	protected String [] tags = {"B-W", "I-W", "O"};
	
	//------------------------------------
	// Constructor
	//------------------------------------
	/**
	 * Instantiates a new word data reader.
	 */
	public WordDataReader(){
		// do nothing now
		isTrainReading = false;
	}
	
	/**
	 * Instantiates a new word data reader.
	 *
	 * @param isTrainReading the is train reading
	 */
	public WordDataReader(boolean isTrainReading){
		this.isTrainReading = isTrainReading;
	}	
	
	//-------------------------------------
	// Override methods
	//-------------------------------------
	/* (non-Javadoc)
	 * @see jvntextpro.data.DataReader#readFile(java.lang.String)
	 */
	@Override
	public List readFile(String datafile){
		try{
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					new FileInputStream(datafile), "UTF-8"));
			
			String line = null;
			List data = new ArrayList();
			
			while ((line = reader.readLine()) != null){				
				Sentence sentence = new Sentence();
				
				if (line.startsWith("#"))
					continue;
				
				StringTokenizer tk = new StringTokenizer(line, " ");	
				while (tk.hasMoreTokens()){
					String word = "", tag = null;
					if (!isTrainReading){
						String token = tk.nextToken();
						word = token;
						sentence.addTWord(word, tag);
					}
					else {
						String token = tk.nextToken();
						for (int i = 0; i < tags.length; ++i){
							String labelPart = "/" + tags[i];
							if (token.endsWith(labelPart)){
								word = token.substring(0, token.length() - labelPart.length());
								tag = tags[i];
								break;
							}
						}
						sentence.addTWord(word, tag);
					}
				}				
				data.add(sentence);
			}
			reader.close();
			return data;
		}
		catch (Exception e){
			System.out.println(e.getMessage());
			return new ArrayList(); // empty array
		}
	}
	
	/* (non-Javadoc)
	 * @see jvntextpro.data.DataReader#readString(java.lang.String)
	 */
	public List readString(String dataStr){
		String [] lines = dataStr.split("\n");
		List data = new ArrayList();
		for (String line : lines){
			Sentence sentence = new Sentence();
			
			if (line.startsWith("#"))
				continue;
			
			StringTokenizer tk = new StringTokenizer(line, " ");	
			while (tk.hasMoreTokens()){
				String word = "", tag = null;
				if (!isTrainReading){
					String token = tk.nextToken();
					word = token;
					sentence.addTWord(word, tag);
				}
				else {
					String token = tk.nextToken();
					StringTokenizer sltk = new StringTokenizer(token, "/");
					word = sltk.nextToken();
					tag = sltk.nextToken();
					sentence.addTWord(word, tag);
				}
			}	
			data.add(sentence);
		}
		
		return data;
	}
}