All Downloads are FREE. Search and download functionalities are using the official Maven repository.

jvntextpro.data.TrainDataGenerating Maven / Gradle / Ivy

/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */
package jvntextpro.data;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;

/**
 * Base class for generating training data from tagged input sentences.
 *
 * <p>Subclasses implement {@link #init()} to set up {@link #reader} and
 * {@link #tagger}; {@link #generateTrainData(String, String)} then reads the
 * input, asks the tagger for the context string of every position of every
 * sentence, and writes the result to a single UTF-8 file.
 */
public abstract class TrainDataGenerating {
	
	/** The reader used to load sentences from an input file. */
	protected DataReader reader;
	
	/** The tagger used to build the context string of each sentence position. */
	protected TaggingData tagger;
	
	/**
	 * Initialize reader, tagger for reading input data and generating context
	 * predicates for each observation.
	 */
	public abstract void init();
	
	/**
	 * Generate train data.
	 *
	 * <p>If {@code inputPath} is a regular file it is read directly; if it is a
	 * directory every entry in it is read and the sentences are concatenated.
	 * For each position of each sentence one line
	 * {@code <context> <tag>} is produced, and sentences are separated by an
	 * empty line. The output goes to {@code outputPath + ".tagged"}.
	 *
	 * <p>Any exception is caught and reported on standard output; nothing is
	 * propagated to the caller.
	 *
	 * @param inputPath the input path (file or directory)
	 * @param outputPath the output path; the suffix {@code .tagged} is appended
	 */
	public void generateTrainData(String inputPath, String outputPath){
		try{
			File file = new File(inputPath);
			ArrayList<Sentence> data = new ArrayList<Sentence>();
			if (file.isFile()){
				System.out.println("Reading " + file.getName());
				data = readSentences(inputPath);
			}
			else if (file.isDirectory()){
				String [] filenames = file.list();
				// File.list() returns null when the directory cannot be listed.
				if (filenames == null){
					throw new java.io.IOException("Cannot list directory " + file.getPath());
				}
				for (String filename: filenames){
					System.out.println("Reading " + filename);
					data.addAll(readSentences(file.getPath() + File.separator + filename));
				}
			}
			
			// The whole output is built in memory first, so the output file is
			// only created once every sentence has been processed successfully.
			StringBuilder result = new StringBuilder();
			System.out.println(data.size() + "sentences read");
			for (int i = 0; i < data.size(); ++i){
				if (i % 20 == 0) System.out.println("Finished " + i + " in " + data.size() + " sentences");
				Sentence sent = data.get(i);
				
				for (int j = 0; j < sent.size(); ++j){
					String context = tagger.getContextStr(sent, j);
					result.append(context).append(' ').append(sent.getTagAt(j)).append('\n');
				}
				// Blank line marks the end of a sentence.
				result.append('\n');
			}
			
			BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(outputPath + ".tagged"), "UTF-8"));
			try{
				writer.write(result.toString());
			}
			finally{
				// Close even when the write fails so the file handle is not leaked.
				writer.close();
			}
		}
		catch (Exception e){
			System.out.println("Error while generating training data");
			System.out.println(e.getMessage());
			e.printStackTrace();
		}
	}
	
	/**
	 * Reads one input file through {@link #reader}.
	 *
	 * @param path path of the file to read
	 * @return the sentences returned by the reader
	 */
	// The cast mirrors the original code, which treats the reader's result as
	// an ArrayList; the element type is Sentence because that is how the
	// elements are consumed in generateTrainData.
	@SuppressWarnings("unchecked")
	private ArrayList<Sentence> readSentences(String path){
		return (ArrayList<Sentence>) reader.readFile(path);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy