// jvntextpro.data.TrainDataGenerating Maven / Gradle / Ivy
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvntextpro.data;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
/**
 * Base class for turning tagged input data into a training file.
 *
 * <p>Sentences are obtained from {@link #reader}; for every position of every
 * sentence the context string produced by {@link #tagger} is written together
 * with the tag stored at that position. Subclasses provide the concrete
 * reader and tagger in {@link #init()}.
 */
public abstract class TrainDataGenerating {

    /** The reader used to load sentences from an input file. */
    protected DataReader reader;

    /** The tagger used to build the context string of each position. */
    protected TaggingData tagger;

    /**
     * Initialize reader, tagger for reading input data and generating context
     * predicates for each observation.
     */
    public abstract void init();

    /**
     * Generate train data.
     *
     * <p>If {@code inputPath} is a regular file it is read on its own; if it is
     * a directory every entry listed in it is read and the results are
     * concatenated. If it is neither, no sentences are read and the output
     * file contains no sentence lines. The output is written as UTF-8 to
     * {@code outputPath + ".tagged"}: one line per position consisting of the
     * context string, a space and the tag, with an empty line after each
     * sentence.
     *
     * <p>Any exception raised while reading, generating or writing is reported
     * on the console (message plus stack trace) and is not rethrown.
     *
     * @param inputPath the input path (file or directory)
     * @param outputPath the output path; the suffix {@code .tagged} is appended
     */
    @SuppressWarnings("unchecked") // the reader's result is cast to List<Sentence>
    public void generateTrainData(String inputPath, String outputPath){
        try {
            File file = new File(inputPath);
            List<Sentence> data = new ArrayList<Sentence>();

            if (file.isFile()) {
                System.out.println("Reading " + file.getName());
                data = (List<Sentence>) reader.readFile(inputPath);
            }
            else if (file.isDirectory()) {
                String[] filenames = file.list();
                if (filenames == null) {
                    // File.list() yields null when the directory cannot be listed;
                    // fail with a clear message instead of a bare NullPointerException.
                    throw new IOException("Cannot list directory: " + file.getPath());
                }
                for (String filename : filenames) {
                    System.out.println("Reading " + filename);
                    List<Sentence> temp = (List<Sentence>) reader.readFile(
                            file.getPath() + File.separator + filename);
                    data.addAll(temp);
                }
            }

            System.out.println(data.size() + " sentences read");

            // The whole output is assembled in memory first, so the output file is
            // only created once every sentence has been processed successfully.
            StringBuilder result = new StringBuilder();
            for (int i = 0; i < data.size(); ++i) {
                if (i % 20 == 0) {
                    System.out.println("Finished " + i + " in " + data.size() + " sentences");
                }

                Sentence sent = data.get(i);
                for (int j = 0; j < sent.size(); ++j) {
                    String context = tagger.getContextStr(sent, j);
                    String line = context + " " + sent.getTagAt(j);
                    result.append(line).append('\n');
                }
                // An empty line separates consecutive sentences.
                result.append('\n');
            }

            writeUtf8(outputPath + ".tagged", result.toString());
        }
        catch (Exception e) {
            System.out.println("Error while generating training data");
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Writes {@code text} to {@code path} as UTF-8, closing the writer even
     * when the write fails.
     *
     * @param path the file to create or overwrite
     * @param text the content to write
     * @throws IOException if the file cannot be opened or written
     */
    private static void writeUtf8(String path, String text) throws IOException {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(path), "UTF-8"));
        try {
            writer.write(text);
        }
        finally {
            writer.close();
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy