// jvntokenizer.JVnTokenizer (Maven / Gradle / Ivy)
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvntokenizer;
/**
*
* @author Nguyen Cam Tu
*/
import java.io.*;
// TODO: Auto-generated Javadoc
/**
* The Class JVnTokenizer.
*/
public class JVnTokenizer {
/**
* The main method.
*
* @param args the arguments
*/
public static void main(String [] args){
if (args.length != 2){
displayHelp();
return;
}
//Read the input data
try{
String option = args[0];
if (option.equalsIgnoreCase("-inputfile")){
BufferedReader in = new BufferedReader(
new InputStreamReader(new FileInputStream(args[1]), "UTF-8"));
BufferedWriter out = new BufferedWriter(
new OutputStreamWriter(new FileOutputStream(args[1] + ".tkn") , "UTF-8"));
String line = "";
while ((line = in.readLine()) != null){
out.write(PennTokenizer.tokenize(line));
out.write("\n");
}
in.close();
out.close();
}
else if (option.equalsIgnoreCase("-inputdir")){
System.out.println("Tokenize input");
//segment only files ends with .sent
File inputDir = new File(args[1]);
File [] childrent = inputDir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(".sent");
}
});
for (int i = 0; i < childrent.length; ++i){
BufferedReader in = new BufferedReader(
new InputStreamReader(new FileInputStream(childrent[i]), "UTF-8"));
BufferedWriter out = new BufferedWriter(
new OutputStreamWriter(new FileOutputStream(childrent[i] + ".tkn") , "UTF-8"));
String line = "";
while ((line = in.readLine()) != null){
out.write(PennTokenizer.tokenize(line));
out.write("\n");
}
in.close();
out.close();
}
}
} catch (Exception e){
System.out.println("Error:" + e.getMessage());
}
}
/**
* Display help.
*/
public static void displayHelp(){
System.out.println("Usage:");
System.out.println("\tCase 1: JVnTokenizer -inputfile ");
System.out.println("\tCase 2: JVnTokenizer -inputdir ");
System.out.println("Where:");
System.out.println("\t is the file containing input text that need to");
System.out.println("\thave sentences tokenized (each sentence on a line)");
System.out.println("\t is the directory containing multiple input .sent files");
System.out.println();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy