jvntextpro.JVnTextProTest Maven / Gradle / Ivy
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0); if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvntextpro;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import jvntextpro.conversion.CompositeUnicode2Unicode;
import org.kohsuke.args4j.*;
// TODO: Auto-generated Javadoc
/**
* The Class JVnTextProTest.
*/
public class JVnTextProTest {
/**
* The main method.
*
* @param args the arguments
*/
public static void main(String [] args){
JVnTextProTestOption option = new JVnTextProTestOption();
CmdLineParser parser = new CmdLineParser(option);
if (args.length == 0) {
System.out.println("JVnTextProTest [options...] [arguments..]");
parser.printUsage(System.out);
return;
}
JVnTextPro vnTextPro = new JVnTextPro();
CompositeUnicode2Unicode conversion = new CompositeUnicode2Unicode();
try {
parser.parseArgument(args);
vnTextPro.initSenTokenization();
if (option.doSenSeg){
vnTextPro.initSenSegmenter(option.modelDir.getPath() + File.separator + "jvnsensegmenter");
}
if (option.doSenToken){
vnTextPro.initSenTokenization();
}
if (option.doWordSeg){
vnTextPro.initSegmenter(option.modelDir.getPath() + File.separator + "jvnsegmenter");
}
if (option.doPosTagging){
vnTextPro.initPosTagger(option.modelDir.getPath() + File.separator + "jvnpostag" + File.separator + "maxent");
}
if (option.inFile.isFile()){
//REad file, process and write file
BufferedReader reader = new BufferedReader(new InputStreamReader(
new FileInputStream(option.inFile), "UTF-8"));
String line, ret = "";
while ((line = reader.readLine()) != null){
line = conversion.convert(line);
ret += vnTextPro.process(line).trim() + "\n";
}
reader.close();
//write file
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(option.inFile.getPath() + ".pro"), "UTF-8"));
writer.write(ret);
writer.close();
}
else if (option.inFile.isDirectory()){
File [] childs = option.inFile.listFiles();
for (File child : childs){
if (!child.getName().endsWith(option.fileType)) continue;
//REad file, process and write file
BufferedReader reader = new BufferedReader(new InputStreamReader(
new FileInputStream(child), "UTF-8"));
String line, ret = "";
while ((line = reader.readLine()) != null){
line = conversion.convert(line);
ret += vnTextPro.process(line).trim() + "\n";
}
reader.close();
//write file
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(child.getPath() + ".pro"), "UTF-8"));
writer.write(ret);
writer.close();
}
}
}
catch(CmdLineException cle){
System.out.println("JVnTextProTest [options...] [arguments..]");
parser.printUsage(System.out);
}
catch (Exception e){
System.out.println(e.getMessage());
e.printStackTrace();
}
}
}
class JVnTextProTestOption{
@Option(name="-modeldir", usage="(required) Specify model directory, which is the folder containing model directories of subproblem tools (Word Segmentation, POS Tag)")
File modelDir;
@Option(name="-senseg", usage=" (optional) Specify if doing sentence segmentation is set or not, not set by default")
boolean doSenSeg = false;
@Option(name="-wordseg", usage = "(optional) Specify if doing word segmentation is set or not, not set by default")
boolean doWordSeg = false;
@Option(name="-sentoken", usage = "(optional) Specify if doing sentence tokenization is set or not, not set by default")
boolean doSenToken = false;
@Option(name="-postag", usage = "(optional) Specify if doing pos tagging or not is set or not, not set by default")
boolean doPosTagging = false;
@Option(name="-input", usage="(required) Specify input file/directory")
File inFile;
@Option(name="-filetype", usage=" (optional) Specify file types to process (in the case -input is a directory")
String fileType=".txt";
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy