jvntextpro.JVnTextProTest Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
There is a newer version: 2.2.1
Show newest version
/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */
package jvntextpro;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import jvntextpro.conversion.CompositeUnicode2Unicode;

import org.kohsuke.args4j.*;

/**
 * Command-line driver for {@link JVnTextPro}.
 *
 * <p>Parses the options declared in {@link JVnTextProTestOption}, initialises the
 * requested processing steps, and then processes either a single input file or
 * every file in an input directory whose name ends with the configured file
 * type. For each processed file the result is written next to it, using the
 * input path with ".pro" appended. Files are read and written as UTF-8.
 */
public class JVnTextProTest {

	/** Character encoding used for reading input and writing output. */
	private static final String ENCODING = "UTF-8";

	/** Suffix appended to an input file's path to form its output path. */
	private static final String OUTPUT_SUFFIX = ".pro";

	/**
	 * Program entry point.
	 *
	 * <p>With no arguments, or when the arguments cannot be parsed or a needed
	 * option is missing, the usage text is printed to standard output. Any other
	 * failure is reported by printing its message and stack trace.
	 *
	 * @param args the command-line arguments
	 */
	public static void main(String [] args){
		JVnTextProTestOption option = new JVnTextProTestOption();
		CmdLineParser parser = new CmdLineParser(option);

		if (args.length == 0) {
			printUsage(parser);
			return;
		}

		JVnTextPro vnTextPro = new JVnTextPro();
		CompositeUnicode2Unicode conversion = new CompositeUnicode2Unicode();

		try {
			parser.parseArgument(args);

			// Fail with a readable message instead of a NullPointerException
			// when an option the run depends on was not supplied.
			if (option.inFile == null) {
				System.out.println("Missing required option: -input");
				printUsage(parser);
				return;
			}
			boolean needsModelDir = option.doSenSeg || option.doWordSeg || option.doPosTagging;
			if (needsModelDir && option.modelDir == null) {
				System.out.println("Missing required option: -modeldir");
				printUsage(parser);
				return;
			}

			// NOTE(review): sentence tokenization is initialised unconditionally
			// here, so the -sentoken switch below does not change which steps are
			// initialised. Kept as-is to preserve existing behaviour; confirm
			// whether the unconditional call is intended.
			vnTextPro.initSenTokenization();

			if (option.doSenSeg) {
				vnTextPro.initSenSegmenter(option.modelDir.getPath() + File.separator + "jvnsensegmenter");
			}

			if (option.doSenToken) {
				vnTextPro.initSenTokenization();
			}

			if (option.doWordSeg) {
				vnTextPro.initSegmenter(option.modelDir.getPath() + File.separator + "jvnsegmenter");
			}

			if (option.doPosTagging) {
				vnTextPro.initPosTagger(option.modelDir.getPath() + File.separator + "jvnpostag" + File.separator + "maxent");
			}

			if (option.inFile.isFile()) {
				processFile(option.inFile, vnTextPro, conversion);
			}
			else if (option.inFile.isDirectory()) {
				File [] children = option.inFile.listFiles();

				// listFiles() returns null when the directory cannot be listed.
				if (children == null) {
					System.out.println("Cannot list directory: " + option.inFile.getPath());
					return;
				}

				for (File child : children) {
					if (!child.getName().endsWith(option.fileType)) {
						continue;
					}
					processFile(child, vnTextPro, conversion);
				}
			}
			else {
				System.out.println("Input path is neither a file nor a directory: " + option.inFile.getPath());
			}
		}
		catch (CmdLineException cle) {
			printUsage(parser);
		}
		catch (Exception e) {
			System.out.println(e.getMessage());
			e.printStackTrace();
		}
	}

	/**
	 * Prints the command synopsis followed by the option descriptions.
	 *
	 * @param parser the parser whose options are listed
	 */
	private static void printUsage(CmdLineParser parser) {
		System.out.println("JVnTextProTest [options...] [arguments..]");
		parser.printUsage(System.out);
	}

	/**
	 * Reads one file line by line, converts and processes each line, and writes
	 * the trimmed results (one per line) to the input path with ".pro" appended.
	 *
	 * <p>The whole result is buffered before the output file is opened, so no
	 * output file is created when reading or processing fails. Both streams are
	 * closed even if an exception is thrown.
	 *
	 * @param input the file to process
	 * @param vnTextPro the initialised text processor
	 * @param conversion the converter applied to each line before processing
	 * @throws Exception if reading, processing, or writing fails; declared
	 *         broadly because the checked exceptions of the processing calls
	 *         are not visible here, and the caller handles all exceptions
	 */
	private static void processFile(File input, JVnTextPro vnTextPro,
			CompositeUnicode2Unicode conversion) throws Exception {
		StringBuilder result = new StringBuilder();

		BufferedReader reader = new BufferedReader(new InputStreamReader(
				new FileInputStream(input), ENCODING));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				line = conversion.convert(line);
				result.append(vnTextPro.process(line).trim()).append("\n");
			}
		}
		finally {
			reader.close();
		}

		BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
				new FileOutputStream(input.getPath() + OUTPUT_SUFFIX), ENCODING));
		try {
			writer.write(result.toString());
		}
		finally {
			writer.close();
		}
	}

}

/**
 * Holder for the command-line options of {@link JVnTextProTest}.
 *
 * <p>Each field is bound to a command-line switch through its {@code @Option}
 * annotation. Boolean switches are {@code false} unless given on the command
 * line.
 */
class JVnTextProTestOption {

	/** Directory containing the model directories of the individual tools. */
	@Option(
		name = "-modeldir",
		usage = "(required) Specify model directory, which is the folder containing model directories of subproblem tools (Word Segmentation, POS Tag)")
	File modelDir;

	/** Whether sentence segmentation was requested. */
	@Option(
		name = "-senseg",
		usage = " (optional) Specify if doing sentence segmentation is set or not, not set by default")
	boolean doSenSeg;

	/** Whether word segmentation was requested. */
	@Option(
		name = "-wordseg",
		usage = "(optional) Specify if doing word segmentation is set or not, not set by default")
	boolean doWordSeg;

	/** Whether sentence tokenization was requested. */
	@Option(
		name = "-sentoken",
		usage = "(optional) Specify if doing sentence tokenization is set or not, not set by default")
	boolean doSenToken;

	/** Whether part-of-speech tagging was requested. */
	@Option(
		name = "-postag",
		usage = "(optional) Specify if doing pos tagging or not is set or not, not set by default")
	boolean doPosTagging;

	/** Input file, or directory of input files, to process. */
	@Option(
		name = "-input",
		usage = "(required) Specify input file/directory")
	File inFile;

	/** File-name suffix selecting which files of an input directory are processed. */
	@Option(
		name = "-filetype",
		usage = " (optional) Specify file types to process (in the case -input is a directory")
	String fileType = ".txt";
}