All downloads are free. Search and download functionality uses the official Maven repository.

jvnpostag.POSTagging Maven / Gradle / Ivy

Go to download

HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.

There is a newer version: 2.2.1
Show newest version
/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

package jvnpostag;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.OutputStreamWriter;

/**
 * Command-line entry point for part-of-speech tagging.
 *
 * <p>Expected arguments, in this exact order:
 * <pre>
 *   -tagger &lt;crfs|maxent&gt; -modeldir &lt;model directory&gt; -inputfile &lt;input data file&gt;
 *   -tagger &lt;crfs|maxent&gt; -modeldir &lt;model directory&gt; -inputdir  &lt;input data directory&gt;
 * </pre>
 * For every input file {@code X} the tagged text is written, UTF-8 encoded, to
 * {@code X.pos}. In {@code -inputdir} mode only entries whose names end with
 * {@code .wseg} are processed.
 */
public class POSTagging {

    /** Suffix appended to an input path to form the path of its tagged output file. */
    private static final String OUTPUT_SUFFIX = ".pos";

    /** In {@code -inputdir} mode only names ending with this suffix are picked up. */
    private static final String INPUT_SUFFIX = ".wseg";

    /** Encoding used for every output file. */
    private static final String OUTPUT_ENCODING = "UTF-8";

    /**
     * Prints the banner, validates the arguments, builds the requested tagger
     * and tags either a single file or every matching file of a directory.
     *
     * <p>Any exception raised while tagging is reported on standard output
     * ("Error while tagging" followed by the exception message) and ends the
     * run; files that come after the failing one are not processed.
     *
     * @param args command-line arguments, see the class description
     */
    public static void main(String [] args){
        displayCopyright();

        if (!checkArgs(args)) {
            displayHelp();
            return;
        }

        String modelDir = args[3];

        // checkArgs only lets "crfs" or "maxent" through, so the else branch is maxent.
        POSTagger tagger;
        if (args[1].equalsIgnoreCase("crfs")) {
            tagger = new CRFTagger(modelDir);
        } else {
            tagger = new MaxentTagger(modelDir);
        }

        try {
            if (args[4].equalsIgnoreCase("-inputfile")) {
                tagFile(tagger, new File(args[5]));
            } else {
                tagDirectory(tagger, new File(args[5]));
            }
        } catch (Exception e) {
            System.out.println("Error while tagging");
            System.out.println(e.getMessage());
        }
    }

    /**
     * Tags one file and writes the result next to it as {@code <path>.pos}.
     *
     * <p>Tagging runs before the output file is opened, so a failing tagger
     * does not leave an empty output file behind, and the writer is always
     * closed, even when writing fails.
     *
     * @param tagger    tagger to run
     * @param inputFile file to tag
     * @throws Exception whatever the tagger or the file I/O raises; declared
     *         broadly because the tagger's declared exceptions are not visible
     *         from this file
     */
    private static void tagFile(POSTagger tagger, File inputFile) throws Exception {
        String tagged = tagger.tagging(inputFile);

        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(inputFile.getPath() + OUTPUT_SUFFIX), OUTPUT_ENCODING));
        try {
            writer.write(tagged);
        } finally {
            writer.close();
        }
    }

    /**
     * Tags every regular entry of {@code dir} whose name ends with {@code .wseg}.
     *
     * @param tagger tagger to run
     * @param dir    directory holding the input files
     * @throws IllegalArgumentException if {@code dir} cannot be listed
     * @throws Exception whatever tagging an individual file raises
     */
    private static void tagDirectory(POSTagger tagger, File dir) throws Exception {
        String[] children = dir.list(new FilenameFilter() {
            public boolean accept(File parent, String name) {
                return name.endsWith(INPUT_SUFFIX);
            }
        });

        // File.list returns null (not an empty array) when the path is not a
        // directory or cannot be read.
        if (children == null) {
            throw new IllegalArgumentException(
                    "Cannot list input directory: " + dir.getPath());
        }

        for (int i = 0; i < children.length; i++) {
            // Building the path through File avoids hand-made separator handling.
            File inputFile = new File(dir, children[i]);
            if (inputFile.isDirectory()) {
                continue;
            }

            System.out.println("Tagging " + children[i]);
            tagFile(tagger, inputFile);
        }
    }

    /**
     * Checks that the arguments follow the layout
     * {@code -tagger <crfs|maxent> -modeldir <dir> (-inputfile|-inputdir) <path>}.
     *
     * <p>{@code -tagger} is matched case-sensitively; the tagger name and the
     * remaining option names are matched ignoring case. Arguments beyond the
     * sixth are not inspected.
     *
     * @param args command-line arguments; may be {@code null}
     * @return {@code true} if the first six arguments are well-formed
     */
    public static boolean checkArgs(String[] args) {
        if (args == null || args.length < 6) {
            return false;
        }

        if (!args[0].equals("-tagger")) {
            return false;
        }

        if (!args[1].equalsIgnoreCase("crfs") && !args[1].equalsIgnoreCase("maxent")) {
            return false;
        }

        if (!args[2].equalsIgnoreCase("-modeldir")) {
            return false;
        }

        return args[4].equalsIgnoreCase("-inputfile")
                || args[4].equalsIgnoreCase("-inputdir");
    }

    /** Prints the program banner and copyright notice to standard output. */
    public static void displayCopyright() {
        System.out.println("Vietnamese Part-Of-Speech Tagging:");
        System.out.println("\tusing Conditional Random Fields or Maximum Entropy");
        System.out.println("\ttesting on more than 10000 sentences of Viet Treebank with the highest F1-measure of 93.27%");
        System.out.println("Copyright (C) by Cam-Tu Nguyen {1,2} and Xuan-Hieu Phan {2}");
        System.out.println("{1}: College of Technology, Hanoi National University");
        System.out.println("{2}: Graduate School of Information Sciences, Tohoku University");
        System.out.println("Email: {[email protected] ; [email protected]}");
        System.out.println();
    }

    /**
     * Prints the command-line usage to standard output.
     *
     * <p>The option values are shown as angle-bracket placeholders so that
     * each "Where:" line names the value it describes.
     */
    public static void displayHelp() {
        System.out.println("Usage:");
        System.out.println("\tCase 1: POSTagging -tagger <crfs|maxent> -modeldir <model directory> -inputfile <input data file>");
        System.out.println("\tCase 2: POSTagging -tagger <crfs|maxent> -modeldir <model directory> -inputdir <input data directory>");
        System.out.println("Where:");
        System.out.println("\t<crfs|maxent> is the tagger used for pos tagging which is either maximum entropy (maxent) or conditional random fields (crfs)");
        System.out.println("\t<model directory> is the directory contain the model and option files");
        System.out.println("\t<input data file> is the file containing input sentences that need to");
        System.out.println("\tbe tagged (each sentence on a line)");
        System.out.println("\t<input data directory> is the directory containing multiple input data files (accept files ended with .wseg)");
        System.out.println();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy