jvnpostag.POSTagging Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvnpostag;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.OutputStreamWriter;
/**
 * Command-line entry point for Vietnamese part-of-speech tagging.
 *
 * <p>Expected arguments (positions are fixed):
 * <pre>
 *   -tagger &lt;crfs/maxent&gt; -modeldir &lt;model directory&gt; -inputfile &lt;input data file&gt;
 *   -tagger &lt;crfs/maxent&gt; -modeldir &lt;model directory&gt; -inputdir &lt;input data directory&gt;
 * </pre>
 * For every processed input file the tagged text is written, UTF-8 encoded,
 * to a sibling file whose name is the input path with {@code ".pos"} appended.
 */
public class POSTagging {

    /** Suffix appended to an input path to build the output path. */
    private static final String OUTPUT_SUFFIX = ".pos";

    /** Selects the files picked up in directory mode: names ending with ".wseg". */
    private static final FilenameFilter WSEG_FILTER = new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.endsWith(".wseg");
        }
    };

    /**
     * Prints the copyright banner, validates the arguments and tags either a
     * single file or every {@code .wseg} file of a directory.
     *
     * <p>Invalid arguments print the usage text and return. Any failure while
     * tagging is reported on standard output; it is not rethrown.
     *
     * @param args command-line arguments, see the class description
     */
    public static void main(String[] args) {
        displayCopyright();
        if (!checkArgs(args)) {
            displayHelp();
            return;
        }

        String modelDir = args[3];

        // checkArgs() has already restricted args[1] to "crfs" or "maxent".
        POSTagger tagger;
        if (args[1].equalsIgnoreCase("crfs")) {
            tagger = new CRFTagger(modelDir);
        } else {
            tagger = new MaxentTagger(modelDir);
        }

        try {
            if (args[4].equalsIgnoreCase("-inputfile")) {
                tagToFile(tagger, new File(args[5]));
            } else {
                // java.io.File normalises a trailing separator, so no manual
                // string trimming of the directory argument is needed.
                File dir = new File(args[5]);
                String[] children = dir.list(WSEG_FILTER);
                if (children == null) {
                    // File.list returns null when the path is not a directory
                    // or cannot be read.
                    throw new IllegalArgumentException(
                            "Cannot list input directory: " + dir.getPath());
                }
                for (int i = 0; i < children.length; i++) {
                    System.out.println("Tagging " + children[i]);
                    File input = new File(dir, children[i]);
                    if (input.isDirectory()) {
                        continue;
                    }
                    tagToFile(tagger, input);
                }
            }
        } catch (Exception e) {
            System.out.println("Error while tagging");
            // Some exceptions carry no message; fall back to the exception
            // description so the user never just sees "null".
            String detail = e.getMessage();
            System.out.println(detail != null ? detail : e.toString());
        }
    }

    /**
     * Tags one file and writes the result next to it as {@code <path>.pos}.
     *
     * <p>Tagging runs before the output file is opened so that a tagging
     * failure does not leave an empty output file behind, and the writer is
     * closed in a {@code finally} block so it is released even if writing fails.
     *
     * @param tagger the tagger to apply
     * @param input  the file to tag
     * @throws Exception any failure raised by the tagger or by file I/O; declared
     *         broadly because the tagger's exception contract is not visible here
     */
    private static void tagToFile(POSTagger tagger, File input) throws Exception {
        String tagged = tagger.tagging(input);
        if (tagged == null) {
            throw new IllegalStateException(
                    "Tagger produced no output for " + input.getPath());
        }

        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(input.getPath() + OUTPUT_SUFFIX), "UTF-8"));
        try {
            writer.write(tagged);
        } finally {
            writer.close();
        }
    }

    /**
     * Checks that the command line has the expected shape.
     *
     * <p>Requires at least six arguments with {@code -tagger} (case-sensitive)
     * at index 0, {@code crfs} or {@code maxent} at index 1, {@code -modeldir}
     * at index 2 and {@code -inputfile} or {@code -inputdir} at index 4; all
     * but the first are compared ignoring case. The values at indexes 3 and 5
     * are not inspected.
     *
     * @param args command-line arguments; {@code null} is treated as invalid
     * @return {@code true} if the arguments are acceptable
     */
    public static boolean checkArgs(String[] args) {
        if (args == null || args.length < 6) {
            return false;
        }
        if (!args[0].equals("-tagger")) {
            return false;
        }
        if (!args[1].equalsIgnoreCase("crfs") && !args[1].equalsIgnoreCase("maxent")) {
            return false;
        }
        if (!args[2].equalsIgnoreCase("-modeldir")) {
            return false;
        }
        return args[4].equalsIgnoreCase("-inputfile")
                || args[4].equalsIgnoreCase("-inputdir");
    }

    /** Prints the program banner and copyright notice to standard output. */
    public static void displayCopyright() {
        System.out.println("Vietnamese Part-Of-Speech Tagging:");
        System.out.println("\tusing Conditional Random Fields or Maximum Entropy");
        System.out.println("\ttesting on more than 10000 sentences of Viet Treebank with the highest F1-measure of 93.27%");
        System.out.println("Copyright (C) by Cam-Tu Nguyen {1,2} and Xuan-Hieu Phan {2}");
        System.out.println("{1}: College of Technology, Hanoi National University");
        System.out.println("{2}: Graduate School of Information Sciences, Tohoku University");
        System.out.println("Email: {[email protected] ; [email protected]}");
        System.out.println();
    }

    /**
     * Prints the usage text to standard output.
     *
     * <p>The angle-bracket placeholders name the value expected after each flag.
     */
    public static void displayHelp() {
        System.out.println("Usage:");
        System.out.println("\tCase 1: POSTagging -tagger <crfs/maxent> -modeldir <model directory> -inputfile <input data file>");
        System.out.println("\tCase 2: POSTagging -tagger <crfs/maxent> -modeldir <model directory> -inputdir <input data directory>");
        System.out.println("Where:");
        System.out.println("\t<crfs/maxent> is the tagger used for pos tagging which is either maximum entropy (maxent) or conditional random fields (crfs)");
        System.out.println("\t<model directory> is the directory contain the model and option files");
        System.out.println("\t<input data file> is the file containing input sentences that need to");
        System.out.println("\tbe tagged (each sentence on a line)");
        System.out.println("\t<input data directory> is the directory containing multiple input data files (accept files ended with .wseg)");
        System.out.println();
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy