jvnpostag.POSTagging Maven / Gradle / Ivy
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0); if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvnpostag;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.OutputStreamWriter;
public class POSTagging {
public static void main(String [] args){
displayCopyright();
if (!checkArgs(args)) {
displayHelp();
return;
}
//get model dir
String modelDir = args[3];
//initialize tagger
POSTagger tagger = null;
if (args[1].equalsIgnoreCase("crfs"))
tagger = new CRFTagger(modelDir);
else if (args[1].equalsIgnoreCase("maxent"))
tagger = new MaxentTagger(modelDir);
//tagging
try {
if (args[4].equalsIgnoreCase("-inputfile")){
File inputFile = new File(args[5]);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(inputFile.getPath() + ".pos"), "UTF-8"));
String result = tagger.tagging(inputFile);
writer.write(result);
writer.close();
}
else{ //input dir
String inputDir = args[5];
if (inputDir.endsWith(File.separator)) {
inputDir = inputDir.substring(0, inputDir.length() - 1);
}
File dir = new File(inputDir);
String[] children = dir.list(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(".wseg");
}
});
for (int i = 0; i < children.length; i++) {
System.out.println("Tagging " + children[i]);
String filename = inputDir + File.separator + children[i];
if ((new File(filename)).isDirectory()) {
continue;
}
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(filename + ".pos"), "UTF-8"));
writer.write(tagger.tagging(new File(filename)));
writer.close();
}
}
}
catch (Exception e){
System.out.println("Error while tagging");
System.out.println(e.getMessage());
}
}
public static boolean checkArgs(String[] args) {
if (args.length < 6) {
return false;
}
if (args[0].compareTo("-tagger") != 0){
return false;
}
if (args[1].compareToIgnoreCase("crfs") != 0
&& args[1].compareToIgnoreCase("maxent") != 0)
return false;
if (args[2].compareToIgnoreCase("-modeldir") != 0) {
return false;
}
if (!(args[4].compareToIgnoreCase("-inputfile") == 0 ||
args[4].compareToIgnoreCase("-inputdir") == 0)) {
return false;
}
return true;
}
public static void displayCopyright() {
System.out.println("Vietnamese Part-Of-Speech Tagging:");
System.out.println("\tusing Conditional Random Fields or Maximum Entropy");
System.out.println("\ttesting on more than 10000 sentences of Viet Treebank with the highest F1-measure of 93.27%");
System.out.println("Copyright (C) by Cam-Tu Nguyen {1,2} and Xuan-Hieu Phan {2}");
System.out.println("{1}: College of Technology, Hanoi National University");
System.out.println("{2}: Graduate School of Information Sciences, Tohoku University");
System.out.println("Email: {[email protected] ; [email protected]}");
System.out.println();
}
public static void displayHelp() {
System.out.println("Usage:");
System.out.println("\tCase 1: POSTagging -tagger -modeldir -inputfile ");
System.out.println("\tCase 2: POSTagging -tagger -modeldir -inputdir ");
System.out.println("Where:");
System.out.println("\t is the tagger used for pos tagging which is either maximum entropy (maxent) or conditional random fields (crfs)");
System.out.println("\t is the directory contain the model and option files");
System.out.println("\t is the file containing input sentences that need to");
System.out.println("\tbe tagged (each sentence on a line)");
System.out.println("\t is the directory containing multiple input data files (accept files ended with .wseg)");
System.out.println();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy