jvnsensegmenter.JVnSenSegmenter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0); if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvnsensegmenter;
import jmaxent.*;
import java.util.*;
import java.io.*;
// TODO: Auto-generated Javadoc
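/**
* Sentence segmenter that uses a classifier loaded from a model directory to
* decide which candidate positions in a text end a sentence.
*/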
public class JVnSenSegmenter {
/** The positive label. */
public static String positiveLabel = "y";
/** The classifier. */
public Classification classifier = null;
/** The feature generator instance created during initialization. */
public FeatureGenerator feaGen = null;
/**
 * Loads the classification model and prepares this segmenter for use.
 *
 * <p>The {@code classifier} and {@code feaGen} fields are assigned only after the
 * classifier has been constructed and initialized without throwing, so a failed call
 * does not leave a constructed-but-uninitialized classifier in the field.
 *
 * @param modelDir the model directory handed to the {@code Classification} constructor
 * @return true if initialization completed without an exception; false otherwise
 */
public boolean init(String modelDir){
    try {
        // Build into locals first; publish to the fields only once everything succeeded.
        Classification loadedClassifier = new Classification(modelDir);
        FeatureGenerator loadedFeaGen = new FeatureGenerator();
        loadedClassifier.init();

        classifier = loadedClassifier;
        feaGen = loadedFeaGen;
        return true;
    }
    catch (Exception e) {
        // Diagnostics belong on stderr; print the exception itself because
        // getMessage() alone may be null and hides the exception type.
        System.err.println("Error while initializing classifier: " + e);
        return false;
    }
}
/**
 * Segments {@code text} into sentences and returns them separated by newlines.
 *
 * <p>Runs of tabs, spaces and newlines are first collapsed to a single character.
 * {@code FeatureGenerator.doFeatureGen} then fills a list of mark positions, and the
 * classifier labels each of them. A mark whose label equals {@link #positiveLabel} is
 * written as a space, the mark character and a newline; every other character is
 * copied unchanged. Finally, a space following a newline, doubled newlines and a
 * split ellipsis ({@code ".. ."}) are cleaned up.
 *
 * <p>{@link #init(String)} must have succeeded beforehand, because this method
 * dereferences {@code classifier}.
 *
 * @param text the text to segment
 * @return the segmented text; when no mark position is found, the normalized text
 *         followed by a single newline
 */
public String senSegment(String text){
    // Text normalization: each whitespace run is replaced by the last character of the run.
    text = text.replaceAll("([\t \n])+", "$1");

    // Generate context predicates; markList is filled by the feature generator.
    List markList = new ArrayList();
    List data = FeatureGenerator.doFeatureGen(new HashMap(), text, markList, false);
    if (markList.isEmpty()) {
        return text + "\n";
    }

    // Classify; one label is read per mark position below.
    List labels = classifier.classify(data);

    // A StringBuilder avoids the quadratic copying of repeated String concatenation.
    StringBuilder segmented = new StringBuilder(text.length() + 2 * markList.size());
    int copiedUpTo = 0; // index of the first character of text not yet written
    for (int i = 0; i < markList.size(); ++i) {
        int markPos = ((Integer) markList.get(i)).intValue();

        // Copy the text between the previous mark (exclusive) and this mark (exclusive).
        segmented.append(text, copiedUpTo, markPos);

        if (((String) labels.get(i)).equals(positiveLabel)) {
            // Sentence boundary: detach the mark and end the line after it.
            segmented.append(' ').append(text.charAt(markPos)).append('\n');
        } else {
            segmented.append(text.charAt(markPos));
        }
        copiedUpTo = markPos + 1;
    }
    // Tail after the final mark.
    segmented.append(text, copiedUpTo, text.length());

    // The patterns are plain literals, so literal replacement gives the same result
    // as the regex form without compiling a pattern.
    String result = segmented.toString();
    result = result.replace("\n ", "\n");
    result = result.replace("\n\n", "\n");
    result = result.replace(".. .", "...");
    return result;
}
/**
 * Segments {@code text} and stores the resulting sentences in {@code senList}.
 *
 * <p>The list is emptied first, then receives one entry per non-empty line of the
 * string returned by {@link #senSegment(String)}.
 *
 * @param text the text to segment
 * @param senList the list that is cleared and then filled with the sentences
 */
public void senSegment(String text, List senList){
    senList.clear();
    final StringTokenizer lines = new StringTokenizer(senSegment(text), "\n");
    // countTokens() reports how many tokens remain, so the loop drains the tokenizer.
    for (int remaining = lines.countTokens(); remaining > 0; --remaining) {
        senList.add(lines.nextToken());
    }
}
/**
* main method of JVnSenSegmenter
* to use this tool from command line.
*
* @param args the arguments
*/
public static void main(String args[]){
if (args.length != 4){
displayHelp();
System.exit(1);
}
try{
JVnSenSegmenter senSegmenter = new JVnSenSegmenter();
senSegmenter.init(args[1]);
String option = args[2];
if (option.equalsIgnoreCase("-inputfile"))
{
senSegmentFile(args[3], args[3] + ".sent", senSegmenter);
}
else if (option.equalsIgnoreCase("-inputdir")){
//segment only files ends with .txt
File inputDir = new File(args[3]);
File [] childrent = inputDir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(".txt");
}
});
for (int i = 0; i -inputfile ");
System.out.println("\tCase 2: JVnSenSegmenter -modeldir -inputdir ");
System.out.println("Where:");
System.out.println("\t is the directory contain the model and option files");
System.out.println("\t is the file containing input text that need to");
System.out.println("\thave sentences segmented (each sentence on a line)");
System.out.println("\t is the directory containing multiple input .tkn files");
System.out.println();
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy