jvnsegmenter.CRFSegmenter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0); if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvnsegmenter;
import java.io.File;
import java.util.Vector;
import org.w3c.dom.Element;
import jflexcrf.Labeling;
import jvntextpro.data.DataReader;
import jvntextpro.data.DataWriter;
import jvntextpro.data.TaggingData;
// TODO: Auto-generated Javadoc
/**
* The Class CRFSegmenter.
*/
public class CRFSegmenter {
/** The reader. */
DataReader reader = new WordDataReader();
/** The writer. */
DataWriter writer = new WordDataWriter();
/** The data tagger. */
TaggingData dataTagger = new TaggingData();
/** The labeling. */
Labeling labeling = null;
/**
* Instantiates a new cRF segmenter.
*
* @param modelDir the model dir
*/
public CRFSegmenter(String modelDir){
init(modelDir);
}
/**
* Instantiates a new cRF segmenter.
*/
public CRFSegmenter() {
//do nothing until now
}
/**
* Inits the.
*
* @param modelDir the model dir
*/
public void init(String modelDir) {
//Read feature template file
String templateFile = modelDir + File.separator + "featuretemplate.xml";
Vector nodes = BasicContextGenerator.readFeatureNodes(templateFile);
for (int i = 0; i < nodes.size(); ++i){
Element node = nodes.get(i);
String cpType = node.getAttribute("value");
BasicContextGenerator contextGen = null;
if (cpType.equals("Conjunction")){
contextGen = new ConjunctionContextGenerator(node);
}
else if (cpType.equals("Lexicon")){
contextGen = new LexiconContextGenerator(node);
LexiconContextGenerator.loadVietnameseDict(modelDir + File.separator + "VNDic_UTF-8.txt");
LexiconContextGenerator.loadViLocationList(modelDir + File.separator + "vnlocations.txt");
LexiconContextGenerator.loadViPersonalNames(modelDir + File.separator + "vnpernames.txt");
}
else if (cpType.equals("Regex")){
contextGen = new RegexContextGenerator(node);
}
else if (cpType.equals("SyllableFeature")){
contextGen = new SyllableContextGenerator(node);
}
else if (cpType.equals("ViSyllableFeature")){
contextGen = new VietnameseContextGenerator(node);
}
if (contextGen != null)
dataTagger.addContextGenerator(contextGen);
}
//create context generators
labeling = new Labeling(modelDir, dataTagger, reader, writer);
}
/**
* Segmenting.
*
* @param instr the instr
* @return the string
*/
public String segmenting(String instr) {
return labeling.strLabeling(instr);
}
/**
* Segmenting.
*
* @param file the file
* @return the string
*/
public String segmenting(File file) {
return labeling.strLabeling(file);
}
/**
* Sets the data reader.
*
* @param reader the new data reader
*/
public void setDataReader(DataReader reader){
this.reader = reader;
}
/**
* Sets the data writer.
*
* @param writer the new data writer
*/
public void setDataWriter(DataWriter writer){
this.writer = writer;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy