// Source listing of jvnsegmenter.CRFSegmenter, as shipped in the "heideltime" artifact (Maven / Gradle / Ivy).
//
// HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions
// from documents and normalizes them according to the TIMEX3 annotation standard.
//
// Note: a newer version of the heideltime artifact (2.2.1) exists than the one this listing was taken from.
/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */
package jvnsegmenter;

import java.io.File;
import java.util.Vector;

import org.w3c.dom.Element;

import jflexcrf.Labeling;
import jvntextpro.data.DataReader;
import jvntextpro.data.DataWriter;
import jvntextpro.data.TaggingData;

/**
 * Segmenter that wires a feature-template-driven {@link TaggingData} to a
 * {@link Labeling} instance and exposes it through the {@code segmenting} methods.
 *
 * <p>An instance is usable only after {@link #init(String)} has run, either
 * through {@link #CRFSegmenter(String)} or through an explicit call following
 * the no-argument constructor.
 */
public class CRFSegmenter {
	
	/** Reader passed to the {@link Labeling} created in {@link #init(String)}. */
	DataReader reader = new WordDataReader();
	
	/** Writer passed to the {@link Labeling} created in {@link #init(String)}. */
	DataWriter writer = new WordDataWriter();
	
	/** Collects the context generators declared in the feature template. */
	TaggingData dataTagger = new TaggingData();
	
	/** Labeling built by {@link #init(String)}; {@code null} until then. */
	Labeling labeling = null;
	
	/**
	 * Creates a segmenter and immediately initializes it from the given model directory.
	 *
	 * @param modelDir directory containing the feature template and the model files
	 */
	public CRFSegmenter(String modelDir){
		init(modelDir);
	}
	
	/**
	 * Creates an uninitialized segmenter. {@link #init(String)} must be called
	 * before either {@code segmenting} method is used.
	 */
	public CRFSegmenter() {
		//do nothing until now
	}

	/**
	 * Reads {@code featuretemplate.xml} from the model directory, registers one
	 * context generator per recognized template node with the data tagger, and
	 * then creates the {@link Labeling} used by the {@code segmenting} methods.
	 *
	 * <p>The reader and writer held by this object at the time of the call are the
	 * ones handed to the new {@link Labeling}.
	 *
	 * <p>NOTE(review): each call adds generators to the same {@code dataTagger}
	 * instance, so calling this method more than once registers them again --
	 * confirm whether repeated initialization is meant to be supported.
	 *
	 * @param modelDir directory containing the feature template and the model files
	 */
	public void init(String modelDir) {
		//Read feature template file
		String templateFile = modelDir + File.separator + "featuretemplate.xml";
		Vector<Element> nodes = BasicContextGenerator.readFeatureNodes(templateFile);
		
		//create context generators
		for (Element node : nodes) {
			BasicContextGenerator contextGen = createContextGenerator(node, modelDir);
			if (contextGen != null) {
				dataTagger.addContextGenerator(contextGen);
			}
		}
		
		labeling = new Labeling(modelDir, dataTagger, reader, writer);
	}

	/**
	 * Builds the context generator selected by the node's {@code value} attribute.
	 *
	 * @param node feature template node
	 * @param modelDir directory from which the lexicon resources are loaded
	 * @return the matching generator, or {@code null} when the type is not recognized
	 */
	private BasicContextGenerator createContextGenerator(Element node, String modelDir) {
		String cpType = node.getAttribute("value");
		
		if (cpType.equals("Conjunction")){
			return new ConjunctionContextGenerator(node);
		}
		if (cpType.equals("Lexicon")){
			BasicContextGenerator lexiconGen = new LexiconContextGenerator(node);
			// The lexicon resources are loaded through static methods every time a
			// Lexicon node is encountered, exactly as the template dictates.
			LexiconContextGenerator.loadVietnameseDict(modelDir + File.separator + "VNDic_UTF-8.txt");
			LexiconContextGenerator.loadViLocationList(modelDir + File.separator + "vnlocations.txt");
			LexiconContextGenerator.loadViPersonalNames(modelDir + File.separator + "vnpernames.txt");
			return lexiconGen;
		}
		if (cpType.equals("Regex")){
			return new RegexContextGenerator(node);
		}
		if (cpType.equals("SyllableFeature")){
			return new SyllableContextGenerator(node);
		}
		if (cpType.equals("ViSyllableFeature")){
			return new VietnameseContextGenerator(node);
		}
		
		// Unrecognized template types are skipped by the caller.
		return null;
	}

	/**
	 * Returns the labeling, failing with a descriptive error when the segmenter
	 * has not been initialized yet.
	 *
	 * @return the labeling created by {@link #init(String)}
	 * @throws IllegalStateException if {@link #init(String)} has not been called
	 */
	private Labeling requireLabeling() {
		if (labeling == null) {
			throw new IllegalStateException(
					"CRFSegmenter is not initialized; call init(modelDir) before segmenting");
		}
		return labeling;
	}

	/**
	 * Segments the given text by delegating to {@link Labeling#strLabeling}.
	 *
	 * @param instr the text to segment
	 * @return the result returned by the labeling
	 * @throws IllegalStateException if {@link #init(String)} has not been called
	 */
	public String segmenting(String instr) {
		return requireLabeling().strLabeling(instr);
	}

	/**
	 * Segments the content of the given file by delegating to {@link Labeling#strLabeling}.
	 *
	 * @param file the file to segment
	 * @return the result returned by the labeling
	 * @throws IllegalStateException if {@link #init(String)} has not been called
	 */
	public String segmenting(File file) {
		return requireLabeling().strLabeling(file);
	}
	
	/**
	 * Replaces the data reader held by this segmenter.
	 *
	 * <p>This only assigns the field: the new reader is picked up by the
	 * {@link Labeling} created on the next {@link #init(String)} call, and a
	 * labeling that already exists is not modified by this method.
	 *
	 * @param reader the new data reader
	 */
	public void setDataReader(DataReader reader){
		this.reader = reader;
	}
	
	/**
	 * Replaces the data writer held by this segmenter.
	 *
	 * <p>This only assigns the field: the new writer is picked up by the
	 * {@link Labeling} created on the next {@link #init(String)} call, and a
	 * labeling that already exists is not modified by this method.
	 *
	 * @param writer the new data writer
	 */
	public void setDataWriter(DataWriter writer){
		this.writer = writer;
	}

}