All downloads are free. The search and download features use the official Maven repository.

jmaxent.Dictionary Maven / Gradle / Ivy

Go to download

HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.

There is a newer version: 2.2.1
Show newest version
/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

package jmaxent;

import java.io.*;
import java.util.*;

// TODO: Auto-generated Javadoc
/**
 * The Class Dictionary.
 */
public class Dictionary {

    /** The dict. */
    public Map dict = null;
    
    /** The option. */
    public Option option = null; // reference to option object
    
    /** The data. */
    public Data data = null; // reference to data object
    
    /**
     * Instantiates a new dictionary.
     */
    public Dictionary() {
	dict = new HashMap();
    }
    
    /**
     * Instantiates a new dictionary.
     *
     * @param option the option
     * @param data the data
     */
    public Dictionary(Option option, Data data) {
	this.option = option;
	this.data = data;
	dict = new HashMap();
    }
    
    // read dictionary from model file 
    /**
     * Read dict.
     *
     * @param fin the fin
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public void readDict(BufferedReader fin) throws IOException {
	dict.clear();
	
	String line;
	
	// get dictionary size
	if ((line = fin.readLine()) == null) {
	    System.out.println("No dictionary size information");
	    return;
	}
	
	int dictSize = Integer.parseInt(line);
	if (dictSize <= 0) {
	    System.out.println("Invalid dictionary size");
	}
	
	System.out.println("Reading dictionary ...");
	
	// main loop for reading dictionary content
	for (int i = 0; i < dictSize; i++) {
	    line = fin.readLine();
	    
	    if (line == null) {
		System.out.println("Invalid dictionary line");
		return;
	    }
	    
	    StringTokenizer strTok = new StringTokenizer(line, " \t\r\n");
	    int len = strTok.countTokens();
	    if (len < 2) {
		// invalid line
		continue;
	    }
	    
	    StringTokenizer cpTok = new StringTokenizer(strTok.nextToken(), ":");
	    int cp = Integer.parseInt(cpTok.nextToken());
	    int cpCount = Integer.parseInt(cpTok.nextToken());
	    
	    // create a new element
	    Element elem = new Element();
	    elem.count = cpCount;
	    elem.chosen = 1;
	    
	    while (strTok.hasMoreTokens()) {
		StringTokenizer lbTok = new StringTokenizer(strTok.nextToken(), ":");
		
		int label = Integer.parseInt(lbTok.nextToken());
		int count = Integer.parseInt(lbTok.nextToken());
		int fidx = Integer.parseInt(lbTok.nextToken());
		CountFIdx cntFIdx = new CountFIdx(count, fidx);
		
		elem.lbCntFidxes.put(new Integer(label), cntFIdx);
	    }
	    
	    // insert the element to the dictionary
	    dict.put(new Integer(cp), elem);
	}
	
	System.out.println("Reading dictionary (" + Integer.toString(dict.size()) +
		    " entries) completed!");
		    
	// read the line ###...
	line = fin.readLine();
    }
    
    // write dictionary to model file
    /**
     * Write dict.
     *
     * @param fout the fout
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public void writeDict(PrintWriter fout) throws IOException {
	Iterator it = null;
	int count = 0;

	for (it = dict.keySet().iterator(); it.hasNext(); ) {
	    Integer cpInt = (Integer)it.next();
	    Element elem = (Element)dict.get(cpInt);
	    
	    if (elem.chosen == 1) {
		count++;
	    }
	}
	
	// write the dictionary size
	fout.println(Integer.toString(count));	
	
	for (it = dict.keySet().iterator(); it.hasNext(); ) {    
	
	    Integer cpInt = (Integer)it.next();
	    Element elem = (Element)dict.get(cpInt);
	    
	    if (elem.chosen == 0) {
		continue;
	    }
	    
	    // write the context predicate and its count
	    fout.print(cpInt.toString() + ":" + Integer.toString(elem.count));
	    
	    for (Iterator lbIt = elem.lbCntFidxes.keySet().iterator(); lbIt.hasNext(); ) {
		Integer labelInt = (Integer)lbIt.next();
		CountFIdx cntFIdx = (CountFIdx)elem.lbCntFidxes.get(labelInt);
	
		if (cntFIdx.fidx < 0) {
		    continue;
		}
	
		fout.print(" " + labelInt.toString() + ":" + 
			    Integer.toString(cntFIdx.count) + ":" +
			    Integer.toString(cntFIdx.fidx));		
	    }
	    
	    fout.println();
	}
	
	// write the line ###...
	fout.println(Option.modelSeparator);
    }
    
    // add a context predicate (and the label it supports) to dictionary
    /**
     * Adds the dict.
     *
     * @param cp the cp
     * @param label the label
     * @param count the count
     */
    public void addDict(int cp, int label, int count) {
	Element elem = (Element)dict.get(new Integer(cp));
	
	if (elem == null) {
	    // if the context predicate is not found
	    elem = new Element();
	    elem.count = count;
	    
	    CountFIdx cntFIdx = new CountFIdx(count, -1);
	    elem.lbCntFidxes.put(new Integer(label), cntFIdx);
	    
	    // insert the new element to the dict
	    dict.put(new Integer(cp), elem);
	    
	} else {
	    // update the total count
	    elem.count += count;
	    
	    CountFIdx cntFIdx = (CountFIdx)elem.lbCntFidxes.get(new Integer(label));
	    if (cntFIdx == null) {
		// the label not found
		cntFIdx = new CountFIdx(count, -1);
		elem.lbCntFidxes.put(new Integer(label), cntFIdx);
		
	    } else {
		// if label found, update the count only
		cntFIdx.count += count;
	    }
	}
    }
    
    // generating dictionary from training data
    /**
     * Generate dict.
     */
    public void generateDict() {
	if (data.trnData == null) {
	    System.out.println("No data available for generating dictionary");
	    return;
	}
	
	// scan all data observations of the training data
	for (int i = 0; i < data.trnData.size(); i++) {
	    Observation obsr = (Observation)data.trnData.get(i);
	    
	    for (int j = 0; j < obsr.cps.length; j++) {
		addDict(obsr.cps[j], obsr.humanLabel, 1);
	    }
	}
    }
    
    /**
     * Size.
     *
     * @return the int
     */
    public int size() {
	if (dict == null) {
	    return 0;
	} else {
	    return dict.size();
	}
    }

} // end of class Dictionary





© 2015 - 2024 Weber Informatics LLC | Privacy Policy