All Downloads are FREE. Search and download functionalities are using the official Maven repository.

jmaxent.FeatureGen Maven / Gradle / Ivy

/*
 Copyright (C) 2010 by
 * 
 * 	Cam-Tu Nguyen 
 *  [email protected] or [email protected]
 *
 *  Xuan-Hieu Phan  
 *  [email protected] 
 *
 *  College of Technology, Vietnamese University, Hanoi
 * 	Graduate School of Information Sciences, Tohoku University
 *
 * JVnTextPro-v.2.0 is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

package jmaxent;

import java.io.*;
import java.util.*;

/**
 * Builds, loads, saves and iterates over the set of {@link Feature} objects used by
 * the {@code jmaxent} package.
 *
 * <p>A feature is created from a label and a context predicate. The feature set is
 * either generated from the training data ({@link #generateFeatures()}) or read back
 * from a model stream ({@link #readFeatures(BufferedReader)}). Once the dictionary
 * carries feature indexes, the features that apply to a single observation can be
 * walked with {@link #startScanFeatures(Observation)}, {@link #hasNextFeature()} and
 * {@link #nextFeature()}.
 *
 * <p>The collection fields are deliberately left as raw {@code List}/{@code Map}:
 * they are package-private and are handed to sibling classes (for example
 * {@code Feature.strId2Idx(fmap)}) whose signatures are not visible from this file.
 */
public class FeatureGen {

    /** All features currently known, in insertion order; elements are {@link Feature}. */
    List features = null;

    /**
     * Feature map. {@link #readFeatures(BufferedReader)} stores
     * {@code Feature.strId -> Integer(Feature.idx)} entries in it, and
     * {@code Feature.strId2Idx} / {@code Feature.strId2IdxAdd} receive it as argument.
     */
    Map fmap = null;

    /** Option object; supplies the rare thresholds and receives {@code numFeatures}. */
    Option option = null;

    /** Data object; supplies the training data and the string/int lookup maps. */
    Data data = null;

    /** Dictionary object; {@code dict.dict} is looked up by context predicate id. */
    Dictionary dict = null;

    // --- state used only while scanning the features of one observation ---

    /** Features collected by the most recent {@link #startScanFeatures(Observation)}. */
    List currentFeatures = null;

    /** Position of the next element of {@link #currentFeatures} to hand out. */
    int currentFeatureIdx = 0;

    /**
     * Creates a feature generator bound to the given collaborators. No containers are
     * allocated here; they are created by {@link #generateFeatures()} or
     * {@link #readFeatures(BufferedReader)}.
     *
     * @param option the option object
     * @param data the data object
     * @param dict the dictionary object
     */
    public FeatureGen(Option option, Data data, Dictionary dict) {
        this.option = option;
        this.data = data;
        this.dict = dict;
    }

    /**
     * Registers a feature in the feature map via {@code f.strId2IdxAdd(fmap)} and
     * appends it to the feature list.
     *
     * <p>{@link #generateFeatures()} or {@link #readFeatures(BufferedReader)} must have
     * run first so that the containers exist; otherwise this fails with a
     * {@code NullPointerException}.
     *
     * @param f the feature to add
     */
    public void addFeature(Feature f) {
        f.strId2IdxAdd(fmap);
        features.add(f);
    }

    /**
     * Rebuilds the feature set from the training data.
     *
     * <p>For every observation in {@code data.trnData} and every context predicate in
     * its {@code cps}, a (label, context predicate) pair is skipped when
     * <ul>
     *   <li>the context predicate is missing from the dictionary, or its count is
     *       {@code <= option.cpRareThreshold};</li>
     *   <li>the dictionary entry has no record for the observation's
     *       {@code humanLabel}, or that record's count is
     *       {@code <= option.fRareThreshold}.</li>
     * </ul>
     * Otherwise a {@link Feature} is created; if {@code f.strId2Idx(fmap)} leaves
     * {@code f.idx} negative the feature is treated as new, added, and its index is
     * written back into the dictionary record ({@code fidx}) while the element is
     * marked {@code chosen}.
     *
     * <p>On success {@code option.numFeatures} is set to the number of features. When
     * the training data or dictionary is missing, a message is printed and
     * {@code option.numFeatures} is left untouched.
     */
    public void generateFeatures() {
        resetContainers();

        if (data.trnData == null || dict.dict == null) {
            System.out.println("No data or dictionary for generating features");
            return;
        }

        // scan over data list
        for (int i = 0; i < data.trnData.size(); i++) {
            Observation obsr = (Observation) data.trnData.get(i);

            for (int j = 0; j < obsr.cps.length; j++) {
                Element elem = (Element) dict.dict.get(Integer.valueOf(obsr.cps[j]));
                if (elem == null || elem.count <= option.cpRareThreshold) {
                    // context predicate unknown to the dictionary, or too rare
                    continue;
                }

                CountFIdx cntFIdx =
                        (CountFIdx) elem.lbCntFidxes.get(Integer.valueOf(obsr.humanLabel));
                if (cntFIdx == null || cntFIdx.count <= option.fRareThreshold) {
                    // (label, context predicate) pair unknown, or too rare
                    continue;
                }

                Feature f = new Feature(obsr.humanLabel, obsr.cps[j]);
                f.strId2Idx(fmap);
                if (f.idx < 0) {
                    // new feature, add to the feature list
                    addFeature(f);

                    // update the feature index in the dictionary
                    cntFIdx.fidx = f.idx;
                    elem.chosen = 1;
                }
            }
        }

        option.numFeatures = features.size();
    }

    /**
     * Returns the number of features currently held.
     *
     * @return the size of the feature list, or 0 when no list has been created yet
     */
    public int numFeatures() {
        return (features == null) ? 0 : features.size();
    }

    /**
     * Replaces the feature set with the one read from {@code fin}.
     *
     * <p>Expected layout: one line holding the feature count, that many feature lines
     * of exactly four space-separated tokens each, then one trailing line (the model
     * separator) which is consumed and discarded. Feature lines with a different token
     * count are reported and skipped. Every accepted line is parsed by the
     * {@code Feature(String, Map, Map)} constructor, recorded in the feature map under
     * its {@code strId}, and appended to the feature list; a repeated {@code strId}
     * overwrites the map entry but the feature is still appended.
     *
     * <p>If the stream ends before the announced number of lines has been read, a
     * message is printed and reading stops. If the count line is missing or not
     * positive, a message is printed and the method returns without touching
     * {@code option.numFeatures}; otherwise {@code option.numFeatures} is set to the
     * number of features read.
     *
     * @param fin the reader positioned at the feature-count line
     * @throws IOException if reading from {@code fin} fails
     * @throws NumberFormatException if the count line is not an integer
     */
    public void readFeatures(BufferedReader fin) throws IOException {
        resetContainers();

        // get the number of features
        String line = fin.readLine();
        if (line == null) {
            System.out.println("Unknown number of features");
            return;
        }
        // trim so that stray surrounding whitespace does not break parsing
        int numFeatures = Integer.parseInt(line.trim());
        if (numFeatures <= 0) {
            System.out.println("Invalid number of features");
            return;
        }

        System.out.println("Reading features ...");

        // main loop for reading features
        for (int i = 0; i < numFeatures; i++) {
            line = fin.readLine();
            if (line == null) {
                // readLine() keeps returning null after end of stream, so there is
                // nothing left to read: stop instead of spinning through the rest
                System.out.println("Unexpected end of input after " + i + " of "
                        + numFeatures + " feature lines");
                break;
            }

            StringTokenizer strTok = new StringTokenizer(line, " ");
            if (strTok.countTokens() != 4) {
                System.out.println(i + " invalid feature line ");
                // invalid feature line, ignore it
                continue;
            }

            // create a new feature by parsing the line
            Feature f = new Feature(line, data.cpStr2Int, data.lbStr2Int);

            // The feature is recorded whether or not its strId is already mapped
            // (the original code had two identical branches for those cases).
            fmap.put(f.strId, Integer.valueOf(f.idx));
            features.add(f);
        }

        System.out.println("Reading " + features.size() + " features completed!");

        // read (and discard) the line ###...
        fin.readLine();

        option.numFeatures = features.size();
    }

    /**
     * Writes the feature set to {@code fout}: the feature count on the first line, one
     * line per feature as produced by
     * {@code Feature.toString(data.cpInt2Str, data.lbInt2Str)}, and finally
     * {@code Option.modelSeparator}.
     *
     * <p>The feature list must already exist (see {@link #generateFeatures()} and
     * {@link #readFeatures(BufferedReader)}).
     *
     * @param fout the writer to print to
     * @throws IOException declared for callers; not thrown by the code in this method
     */
    public void writeFeatures(PrintWriter fout) throws IOException {
        // write the number of features
        fout.println(features.size());

        for (int i = 0; i < features.size(); i++) {
            Feature f = (Feature) features.get(i);
            fout.println(f.toString(data.cpInt2Str, data.lbInt2Str));
        }

        // write the line ###...
        fout.println(Option.modelSeparator);
    }

    /**
     * Rewinds the scan position so that {@link #nextFeature()} starts again from the
     * first feature collected by the last {@link #startScanFeatures(Observation)}.
     */
    public void scanReset() {
        currentFeatureIdx = 0;
    }

    /**
     * Collects the features that apply to {@code obsr} and rewinds the scan position.
     *
     * <p>For each context predicate of the observation that is present in the
     * dictionary, the element's {@code cpFeatures} are appended to the current
     * feature list. The first time an element is visited ({@code isScanned} is false)
     * its {@code cpFeatures} are populated: one {@link Feature} per label record whose
     * {@code fidx} is non-negative, carrying that index. The element is then flagged
     * as scanned so the work is not repeated.
     *
     * @param obsr the observation whose features are to be scanned
     */
    public void startScanFeatures(Observation obsr) {
        if (currentFeatures == null) {
            // allow scanning even if neither generateFeatures nor readFeatures ran
            currentFeatures = new ArrayList();
        } else {
            currentFeatures.clear();
        }
        currentFeatureIdx = 0;

        // scan over all context predicates
        for (int i = 0; i < obsr.cps.length; i++) {
            Element elem = (Element) dict.dict.get(Integer.valueOf(obsr.cps[i]));
            if (elem == null) {
                // this context predicate doesn't appear in the dictionary
                continue;
            }

            if (!elem.isScanned) {
                // scan all labels for features
                Iterator it = elem.lbCntFidxes.keySet().iterator();
                while (it.hasNext()) {
                    Integer labelInt = (Integer) it.next();
                    CountFIdx cntFIdx = (CountFIdx) elem.lbCntFidxes.get(labelInt);

                    if (cntFIdx.fidx >= 0) {
                        Feature f = new Feature();
                        f.FeatureInit(labelInt.intValue(), obsr.cps[i]);
                        f.idx = cntFIdx.fidx;

                        elem.cpFeatures.add(f);
                    }
                }

                elem.isScanned = true;
            }

            for (int j = 0; j < elem.cpFeatures.size(); j++) {
                currentFeatures.add(elem.cpFeatures.get(j));
            }
        }
    }

    /**
     * Tells whether {@link #nextFeature()} has another feature to return.
     *
     * @return true if the scan position is before the end of the current feature
     *         list; false otherwise, including when no scan has been started yet
     */
    public boolean hasNextFeature() {
        return currentFeatures != null && currentFeatureIdx < currentFeatures.size();
    }

    /**
     * Returns the feature at the scan position and advances the position by one.
     *
     * <p>Call {@link #hasNextFeature()} first: when the position is past the end the
     * underlying {@code List.get} call fails.
     *
     * @return the next feature of the current scan
     */
    public Feature nextFeature() {
        Feature f = (Feature) currentFeatures.get(currentFeatureIdx);
        currentFeatureIdx++;
        return f;
    }

    /**
     * Empties the feature list, the feature map and the current-scan list, creating
     * each one on first use. Existing instances are cleared rather than replaced so
     * that references held elsewhere stay valid.
     */
    private void resetContainers() {
        if (features != null) {
            features.clear();
        } else {
            features = new ArrayList();
        }

        if (fmap != null) {
            fmap.clear();
        } else {
            fmap = new HashMap();
        }

        if (currentFeatures != null) {
            currentFeatures.clear();
        } else {
            currentFeatures = new ArrayList();
        }
    }

} // end of class FeatureGen





© 2015 - 2025 Weber Informatics LLC | Privacy Policy