All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cmu.arktweetnlp.impl.features.WordClusterPaths Maven / Gradle / Ivy

The newest version!
package cmu.arktweetnlp.impl.features;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;

import cmu.arktweetnlp.Twokenize;
import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;
import cmu.arktweetnlp.util.BasicFileIO;
import edu.stanford.nlp.util.StringUtils;

/**
 * Brown word clusters: features are path prefixes down the tree.
 *
 * <p>Constructing an instance loads the tab-separated resource named by
 * {@link #clusterResourceName} into the static {@link #wordToPath} table
 * (first column: cluster path, second column: word). {@link #addFeatures}
 * then looks every token up in that table and emits prefixes of its path
 * as features.
 **/
public class WordClusterPaths implements FeatureExtractorInterface {
	
	/** TODO this should be moved into config somehow **/
	public static String clusterResourceName = "/cmu/arktweetnlp/50mpaths2";
	
	/**
	 * Word to cluster-path table. Shared by all instances and replaced
	 * wholesale each time the constructor completes successfully.
	 */
	public static HashMap<String, String> wordToPath;

	/** Every path is padded to this many characters before prefixes are cut. */
	private static final int PADDED_PATH_LENGTH = 16;

	/**
	 * Loads the cluster resource into {@link #wordToPath}.
	 *
	 * <p>Blank lines are skipped. The table is built locally and published
	 * only once fully populated, so a failed load leaves any previously
	 * loaded table untouched.
	 *
	 * @throws IOException if the resource cannot be read, or if a non-blank
	 *         line does not contain at least two tab-separated columns
	 */
	public WordClusterPaths() throws IOException {
//		log.info("Loading clusters");
		
		HashMap<String, String> paths = new HashMap<String, String>();
		
		//read in paths file
		BufferedReader bReader = BasicFileIO.getResourceReader(clusterResourceName);
		try {
			String line;
			while ((line = BasicFileIO.getLine(bReader)) != null) {
				if (line.length() == 0) {
					continue;
				}
				String[] columns = line.split("\\t");
				if (columns.length < 2) {
					throw new IOException("Malformed cluster line in "
							+ clusterResourceName + ": " + line);
				}
				// column 0 is the path, column 1 is the word it belongs to
				paths.put(columns[1], columns[0]);
			}
		} finally {
			// the reader is only needed during loading; do not leak it
			if (bReader != null) {
				bReader.close();
			}
		}
		wordToPath = paths;
//		log.info("Finished loading clusters");
	}
	
	/**
	 * Adds cluster-path prefix features for every token that has a known path.
	 *
	 * <p>For a token at position {@code t} whose (padded) path is known:
	 * <ul>
	 *   <li>{@code "BigCluster|" + prefix} is added at {@code t} for prefix
	 *       lengths 2, 4, ..., 16;</li>
	 *   <li>{@code "NextBigCluster|" + prefix} is added at {@code t-1} (when
	 *       that position exists) for prefix lengths 4, 8, 12.</li>
	 * </ul>
	 * Tokens without a known path contribute no cluster features; the
	 * {@code "BigCluster|none"} fallback is left disabled.
	 *
	 * @param tokens the tokens of one sentence/tweet, in order
	 * @param pairs  collector that receives (position, feature name) pairs
	 */
	public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
		for (int t = 0; t < tokens.size(); t++) {
			String bitstring = lookupPath(FeatureUtil.normalize(tokens.get(t)));
			if (bitstring == null) {
				continue;
			}

			// Pad with StringUtils.pad, then turn the padding spaces into '0'
			// so that every substring below has enough characters to cut.
			bitstring = StringUtils.pad(bitstring, PADDED_PATH_LENGTH).replace(' ', '0');

			for (int i = 2; i <= PADDED_PATH_LENGTH; i += 2) {
				pairs.add(t, "BigCluster|" + bitstring.substring(0, i));
			}

			// NOTE(review): this guard was garbled ("t0") in the copy this file
			// was recovered from. "t > 0" is the only condition that keeps
			// t-1 in range; confirm against the upstream source that no
			// neighbouring feature block was lost along with the operator.
			if (t > 0) {
				for (int i = 4; i <= 12; i += 4) {
					pairs.add(t - 1, "NextBigCluster" + "|" + bitstring.substring(0, i));
				}
			}
		}
	}

	/**
	 * Finds the cluster path for a normalized token: first by exact match,
	 * then by trying each candidate from {@code FeatureUtil.fuzztoken} in
	 * order and returning the first hit.
	 *
	 * @param normaltok the normalized token
	 * @return the stored path, or {@code null} if neither the token nor any
	 *         of its fuzzed variants is in {@link #wordToPath}
	 */
	private static String lookupPath(String normaltok) {
		String path = wordToPath.get(normaltok);
		if (path != null) {
			return path;
		}
		for (String fuzz : FeatureUtil.fuzztoken(normaltok, true)) {
			path = wordToPath.get(fuzz);
			if (path != null) {
				//System.err.println(normaltok+"->"+fuzz);
				return path;
			}
		}
		return null;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy