All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cmu.arktweetnlp.impl.features.WordListFeatures Maven / Gradle / Ivy

The newest version!
package cmu.arktweetnlp.impl.features;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;

import cmu.arktweetnlp.Twokenize;
import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;
import cmu.arktweetnlp.util.BasicFileIO;

public class WordListFeatures {

	/**
	 * Feature extractor that emits part-of-speech features for tokens found in
	 * {@code TagDictionary.WORD_TO_POS}.
	 *
	 * <p>For a token at position {@code t} whose tag list is {@code p0..p(n-1)}:
	 * <ul>
	 *   <li>{@code "POSTagDict|p0"} is added at {@code t};</li>
	 *   <li>{@code "NextPOSTag|p0"} is added at {@code t-1} when {@code t > 0};</li>
	 *   <li>{@code "PrevPOSTag|p0"} is added at {@code t+1} when {@code t} is not the last position;</li>
	 *   <li>{@code "POSTagDict|pi"} is added at {@code t} with value {@code (n - i) / n} for every {@code i >= 1}.</li>
	 * </ul>
	 */
	public static class POSTagDict implements FeatureExtractorInterface {
		Pattern URL = Pattern.compile(Twokenize.url);
		Pattern letter = Pattern.compile("[A-Za-z]{3,}");

		/**
		 * Adds the tag-dictionary features described on the class for every token
		 * that can be resolved to a non-empty tag list.
		 *
		 * @param tokens the tokens of one input, in order
		 * @param pairs  sink receiving (position, feature[, value]) entries
		 */
		public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
			final int numTokens = tokens.size();
			for (int t = 0; t < numTokens; t++) {
				List<String> poses = lookupPoses(tokens.get(t));
				// An empty list carries no tag to report; skip it rather than fail on get(0).
				if (poses == null || poses.isEmpty()) {
					continue;
				}

				String firstPos = poses.get(0);
				pairs.add(t, "POSTagDict|" + firstPos);
				// The neighbours learn about this token's first tag: from the previous
				// token's point of view it is the "next" tag, and vice versa.
				if (t > 0) {
					pairs.add(t - 1, "NextPOSTag|" + firstPos);
				}
				if (t < numTokens - 1) {
					pairs.add(t + 1, "PrevPOSTag|" + firstPos);
				}

				// Remaining tags get a value that decreases linearly with their rank.
				final int numPoses = poses.size();
				for (int i = 1; i < numPoses; i++) {
					pairs.add(t, "POSTagDict|" + poses.get(i), (numPoses - (double) i) / numPoses);
				}
			}
		}

		/**
		 * Resolves a token to its tag list, trying progressively looser forms.
		 *
		 * <p>Order of attempts: the token itself; then, only when the token contains
		 * at least three consecutive ASCII letters and is not wholly matched by the
		 * URL pattern, {@code FeatureUtil.normalizecap(tok)}; then each candidate of
		 * {@code FeatureUtil.fuzztoken(normalizecap(tok), true)} followed by each
		 * candidate of {@code FeatureUtil.fuzztoken(FeatureUtil.normalize(tok), true)}.
		 * The first hit wins.
		 *
		 * @param tok the raw token
		 * @return the tag list of the first form found in the dictionary, or
		 *         {@code null} when none is found
		 */
		private List<String> lookupPoses(String tok) {
			List<String> poses = TagDictionary.WORD_TO_POS.get(tok);
			if (poses != null || !letter.matcher(tok).find() || URL.matcher(tok).matches()) {
				return poses;
			}

			String normtok = FeatureUtil.normalizecap(tok);
			poses = TagDictionary.WORD_TO_POS.get(normtok);
			if (poses != null) {
				return poses;
			}

			List<String> fuzz = new ArrayList<String>();
			fuzz.addAll(FeatureUtil.fuzztoken(normtok, true));
			fuzz.addAll(FeatureUtil.fuzztoken(FeatureUtil.normalize(tok), true));
			for (String candidate : fuzz) {
				poses = TagDictionary.WORD_TO_POS.get(candidate);
				if (poses != null) {
					return poses;
				}
			}
			return null;
		}
	}

	/*
	 * If you add a new list, make sure to run "mvn install -q" so that it is
	 * copied to the target folder; otherwise you will get a NullPointerException.
	 */
	/**
	 * Feature extractor that marks tokens belonging to a named word list.
	 *
	 * <p>The list is loaded once, at construction, through {@code initDict}. For
	 * every token whose normalized form is in the list, the list's name is added
	 * as the feature at that token's position.
	 */
	public static class Listofnames implements FeatureExtractorInterface {
		// Compiled once instead of on every token (String.replaceAll recompiles its regex per call).
		private static final Pattern APOSTROPHE_VARIANTS = Pattern.compile("[‘’´`]");
		private static final Pattern CURLY_DOUBLE_QUOTES = Pattern.compile("[“”]");

		String Listname="";
		HashSet<String> members;

		/**
		 * Loads the word list with the given name.
		 *
		 * @param str name of the list; it is passed to {@code initDict} and is also
		 *            the feature name emitted by {@link #addFeatures}
		 * @throws IOException propagated from {@code initDict}
		 */
		public Listofnames(String str) throws IOException {
			Listname = str;
			this.members = initDict(Listname);
		}

		/**
		 * Adds the list name as a feature at every position whose token is a member
		 * of the list.
		 *
		 * <p>Before the lookup each token is lower-cased, the characters
		 * {@code ‘ ’ ´ `} are replaced with {@code '} and the characters
		 * {@code “ ”} with {@code "}.
		 *
		 * @param tokens the tokens of one input, in order
		 * @param pairs  sink receiving (position, feature) entries
		 */
		public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
			for (int t = 0; t < tokens.size(); t++) {
				String lowered = tokens.get(t).toLowerCase();
				String normaltok = APOSTROPHE_VARIANTS.matcher(lowered).replaceAll("'");
				normaltok = CURLY_DOUBLE_QUOTES.matcher(normaltok).replaceAll("\"");
				if (members.contains(normaltok)) {
					pairs.add(t, Listname);
				}
			}
		}
	}

	/**
	 * Feature extractor that looks tokens up in {@code TagDictionary.WORD_TO_POS}
	 * by their metaphone encoding.
	 *
	 * <p>The lookup key is the literal prefix {@code "**MP**"} followed by the
	 * result of {@code MetaphoneFeatures.getMetaphone().encode(token)}.
	 */
	public static class MetaphonePOSDict implements FeatureExtractorInterface {

		/**
		 * Adds one {@code "metaph_POSDict|<pos>"} feature per tag listed under the
		 * token's metaphone key. Tokens of length 0 or 1 are skipped.
		 *
		 * @param tokens the tokens of one input, in order
		 * @param pairs  sink receiving (position, feature) entries
		 */
		public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
			for (int t = 0; t < tokens.size(); t++) {
				String tok = tokens.get(t);
				if (tok.length() <= 1) {
					continue;
				}

				String metaphone_word = MetaphoneFeatures.getMetaphone().encode(tok);
				// Single lookup: a missing key and a null value are both treated as "no tags".
				List<String> poses = TagDictionary.WORD_TO_POS.get("**MP**" + metaphone_word);
				if (poses == null) {
					continue;
				}
				for (String pos : poses) {
					pairs.add(t, "metaph_POSDict|" + pos);
				}
			}
		}
	}

	/**
	 * Reads a word list resource into a set, one lower-cased entry per line.
	 *
	 * <p>The resource is opened via
	 * {@code BasicFileIO.getResourceReader("/cmu/arktweetnlp/" + dict)} and read
	 * with {@code BasicFileIO.getLine} until that returns {@code null}. The reader
	 * is closed before returning, whether or not reading completes normally.
	 *
	 * @param dict resource name, appended to {@code "/cmu/arktweetnlp/"}
	 * @return the set of lines read, each converted with {@code toLowerCase()}
	 * @throws IOException if closing the reader fails
	 */
	private static HashSet<String> initDict(String dict) throws IOException {
		BufferedReader bReader = BasicFileIO.getResourceReader("/cmu/arktweetnlp/" + dict);
		HashSet<String> dictset = new HashSet<String>();
		try {
			String line = BasicFileIO.getLine(bReader);
			while (line != null) {
				dictset.add(line.toLowerCase());
				line = BasicFileIO.getLine(bReader);
			}
		} finally {
			// Release the underlying stream; previously the reader was never closed.
			if (bReader != null) {
				bReader.close();
			}
		}
		return dictset;
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy