// All Downloads are FREE. Search and download functionality uses the official Maven repository.
//
// cmu.arktweetnlp.impl.features.FeatureUtil Maven / Gradle / Ivy
//
// The newest version!
package cmu.arktweetnlp.impl.features;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cmu.arktweetnlp.Twokenize;

import com.twitter.Regex;


/** 
 * String normalizations and other shared utilities for feature computation. 
 **/
public class FeatureUtil {
	
	public static Pattern URL = Pattern.compile(Twokenize.OR(Twokenize.url, Twokenize.Email));
	public static Pattern justbase = Pattern.compile("(?!www\\.|ww\\.|w\\.|@)[a-zA-Z0-9]+\\.[A-Za-z0-9\\.]+"); 

//	Pattern URL = Pattern.compile(Twokenize.url);

	
	public static ArrayList normalize(List toks){
		ArrayList normtoks = new ArrayList(toks.size());
		for (String s:toks){
			normtoks.add(FeatureUtil.normalize(s));
		}
		return normtoks;
	}

	/**
	 * @param str
	 * @return Lowercase normalized string
	 */
	public static String normalize(String str) {
	    str = str.toLowerCase();
		if (URL.matcher(str).matches()){	    	
	    	String base = "";
	    	Matcher m = justbase.matcher(str);
	    	if (m.find())
	    		base=m.group().toLowerCase();
	    	return "";
	    }
	    if (Regex.VALID_MENTION_OR_LIST.matcher(str).matches())
	    	return "<@MENTION>";
	    return str;
	}

	public static ArrayList normalizecap(List toks){
		ArrayList normtoks = new ArrayList(toks.size());
		for (String s:toks){
			normtoks.add(FeatureUtil.normalizecap(s));
		}
		return normtoks;
	}

	//same as normalize but retains capitalization
	public static String normalizecap(String str) {
		if (URL.matcher(str).matches()){	    	
	    	String base = "";
	    	Matcher m = justbase.matcher(str);
	    	if (m.find())
	    		base=m.group().toLowerCase();
	    	return "";
	    }
	    if (Regex.VALID_MENTION_OR_LIST.matcher(str).matches())
	    	return "<@MENTION>";
	    return str;
	}
	
	
	static Pattern repeatchar = Pattern.compile("([\\w])\\1{1,}");
	static Pattern repeatvowel = Pattern.compile("(a|e|i|o|u)\\1+");
	
	public static ArrayList fuzztoken(String tok, boolean apos) {
	    ArrayList fuzz = new ArrayList();
	    fuzz.add(tok.replaceAll("[‘’´`]", "'").replaceAll("[“”]", "\""));
	    fuzz.add(tok);
	    fuzz.add(repeatchar.matcher(tok).replaceAll("$1"));//omggggggg->omg
	    fuzz.add(repeatchar.matcher(tok).replaceAll("$1$1"));//omggggggg->omgg
	    fuzz.add(repeatvowel.matcher(tok).replaceAll("$1"));//heeellloooo->helllo
	    if (apos && !(tok.startsWith("tswift
	    	//maybe a bad idea (bello's->bello, re-enable->re, croplife's->'s)
	    	fuzz.addAll(Arrays.asList(tok.split("\\p{Punct}")));
	    }
	    return fuzz;
	}

}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy