// cmu.arktweetnlp.impl.features.WordListFeatures (listing header: Maven / Gradle / Ivy)
// The newest version!
package cmu.arktweetnlp.impl.features;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;
import cmu.arktweetnlp.Twokenize;
import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;
import cmu.arktweetnlp.util.BasicFileIO;
public class WordListFeatures {
/**
 * Feature extractor that looks every token up in {@code TagDictionary.WORD_TO_POS}
 * and turns the tags found there into features on the token and its neighbours.
 *
 * <p>For a token at position {@code t} with dictionary tags {@code poses}:
 * <ul>
 *   <li>{@code "POSTagDict|" + poses.get(0)} is added at {@code t};</li>
 *   <li>{@code "NextPOSTag|" + poses.get(0)} is added at {@code t-1} (if it exists);</li>
 *   <li>{@code "PrevPOSTag|" + poses.get(0)} is added at {@code t+1} (if it exists);</li>
 *   <li>every further tag {@code poses.get(i)}, {@code i >= 1}, is added at {@code t}
 *       as {@code "POSTagDict|" + tag} with weight {@code (size - i) / size}.</li>
 * </ul>
 * Tokens with no dictionary entry (or an empty one) contribute nothing.
 */
public static class POSTagDict implements FeatureExtractorInterface {
    /** Compiled from the regex string {@code Twokenize.url}; used with a full match. */
    final Pattern URL = Pattern.compile(Twokenize.url);
    /** Finds a run of at least three ASCII letters anywhere in a token. */
    final Pattern letter = Pattern.compile("[A-Za-z]{3,}");

    /**
     * Adds dictionary-derived tag features for every token.
     *
     * @param tokens the tokens of one sentence/tweet, in order
     * @param pairs  sink receiving (position, feature[, weight]) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        for (int t = 0; t < tokens.size(); t++) {
            List<String> poses = lookupTags(tokens.get(t));
            // An empty entry would make poses.get(0) throw; treat it like "not found".
            if (poses == null || poses.isEmpty()) {
                continue;
            }

            String firstTag = poses.get(0);
            pairs.add(t, "POSTagDict|" + firstTag);
            if (t > 0) {
                // From the previous token's point of view this is the tag of the NEXT token.
                pairs.add(t - 1, "NextPOSTag|" + firstTag);
            }
            if (t < tokens.size() - 1) {
                // From the following token's point of view this is the tag of the PREVIOUS token.
                pairs.add(t + 1, "PrevPOSTag|" + firstTag);
            }
            // Remaining tags get linearly decreasing weights: (size-1)/size, (size-2)/size, ...
            for (int i = 1; i < poses.size(); i++) {
                pairs.add(t, "POSTagDict|" + poses.get(i), (poses.size() - (double) i) / poses.size());
            }
        }
    }

    /**
     * Resolves a token to its dictionary tag list, trying progressively looser keys.
     *
     * <p>Order: the raw token; then {@code FeatureUtil.normalizecap(tok)}; then each
     * candidate from {@code FeatureUtil.fuzztoken} applied to the normalizecap form and
     * to {@code FeatureUtil.normalize(tok)}. The fallbacks are only attempted when the
     * token contains three or more consecutive ASCII letters and is not matched in full
     * by the URL pattern.
     *
     * @return the first tag list found, or {@code null} if no key matched
     */
    private List<String> lookupTags(String tok) {
        List<String> poses = TagDictionary.WORD_TO_POS.get(tok);
        if (poses != null || !letter.matcher(tok).find() || URL.matcher(tok).matches()) {
            return poses;
        }

        String normtok = FeatureUtil.normalizecap(tok);
        poses = TagDictionary.WORD_TO_POS.get(normtok);
        if (poses != null) {
            return poses;
        }

        // Collect all fuzzy candidates first, then probe them in order; first hit wins.
        List<String> fuzz = new ArrayList<String>();
        fuzz.addAll(FeatureUtil.fuzztoken(normtok, true));
        fuzz.addAll(FeatureUtil.fuzztoken(FeatureUtil.normalize(tok), true));
        for (String f : fuzz) {
            poses = TagDictionary.WORD_TO_POS.get(f);
            if (poses != null) {
                return poses;
            }
        }
        return null;
    }
}
/*
 * If you add a new word list, run "mvn install -q" so the file is copied
 * to the target folder; otherwise loading it fails with a NullPointerException.
 */
/**
 * Feature extractor that fires a single feature, named after the word list,
 * on every token that is a member of that list.
 *
 * <p>The list is loaded once in the constructor via {@code initDict}. Tokens are
 * lower-cased and their typographic quote characters are replaced by plain ASCII
 * quotes before the membership test.
 */
public static class Listofnames implements FeatureExtractorInterface {
    // Compiled once instead of on every String.replaceAll call (two per token).
    /** U+2018, U+2019, acute accent and backtick: all rewritten to an ASCII apostrophe. */
    private static final Pattern SINGLE_QUOTE_VARIANTS = Pattern.compile("[‘’´`]");
    /** U+201C and U+201D: rewritten to an ASCII double quote. */
    private static final Pattern DOUBLE_QUOTE_VARIANTS = Pattern.compile("[“”]");

    /** Name of the list; doubles as the emitted feature name. */
    String Listname = "";
    /** Entries of the list as returned by {@code initDict}. */
    HashSet<String> members;

    /**
     * @param str name of the word list to load; also used as the feature name
     * @throws IOException if {@code initDict} fails to read the list
     */
    public Listofnames(String str) throws IOException {
        Listname = str;
        this.members = initDict(Listname);
    }

    /**
     * Adds the feature {@code Listname} at each position whose normalized token
     * is contained in the list.
     *
     * @param tokens the tokens of one sentence/tweet, in order
     * @param pairs  sink receiving (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        for (int t = 0; t < tokens.size(); t++) {
            String lowered = tokens.get(t).toLowerCase();
            String apostrophesFixed = SINGLE_QUOTE_VARIANTS.matcher(lowered).replaceAll("'");
            String normaltok = DOUBLE_QUOTE_VARIANTS.matcher(apostrophesFixed).replaceAll("\"");
            if (members.contains(normaltok)) {
                pairs.add(t, Listname);
            }
        }
    }
}
/**
 * Feature extractor that looks tokens up in {@code TagDictionary.WORD_TO_POS}
 * by their phonetic encoding rather than their spelling.
 *
 * <p>For each token longer than one character, the encoder obtained from
 * {@code MetaphoneFeatures.getMetaphone()} is applied and the dictionary is
 * queried with the key {@code "**MP**" + encoding}. Every tag stored under that
 * key is emitted as {@code "metaph_POSDict|" + tag} at the token's position.
 */
public static class MetaphonePOSDict implements FeatureExtractorInterface {
    /**
     * @param tokens the tokens of one sentence/tweet, in order
     * @param pairs  sink receiving (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        for (int t = 0; t < tokens.size(); t++) {
            String tok = tokens.get(t);
            if (tok.length() <= 1) {
                continue; // single-character (and empty) tokens are not encoded
            }
            // Disabled: tok=tok.replace("4", "four").replace("2", "two").replace("3","three");
            String metaphone_word = MetaphoneFeatures.getMetaphone().encode(tok);
            String key = "**MP**" + metaphone_word;

            // One lookup instead of containsKey + get; a null value is treated as "no entry"
            // rather than causing a NullPointerException in the loop below.
            List<String> poses = TagDictionary.WORD_TO_POS.get(key);
            if (poses == null) {
                continue;
            }
            for (String pos : poses) {
                pairs.add(t, "metaph_POSDict|" + pos);
            }
        }
    }
}
/**
 * Loads a word list from the classpath resource {@code "/cmu/arktweetnlp/" + dict}
 * into a set, one entry per line, lower-cased.
 *
 * @param dict resource file name, relative to {@code /cmu/arktweetnlp/}
 * @return the lower-cased lines of the resource
 * @throws IOException if the reader is unavailable or closing it fails
 */
private static HashSet<String> initDict(String dict) throws IOException {
    String resource = "/cmu/arktweetnlp/" + dict;
    BufferedReader bReader = BasicFileIO.getResourceReader(resource);
    // Defensive: if the helper signals a missing resource by returning null,
    // fail with a descriptive error instead of a bare NullPointerException.
    if (bReader == null) {
        throw new IOException("Word list resource not found: " + resource);
    }
    try {
        HashSet<String> dictset = new HashSet<String>();
        // BasicFileIO.getLine is read until it returns null, which ends the loop.
        for (String line = BasicFileIO.getLine(bReader); line != null; line = BasicFileIO.getLine(bReader)) {
            dictset.add(line.toLowerCase());
        }
        return dictset;
    } finally {
        // The reader was previously never closed; release it on every exit path.
        bReader.close();
    }
}
}