// cmu.arktweetnlp.impl.features.MiscFeatures -- Maven / Gradle / Ivy
// The newest version!
package cmu.arktweetnlp.impl.features;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import cmu.arktweetnlp.Twokenize;
import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;
public class MiscFeatures {
/**
 * Look-ahead features: for each position, the following token (raw and as
 * returned by {@code FeatureUtil.normalize}) and the pair of the current and
 * following normalized tokens. The last position gets end-of-sequence
 * variants with an empty "next" slot.
 */
public static class NextWord implements FeatureExtractorInterface{
    /**
     * Adds the next-word features for every position of {@code tokens}.
     *
     * @param tokens the token sequence; an empty sequence adds nothing
     * @param pairs  collector the (position, feature) entries are added to
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        if (tokens.isEmpty()) {
            // The end-of-sequence features below address position size()-1,
            // which would be -1 here and make normtoks.get(-1) throw.
            return;
        }
        List<String> normtoks = FeatureUtil.normalize(tokens);
        int last = tokens.size() - 1;
        for (int t = 0; t < last; t++) {
            // The two nextword variants are added with an explicit third
            // argument of .5; the currnext pair uses the two-argument overload.
            pairs.add(t, "nextword|"+tokens.get(t+1), .5);
            pairs.add(t, "nextword|"+normtoks.get(t+1), .5);
            pairs.add(t, "currnext|"+normtoks.get(t)+"|"+normtoks.get(t+1));
        }
        // Final token: nothing follows it, so the "next" part is left empty.
        pairs.add(last, "currnext|"+normtoks.get(last)+"|");
        pairs.add(last, "nextword|");
    }
}
/**
 * Features built from the two tokens following each position, in both raw
 * and normalized form. Sequences with fewer than two tokens produce nothing.
 */
public static class Next2Words implements FeatureExtractorInterface{
    /**
     * Adds "next2words|..." features. Positions 0..size-3 see two following
     * tokens; position size-2 sees only the final token followed by an empty
     * slot; the final position receives no feature from this extractor.
     */
    public void addFeatures(List tokens, PositionFeaturePairs pairs) {
        if (tokens.size() <= 1) {
            return;
        }
        ArrayList norm = FeatureUtil.normalize(tokens);
        int pos = 0;
        while (pos < tokens.size() - 2) {
            String rawPair = "next2words|" + tokens.get(pos + 1) + "|" + tokens.get(pos + 2);
            pairs.add(pos, rawPair, .5);
            String normPair = "next2words|" + norm.get(pos + 1) + "|" + norm.get(pos + 2);
            pairs.add(pos, normPair, .5);
            pos++;
        }
        // Second-to-last position: only one token follows it.
        int finalIdx = tokens.size() - 1;
        int penultimate = tokens.size() - 2;
        pairs.add(penultimate, "next2words|" + norm.get(finalIdx) + "|", .5);
        pairs.add(penultimate, "next2words|" + tokens.get(finalIdx) + "|", .5);
    }
}
/**
 * Look-behind features: for each position, the preceding token (raw and as
 * returned by {@code FeatureUtil.normalize}) and the pair of the preceding
 * and current normalized tokens. Position 0 gets start-of-sequence variants
 * with an empty "previous" slot.
 */
public static class PrevWord implements FeatureExtractorInterface{
    /**
     * Adds the previous-word features for every position of {@code tokens}.
     *
     * @param tokens the token sequence; an empty sequence adds nothing
     * @param pairs  collector the (position, feature) entries are added to
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        if (tokens.isEmpty()) {
            // Position 0 does not exist; the start-of-sequence features below
            // would otherwise read element 0 of the normalized list.
            return;
        }
        List<String> normtoks = FeatureUtil.normalize(tokens);
        // First token: nothing precedes it, so the "previous" part is empty.
        pairs.add(0, "prevword|");
        pairs.add(0, "prevcurr||"+normtoks.get(0));
        for (int t = 1; t < tokens.size(); t++) {
            pairs.add(t, "prevword|"+tokens.get(t-1));
            pairs.add(t, "prevword|"+normtoks.get(t-1));
            pairs.add(t, "prevcurr|"+normtoks.get(t-1)+"|"+normtoks.get(t));
        }
    }
}
/**
 * Features built from the two tokens preceding each position, in both raw
 * and normalized form. Sequences with fewer than two tokens produce nothing.
 */
public static class Prev2Words implements FeatureExtractorInterface{
    /**
     * Adds "prev2words|..." features. Position 0 receives nothing. Position 1
     * receives a single feature with an empty leading slot followed by the
     * normalized tokens at indices 0 and 1. Every later position receives the
     * raw and the normalized pair of its two predecessors.
     */
    public void addFeatures(List tokens, PositionFeaturePairs pairs) {
        if (tokens.size() <= 1) {
            return;
        }
        ArrayList norm = FeatureUtil.normalize(tokens);
        // NOTE(review): unlike the loop below, this entry includes the token at
        // position 1 itself; kept exactly as is because feature strings are
        // part of the runtime behavior.
        pairs.add(1, "prev2words||" + norm.get(0) + "|" + norm.get(1));
        int pos = 2;
        while (pos < tokens.size()) {
            int twoBack = pos - 2;
            int oneBack = pos - 1;
            pairs.add(pos, "prev2words|" + tokens.get(twoBack) + "|" + tokens.get(oneBack));
            pairs.add(pos, "prev2words|" + norm.get(twoBack) + "|" + norm.get(oneBack));
            pos++;
        }
    }
}
/**
 * Context features that pair the normalized token before a position with the
 * normalized token after it, skipping the token at the position itself.
 */
public static class PrevNext implements FeatureExtractorInterface{
    /**
     * Adds one "prevnext|prev|next" feature per position when there are at
     * least two tokens. The first position has an empty "prev" slot and the
     * last position an empty "next" slot. A variant that also included the
     * current token ("prevcurrnext") was disabled in the original code and is
     * not emitted.
     */
    public void addFeatures(List tokens, PositionFeaturePairs pairs) {
        // Normalization happens before the size check, as in the original.
        ArrayList norm = FeatureUtil.normalize(tokens);
        if (tokens.size() <= 1) {
            return;
        }
        pairs.add(0, "prevnext||" + norm.get(1));
        int mid = 1;
        while (mid < norm.size() - 1) {
            pairs.add(mid, "prevnext|" + norm.get(mid - 1) + "|" + norm.get(mid + 1));
            mid++;
        }
        int lastPos = norm.size() - 1;
        pairs.add(lastPos, "prevnext|" + norm.get(tokens.size() - 2) + "|");
    }
}
/**
 * Emits one capitalization-pattern feature per token that contains at least
 * one letter. Tokens without any letter get no feature from this extractor.
 */
public static class CapitalizationFeatures implements FeatureExtractorInterface {
    /**
     * Classifies each token's capitalization and adds the resulting label,
     * prefixed with "pos-" when the token ends in {@code 's}, otherwise with
     * "first-" when it is the first token of the sequence.
     *
     * @param tokens the token sequence
     * @param pairs  collector the (position, feature) entries are added to
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        for (int t = 0; t < tokens.size(); t++) {
            String tok = tokens.get(t);
            int numChar = 0;
            int numCap = 0;
            for (int i = 0; i < tok.length(); i++) {
                if (Character.isLetter(tok.charAt(i))) {
                    numChar++;
                }
                if (Character.isUpperCase(tok.charAt(i))) {
                    numCap++;
                }
            }
            if (numChar < 1) {
                // No letters at all: nothing to say about capitalization.
                continue;
            }
            String caplabel = capLabel(tok, numChar, numCap);
            // The possessive marker takes precedence over the first-token marker.
            if (tok.endsWith("'s")) {
                caplabel = "pos-" + caplabel;
            } else if (t == 0) {
                caplabel = "first-" + caplabel;
            }
            pairs.add(t, caplabel);
        }
    }

    /**
     * Chooses the base label from the letter and uppercase counts:
     * <pre>
     *   A     => shortcap
     *   HELLO => longcap
     *   Hello => initcap
     *   HeLLo => mixcap
     * </pre>
     * Anything else is "nocap". Tokens starting with '@' or "http://" are
     * never labelled mixcap.
     */
    private static String capLabel(String tok, int numChar, int numCap) {
        boolean allCap = numChar == numCap;
        if (allCap && numChar <= 1) {
            return "shortcap";
        }
        if (allCap && numChar >= 2) {
            return "longcap";
        }
        if (!allCap && numChar >= 2 && Character.isUpperCase(tok.charAt(0)) && numCap == 1) {
            return "initcap";
        }
        // numCap >= 1 guarantees tok is non-empty before charAt(0) is evaluated.
        if (numCap >= 1 && numChar >= 2 && tok.charAt(0) != '@' && !tok.startsWith("http://")) {
            return "mixcap";
        }
        return "nocap";
    }
}
/**
 * Simple orthographic indicator features: digits, leading '@' or '#',
 * emoticons, and hyphenation.
 */
public static class SimpleOrthFeatures implements FeatureExtractorInterface {
    /** Matches any token containing an ASCII digit. */
    public Pattern hasDigit = Pattern.compile("[0-9]");
    /** TODO change to punctuation class, or better from Twokenize **/
    //Pattern allPunct = Pattern.compile("^[^a-zA-Z0-9]*$");
    // Currently not used by addFeatures (the AllPunct feature is disabled).
    Pattern allPunct = Pattern.compile("^\\W*$");
    /** Compiled from the emoticon expression supplied by Twokenize. */
    Pattern emoticon = Pattern.compile(Twokenize.emoticon);

    /**
     * Adds the orthographic indicators that apply to each token.
     *
     * @param tokens the token sequence
     * @param pairs  collector the (position, feature) entries are added to
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        for (int t = 0; t < tokens.size(); t++) {
            String tok = tokens.get(t);
            if (hasDigit.matcher(tok).find()) {
                pairs.add(t, "HasDigit");
            }
            // startsWith gives the same answer as charAt(0) for non-empty
            // tokens and does not throw on an empty one.
            if (tok.startsWith("@")) {
                pairs.add(t, "InitAt");
            }
            if (tok.startsWith("#")) {
                pairs.add(t, "InitHash");
            }
            if (emoticon.matcher(tok).matches()) {
                pairs.add(t, "Emoticon");
            }
            if (tok.contains("-")) {
                pairs.add(t, "Hyphenated");
                // Split at the first hyphen only, giving at most two parts,
                // e.g. a leading "-Ben" style quote or a trailing "-esque".
                String[] splithyph = FeatureUtil.normalize(tok).split("-", 2);
                for (String part : splithyph) {
                    pairs.add(t, "hyph|" + part);
                }
            }
        }
    }
}
/**
 * Flags tokens that are, in their entirety, a URL or an e-mail address
 * according to the expressions supplied by Twokenize.
 */
public static class URLFeatures implements FeatureExtractorInterface {
    Pattern validURL = Pattern.compile(Twokenize.url);
    Pattern validEmail = Pattern.compile(Twokenize.Email);

    /**
     * Adds "validURL" for every token fully matched by the URL pattern and
     * again for every token fully matched by the e-mail pattern.
     *
     * @param tokens the token sequence
     * @param pairs  collector the (position, feature) entries are added to
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        for (int t = 0; t < tokens.size(); t++) {
            String tok = tokens.get(t);
            if (validURL.matcher(tok).matches()) {
                pairs.add(t, "validURL");
            }
            // NOTE(review): e-mail matches share the "validURL" feature name.
            // Left unchanged because the feature string is runtime behavior;
            // confirm whether a separate name was intended.
            if (validEmail.matcher(tok).matches()) {
                pairs.add(t, "validURL");
            }
        }
    }
}
public static class WordformFeatures implements FeatureExtractorInterface {
/**
 * Adds word-identity and word-shape features for every token: the token with
 * curly quotes, acute accents and backticks unified to straight quotes
 * ("Word|"), that form passed through {@code FeatureUtil.normalize}
 * ("Lower|"), its Xxd shape, and a character-class shape computed from the
 * untouched token. The two shape features are added with a third argument
 * of .5.
 *
 * @param tokens the token sequence
 * @param pairs  collector the (position, feature) entries are added to
 */
public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
    for (int t = 0; t < tokens.size(); t++) {
        String tok = tokens.get(t);
        // Unify typographic apostrophes and double quotes before building features.
        String normalizedtok = tok.replaceAll("[‘’´`]", "'").replaceAll("[“”]", "\"");
        pairs.add(t, "Word|" + normalizedtok);
        pairs.add(t, "Lower|" + FeatureUtil.normalize(normalizedtok));
        pairs.add(t, "Xxdshape|" + Xxdshape(normalizedtok), .5);
        // The character-class shape is taken from the original token, not the quote-unified one.
        pairs.add(t, "charclass|" + charclassshape(tok), .5);
    }
}
/**
 * Collapses a token to its shape: ASCII lowercase letters become 'x', ASCII
 * digits become 'd', ASCII uppercase letters become 'X'; every other
 * character is kept unchanged.
 */
private String Xxdshape(String tok) {
    StringBuilder shape = new StringBuilder(tok.length());
    for (int i = 0; i < tok.length(); i++) {
        char c = tok.charAt(i);
        if (c >= 'a' && c <= 'z') {
            shape.append('x');
        } else if (c >= '0' && c <= '9') {
            shape.append('d');
        } else if (c >= 'A' && c <= 'Z') {
            shape.append('X');
        } else {
            shape.append(c);
        }
    }
    return shape.toString();
}
private String charclassshape(String tok) {
StringBuilder sb = new StringBuilder(3 * tok.length());
for(int i=0; i tokens, PositionFeaturePairs pairs) {
for (int t=0; t < tokens.size(); t++) {
String tok = FeatureUtil.normalize(tokens.get(t).replaceAll("[‘’´`]", "'").replaceAll("[“”]", "\""));
int l=tok.length();
for(int i=1;i<=ngram;i++){
if(l>=i){
pairs.add(t, i+"gramPref|"+tok.substring(0, i));
}
else break;
}
}
}
}
public static class NgramSuffix implements FeatureExtractorInterface {
int ngram=3;
/**
 * Creates a suffix extractor.
 *
 * @param i value stored in the {@code ngram} field, replacing its default
 */
public NgramSuffix(int i) {
    this.ngram = i;
}
public void addFeatures(List tokens, PositionFeaturePairs pairs) {
for (int t=0; t < tokens.size(); t++) {
String tok = FeatureUtil.normalize(tokens.get(t).replaceAll("[‘’´`]", "'").replaceAll("[“”]", "\""));
int l=tok.length();
for(int i=1;i<=ngram;i++){
if(l>=i){
pairs.add(t, i+"gramSuff|"+tok.substring(l-i, l));
/*if (t tokens, PositionFeaturePairs pairs) {
for (int t=0; t < Math.min(tokens.size(), 4); t++) {
pairs.add(t, "t="+t);
}
for (int t=tokens.size()-1; t > Math.max(tokens.size()-4, -1); t--) {
pairs.add(t, "t=-"+t);
}
}
}
}