All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cmu.arktweetnlp.impl.features.MetaphoneFeatures Maven / Gradle / Ivy

The newest version!
package cmu.arktweetnlp.impl.features;

import java.util.List;

import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;

import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;

/**
 * Feature extractors built on phonetic encodings of tokens, using the
 * Apache Commons Codec {@link Metaphone} and {@link DoubleMetaphone} encoders.
 *
 * <p>We should test if these are obsolete yet.
 */
public class MetaphoneFeatures {

	/** Maximum code length configured on both shared encoders. */
	private static final int MAX_CODE_LEN = 100;

	// The encoders are created and fully configured during class initialization,
	// so no caller can ever observe an instance whose max code length has not
	// been set yet (the previous lazy initialization published the instance
	// before configuring it).
	private static final Metaphone METAPHONE = newMetaphone();
	private static final DoubleMetaphone DOUBLE_METAPHONE = newDoubleMetaphone();

	/** Builds a {@link Metaphone} encoder with the max code length set to {@link #MAX_CODE_LEN}. */
	private static Metaphone newMetaphone() {
		Metaphone encoder = new Metaphone();
		encoder.setMaxCodeLen(MAX_CODE_LEN);
		return encoder;
	}

	/** Builds a {@link DoubleMetaphone} encoder with the max code length set to {@link #MAX_CODE_LEN}. */
	private static DoubleMetaphone newDoubleMetaphone() {
		DoubleMetaphone encoder = new DoubleMetaphone();
		encoder.setMaxCodeLen(MAX_CODE_LEN);
		return encoder;
	}

	/**
	 * Returns the shared {@link Metaphone} encoder.
	 *
	 * @return the single shared instance, configured with a max code length of 100
	 */
	public static Metaphone getMetaphone() {
		return METAPHONE;
	}

	/**
	 * Returns the shared {@link DoubleMetaphone} encoder.
	 *
	 * @return the single shared instance, configured with a max code length of 100
	 */
	public static DoubleMetaphone getDblMetaphone() {
		return DOUBLE_METAPHONE;
	}

	/**
	 * Spells out digits at the edges of a token: a trailing {@code '1'} becomes
	 * {@code "one"}, and a leading {@code '1'}, {@code '2'} or {@code '4'} becomes
	 * {@code "one"}, {@code "two"} or {@code "four"} respectively.
	 *
	 * <p>Currently not called anywhere in this class (its only use is commented
	 * out in {@link MetaphoneLexical}).
	 *
	 * @param str the token to rewrite; may be {@code null} or empty
	 * @return the rewritten token, or {@code str} itself when it is {@code null} or empty
	 */
	private static String MetaphoneNum(String str) { //change this eventually
		if (str == null || str.isEmpty()) {
			return str;
		}
		StringBuilder sb = new StringBuilder(str);
		int last = str.length() - 1;
		// Only treat the final character as "trailing" when it is not also the
		// first one; a single-character token is handled by the leading-digit
		// branch below, so "1" becomes "one" rather than being expanded twice.
		if (last > 0 && str.charAt(last) == '1') {
			sb.replace(last, last + 1, "one");
		}
		// Index 0 of the builder still corresponds to index 0 of str here,
		// because the edit above only touched the end of the builder.
		switch (str.charAt(0)) {
			case '1':
				sb.replace(0, 1, "one");
				break;
			case '2':
				sb.replace(0, 1, "two");
				break;
			case '4':
				sb.replace(0, 1, "four");
				break;
			default:
				break;
		}
		return sb.toString();
	}

	/**
	 * Adds Double Metaphone encodings of each token as lexical features.
	 */
	public static class MetaphoneLexical implements FeatureExtractorInterface {

		/**
		 * For every token longer than one character, adds a
		 * {@code "metaphone_word|<code>"} feature at the token's position for the
		 * value returned by {@code encode}, and a second one for the value returned
		 * by {@code doubleMetaphone(tok, true)} when the two differ.
		 *
		 * @param tokens the tokens of one sentence, indexed by position
		 * @param pairs  the collector that receives (position, feature) pairs
		 */
		public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
			DoubleMetaphone encoder = getDblMetaphone();
			for (int t = 0; t < tokens.size(); t++) {
				String tok = tokens.get(t);
				if (tok.length() <= 1) {
					continue;
				}
				//String ppword=MetaphoneNum(tok);
				String primary = encoder.encode(tok);
				if (primary == null) {
					// The encoder yields null for input that is empty once trimmed;
					// there is nothing to emit, and comparing below would throw.
					continue;
				}
				String alternate = encoder.doubleMetaphone(tok, true);
				pairs.add(t, "metaphone_word|" + primary);
				if (!primary.equals(alternate)) {
					pairs.add(t, "metaphone_word|" + alternate);
				}
			}
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy