edu.berkeley.nlp.discPCFG.LexiconFeatureExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation
The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).
The newest version!
/**
 * 
 */
package edu.berkeley.nlp.discPCFG;

import java.util.List;
import java.util.Locale;

import edu.berkeley.nlp.classify.FeatureExtractor;
import edu.berkeley.nlp.util.Counter;

/**
 * Feature extractor that turns one word of a sentence into a set of
 * {@link LexiconFeature}s: an identity feature for the word itself plus
 * coarse word-shape features (capitalization, digits, dashes and a small set
 * of suffixes).
 * 
 * @author adpauls
 * 
 */
public class LexiconFeatureExtractor implements
		FeatureExtractor {

	/**
	 * Builds the feature counter for the word at the position carried by
	 * {@code sentence}.
	 * <p>
	 * The result always contains the word-identity feature. At most one
	 * capitalization feature (INIT_CAP, ALL_CAPS or LOWER_CASE) is added,
	 * followed by HAS_DIGIT and HAS_DASH where applicable, and at most one
	 * suffix feature. Every feature is added with a count of 1.0.
	 * <p>
	 * An empty word yields only the word-identity feature, since there are no
	 * characters to derive shape features from.
	 * 
	 * @param sentence
	 *            the word-in-sentence instance; {@code getFirst()} supplies
	 *            the words and {@code getSecond()} the index of the word to
	 *            featurize
	 * @return a counter holding the extracted features
	 * @see edu.berkeley.nlp.classify.FeatureExtractor#extractFeatures(java.lang.Object)
	 */
	public Counter extractFeatures(
			WordInSentence sentence) {

		int loc = sentence.getSecond();
		String word = sentence.getFirst().get(loc);
		Counter counter = new Counter();
		counter.incrementCount(new LexiconFeature(word), 1.0);

		int wlen = word.length();
		if (wlen == 0) {
			// Nothing to inspect; also avoids charAt(0) failing below.
			return counter;
		}

		// Word-shape scheme (reformed Mar 2004, cdm):
		// { ALL_CAPS, INIT_CAP, LOWER_CASE, nothing } +
		// { HAS_DIGIT, nothing } +
		// { HAS_DASH, nothing } +
		// { one known discriminating suffix of the lowercased word, nothing }
		int numCaps = 0;
		boolean hasDigit = false;
		boolean hasDash = false;
		boolean hasLower = false;
		for (int i = 0; i < wlen; i++) {
			char ch = word.charAt(i);
			if (Character.isDigit(ch)) {
				hasDigit = true;
			} else if (ch == '-') {
				hasDash = true;
			} else if (Character.isLetter(ch)) {
				if (Character.isLowerCase(ch)) {
					hasLower = true;
				} else if (Character.isTitleCase(ch)) {
					// A titlecase letter counts as both a capital and a
					// lowercase letter.
					hasLower = true;
					numCaps++;
				} else {
					numCaps++;
				}
			}
		}

		char ch0 = word.charAt(0);
		// Locale.ROOT keeps the ASCII suffix tests below independent of the
		// JVM default locale (e.g. a Turkish locale lowercases 'I' to a
		// dotless i, which would defeat the "ing"/"ion"/"ity" checks).
		String lowered = word.toLowerCase(Locale.ROOT);

		if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
			if (loc == 0 && numCaps == 1) {
				// Sentence-initial word whose only capital is its first
				// letter. (A "known lowercase form" feature was once
				// considered here but is not emitted.)
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.INIT_CAP), 1.0);
			} else {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.ALL_CAPS), 1.0);
			}
		} else if (!Character.isLetter(ch0) && numCaps > 0) {
			// Starts with a non-letter but contains capitals somewhere.
			counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.ALL_CAPS), 1.0);
		} else if (hasLower) {
			counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.LOWER_CASE), 1.0);
		}

		if (hasDigit) {
			counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.HAS_DIGIT), 1.0);
		}
		if (hasDash) {
			counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.HAS_DASH), 1.0);
		}

		// Index into the lowercased string with its own length: lowercasing
		// can change the number of chars for some Unicode input.
		int loweredLen = lowered.length();
		if (lowered.endsWith("s") && loweredLen >= 3) {
			// Minimum length 3, so that forms like "80s" are still covered.
			char penultimate = lowered.charAt(loweredLen - 2);
			// Skip -ss endings and Greek/Latin -us, -is.
			if (penultimate != 's' && penultimate != 'i' && penultimate != 'u') {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_S), 1.0);
			}
		} else if (wlen >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
			// Common discriminating suffixes; skipped for very short words.
			// The first matching suffix wins, so the order of the tests
			// matters (e.g. "ly" and "ity" are tried before the bare "y").
			if (lowered.endsWith("ed")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ED), 1.0);
			} else if (lowered.endsWith("ing")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ING), 1.0);
			} else if (lowered.endsWith("ion")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ION), 1.0);
			} else if (lowered.endsWith("er")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ER), 1.0);
			} else if (lowered.endsWith("est")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_EST), 1.0);
			} else if (lowered.endsWith("ly")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_LY), 1.0);
			} else if (lowered.endsWith("ity")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ITY), 1.0);
			} else if (lowered.endsWith("y")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_Y), 1.0);
			} else if (lowered.endsWith("al")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_AL), 1.0);
			}
		}
		return counter;
	}
}