All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.berkeley.nlp.discPCFG.LexiconFeatureExtractor Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
/**
 * 
 */
package edu.berkeley.nlp.discPCFG;

import java.util.List;
import java.util.Locale;

import edu.berkeley.nlp.classify.FeatureExtractor;
import edu.berkeley.nlp.util.Counter;

/**
 * Extracts lexical features for a single word of a sentence: the word's
 * identity plus surface-form indicators for capitalization, digits, dashes
 * and a fixed set of word-final suffixes.
 * 
 * NOTE(review): FeatureExtractor and Counter are used without type arguments
 * here although the method takes a WordInSentence; the generic parameters may
 * have been lost in this copy of the file -- confirm against upstream.
 * 
 * @author adpauls
 * 
 */
public class LexiconFeatureExtractor implements
		FeatureExtractor {

	/**
	 * Builds the feature counts for the word that {@code sentence} points at.
	 * 
	 * The returned counter always contains the word-identity feature. For a
	 * non-empty word it additionally receives, each with count 1.0:
	 * <ul>
	 * <li>at most one case feature (INIT_CAP, ALL_CAPS or LOWER_CASE);</li>
	 * <li>HAS_DIGIT and/or HAS_DASH when such a character occurs;</li>
	 * <li>at most one suffix feature (SUFF_S, or one of the longer suffixes
	 * for words of five or more characters).</li>
	 * </ul>
	 * An empty word yields only the word-identity feature.
	 * 
	 * @param sentence
	 *            supplies the token list via {@code getFirst()} and the index
	 *            of the target token via {@code getSecond()}
	 * @return the counter of features that fired for the target word
	 * @see edu.berkeley.nlp.classify.FeatureExtractor#extractFeatures(java.lang.Object)
	 */
	public Counter extractFeatures(
			WordInSentence sentence) {

		int loc = sentence.getSecond();
		String word = sentence.getFirst().get(loc);
		Counter counter = new Counter();
		counter.incrementCount(new LexiconFeature(word), 1.0);

		int wlen = word.length();
		if (wlen == 0) {
			// Nothing to inspect; the charAt(0) below would otherwise throw.
			return counter;
		}

		// Reformed Mar 2004 (cdm); hopefully much better now.
		// { -CAPS, -INITC ap, -LC lowercase, 0 } +
		// { -KNOWNLC, 0 } + [only for INITC; not emitted by this class]
		// { -NUM, 0 } +
		// { -DASH, 0 } +
		// { -last lowered char(s) if known discriminating suffix, 0}

		// Single pass over the characters to collect the surface statistics.
		int numCaps = 0;
		boolean hasDigit = false;
		boolean hasDash = false;
		boolean hasLower = false;
		for (int i = 0; i < wlen; i++) {
			char ch = word.charAt(i);
			if (Character.isDigit(ch)) {
				hasDigit = true;
			} else if (ch == '-') {
				hasDash = true;
			} else if (Character.isLetter(ch)) {
				if (Character.isLowerCase(ch)) {
					hasLower = true;
				} else if (Character.isTitleCase(ch)) {
					// A title-case letter counts as both a lower-case and a
					// capital letter.
					hasLower = true;
					numCaps++;
				} else {
					// Any other letter (upper-case or caseless) counts as a
					// capital.
					numCaps++;
				}
			}
		}

		char ch0 = word.charAt(0);
		// Locale.ROOT keeps the suffix tests below independent of the JVM's
		// default locale (e.g. Turkish lowering of 'I').
		String lowered = word.toLowerCase(Locale.ROOT);

		// Case feature: at most one of INIT_CAP / ALL_CAPS / LOWER_CASE.
		if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
			if (loc == 0 && numCaps == 1) {
				// Sentence-initial word whose only capital is its first letter.
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.INIT_CAP), 1.0);
			} else {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.ALL_CAPS), 1.0);
			}
		} else if (!Character.isLetter(ch0) && numCaps > 0) {
			// Starts with a non-letter but contains capitals further in.
			counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.ALL_CAPS), 1.0);
		} else if (hasLower) {
			counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.LOWER_CASE), 1.0);
		}

		if (hasDigit) {
			counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.HAS_DIGIT), 1.0);
		}
		if (hasDash) {
			counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.HAS_DASH), 1.0);
		}

		// Suffix feature: at most one fires.
		if (lowered.endsWith("s") && wlen >= 3) {
			// here length 3, so you don't miss out on ones like 80s
			// Index by the lowered string's own length: lowering can change
			// the number of chars, so wlen may not line up with it.
			char ch2 = lowered.charAt(lowered.length() - 2);
			// not -ess suffixes or greek/latin -us, -is
			if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_S), 1.0);
			}
		} else if (wlen >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
			// don't do for very short words;
			// Implement common discriminating suffixes. The first matching
			// test wins, so the order of the chain matters.
			if (lowered.endsWith("ed")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ED), 1.0);
			} else if (lowered.endsWith("ing")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ING), 1.0);
			} else if (lowered.endsWith("ion")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ION), 1.0);
			} else if (lowered.endsWith("er")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ER), 1.0);
			} else if (lowered.endsWith("est")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_EST), 1.0);
			} else if (lowered.endsWith("ly")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_LY), 1.0);
			} else if (lowered.endsWith("ity")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_ITY), 1.0);
			} else if (lowered.endsWith("y")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_Y), 1.0);
			} else if (lowered.endsWith("al")) {
				counter.incrementCount(new LexiconFeature(LexiconFeature.MorphFeature.SUFF_AL), 1.0);
			}
		}
		return counter;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy