
/*******************************************************************************
* Copyright 2015
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.tc.features.ngram.util;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import static org.apache.uima.fit.util.JCasUtil.toText;
import static org.dkpro.tc.core.Constants.NGRAM_GLUE;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
import org.apache.commons.codec.language.ColognePhonetic;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.CharacterNGramStringIterable;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable;
import org.dkpro.tc.api.exception.TextClassificationException;
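
/**
 * Utility methods for extracting token, POS, phonetic, character, and skip ngram frequency
 * distributions from a {@link JCas}.
 *
 * <p>
 * A minimal usage sketch, assuming a JCas that has already been segmented into {@link Sentence}
 * and {@link Token} annotations (the pipeline producing it is not shown here):
 * </p>
 *
 * <pre>{@code
 * Set<String> stopwords = Collections.emptySet();
 * FrequencyDistribution<String> fd = NGramUtils.getDocumentNgrams(
 *         jcas, true, false, 1, 3, stopwords);
 * for (String ngram : fd.getKeys()) {
 *     System.out.println(ngram + "\t" + fd.getCount(ngram));
 * }
 * }</pre>
 */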
public class NGramUtils
{
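    /**
     * Convenience method to return ngrams over the tokens covered by the given annotation when
     * there is no stopword list.
     */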
    public static FrequencyDistribution<String> getAnnotationNgrams(JCas jcas,
            Annotation focusAnnotation, boolean lowerCaseNGrams, boolean filterPartialMatches,
            int minN, int maxN)
    {
        Set<String> empty = Collections.emptySet();
        return getAnnotationNgrams(jcas, focusAnnotation, lowerCaseNGrams, filterPartialMatches,
                minN, maxN, empty);
    }
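
    /**
     * Returns token ngrams over the span of the given focus annotation. If the annotation covers
     * sentence annotations, the ngrams are extracted sentence-wise; otherwise they are built from
     * all tokens covered by the annotation.
     */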
    public static FrequencyDistribution<String> getAnnotationNgrams(JCas jcas,
            Annotation focusAnnotation, boolean lowerCaseNGrams, boolean filterPartialMatches,
            int minN, int maxN, Set<String> stopwords)
    {
        FrequencyDistribution<String> annoNgrams = new FrequencyDistribution<String>();

        // If the focusAnnotation contains sentence annotations, extract the ngrams sentence-wise;
        // if not, extract them from all tokens in the focusAnnotation.
        if (JCasUtil.selectCovered(jcas, Sentence.class, focusAnnotation).size() > 0) {
            for (Sentence s : selectCovered(jcas, Sentence.class, focusAnnotation)) {
                for (List<String> ngram : new NGramStringListIterable(
                        toText(selectCovered(Token.class, s)), minN, maxN)) {
                    if (lowerCaseNGrams) {
                        ngram = lower(ngram);
                    }
                    if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                        String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                        annoNgrams.inc(ngramString);
                    }
                }
            }
        }
        // FIXME the focus annotation branch doesn't make much sense
        else {
            for (List<String> ngram : new NGramStringListIterable(
                    toText(selectCovered(Token.class, focusAnnotation)), minN, maxN)) {
                if (lowerCaseNGrams) {
                    ngram = lower(ngram);
                }
                if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                    String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                    annoNgrams.inc(ngramString);
                }
            }
        }
        return annoNgrams;
    }

    /**
     * Convenience method to return document ngrams when there is no stopword list.
     *
     * @param jcas
     *            the JCas to extract ngrams from
     * @param lowerCaseNGrams
     *            whether the ngrams should be lower-cased
     * @param filterPartialMatches
     *            whether ngrams containing stopwords should be filtered entirely
     * @param minN
     *            minimum ngram length
     * @param maxN
     *            maximum ngram length
     * @return a frequency distribution of the document's token ngrams
     * @throws TextClassificationException
     *             if the ngrams cannot be extracted
     */
    public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN)
        throws TextClassificationException
    {
        Set<String> empty = Collections.emptySet();
        return getDocumentNgrams(jcas, lowerCaseNGrams, filterPartialMatches, minN, maxN, empty);
    }

    /**
     * Convenience method to return document ngrams over {@link Token}s.
     *
     * @param jcas
     *            the JCas to extract ngrams from
     * @param lowerCaseNGrams
     *            whether the ngrams should be lower-cased
     * @param filterPartialMatches
     *            whether ngrams containing stopwords should be filtered entirely
     * @param minN
     *            minimum ngram length
     * @param maxN
     *            maximum ngram length
     * @param stopwords
     *            the set of stopwords used for filtering
     * @return a frequency distribution of the document's token ngrams
     * @throws TextClassificationException
     *             if the ngrams cannot be extracted
     */
    public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN,
            Set<String> stopwords)
        throws TextClassificationException
    {
        return getDocumentNgrams(jcas, lowerCaseNGrams, filterPartialMatches, minN, maxN,
                stopwords, Token.class);
    }

    /**
     * Returns document ngrams over any annotation type that extends {@link Annotation}. Intended
     * for use with Lemma, Stem, etc.
     *
     * @param jcas
     *            the JCas to extract ngrams from
     * @param lowerCaseNGrams
     *            whether the ngrams should be lower-cased
     * @param filterPartialMatches
     *            whether ngrams containing stopwords should be filtered entirely
     * @param minN
     *            minimum ngram length
     * @param maxN
     *            maximum ngram length
     * @param stopwords
     *            the set of stopwords used for filtering
     * @param annotationClass
     *            annotation type of the ngram
     * @return a frequency distribution of the document's ngrams over the given annotation type
     * @throws TextClassificationException
     *             if the feature path values cannot be extracted
     */
    public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN,
            Set<String> stopwords, Class<? extends Annotation> annotationClass)
        throws TextClassificationException
    {
        FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
        for (Sentence s : select(jcas, Sentence.class)) {
            List<String> strings = valuesToText(jcas, s, annotationClass.getName());
            for (List<String> ngram : new NGramStringListIterable(strings, minN, maxN)) {
                if (lowerCaseNGrams) {
                    ngram = lower(ngram);
                }
                if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                    String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                    documentNgrams.inc(ngramString);
                }
            }
        }
        return documentNgrams;
    }
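
    /**
     * Returns POS-tag ngrams over all sentences of the document. If useCanonical is set, the
     * coarse-grained POS class (the simple name of the POS annotation type) is used instead of
     * the original tag value.
     */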
    public static FrequencyDistribution<String> getDocumentPosNgrams(JCas jcas, int minN, int maxN,
            boolean useCanonical)
    {
        FrequencyDistribution<String> posNgrams = new FrequencyDistribution<String>();
        for (Sentence s : select(jcas, Sentence.class)) {
            List<String> postagstrings = new ArrayList<String>();
            for (POS p : JCasUtil.selectCovered(jcas, POS.class, s)) {
                if (useCanonical) {
                    postagstrings.add(p.getClass().getSimpleName());
                }
                else {
                    postagstrings.add(p.getPosValue());
                }
            }
            String[] posarray = postagstrings.toArray(new String[postagstrings.size()]);
            for (List<String> ngram : new NGramStringListIterable(posarray, minN, maxN)) {
                posNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
            }
        }
        return posNgrams;
    }
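
    /**
     * Returns POS-tag ngrams over the span of the given focus annotation: sentence-wise if the
     * annotation covers sentence annotations, otherwise over all POS annotations it covers.
     */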
    public static FrequencyDistribution<String> getDocumentPosNgrams(JCas jcas,
            Annotation focusAnnotation, int minN, int maxN, boolean useCanonical)
    {
        FrequencyDistribution<String> posNgrams = new FrequencyDistribution<String>();
        if (JCasUtil.selectCovered(jcas, Sentence.class, focusAnnotation).size() > 0) {
            for (Sentence s : selectCovered(jcas, Sentence.class, focusAnnotation)) {
                List<String> postagstrings = new ArrayList<String>();
                for (POS p : JCasUtil.selectCovered(jcas, POS.class, s)) {
                    if (useCanonical) {
                        postagstrings.add(p.getClass().getSimpleName());
                    }
                    else {
                        postagstrings.add(p.getPosValue());
                    }
                }
                String[] posarray = postagstrings.toArray(new String[postagstrings.size()]);
                for (List<String> ngram : new NGramStringListIterable(posarray, minN, maxN)) {
                    posNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
                }
            }
        }
        else {
            List<String> postagstrings = new ArrayList<String>();
            for (POS p : selectCovered(POS.class, focusAnnotation)) {
                if (useCanonical) {
                    postagstrings.add(p.getClass().getSimpleName());
                }
                else {
                    postagstrings.add(p.getPosValue());
                }
            }
            String[] posarray = postagstrings.toArray(new String[postagstrings.size()]);
            for (List<String> ngram : new NGramStringListIterable(posarray, minN, maxN)) {
                posNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
            }
        }
        return posNgrams;
    }
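
    /**
     * Returns phonetically encoded token ngrams over all sentences of the document. Tokens are
     * encoded with Soundex for English ("en") and with the Cologne phonetics algorithm for German
     * ("de"); any other document language causes a {@link TextClassificationException}.
     */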
    public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, int minN,
            int maxN)
        throws TextClassificationException
    {
        StringEncoder encoder;
        String languageCode = jcas.getDocumentLanguage();

        if (languageCode.equals("en")) {
            encoder = new Soundex();
        }
        else if (languageCode.equals("de")) {
            encoder = new ColognePhonetic();
        }
        else {
            throw new TextClassificationException("Language code '" + languageCode
                    + "' not supported by phonetic ngrams FE.");
        }

        FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>();
        for (Sentence s : select(jcas, Sentence.class)) {
            List<String> phoneticStrings = new ArrayList<String>();
            for (Token t : JCasUtil.selectCovered(jcas, Token.class, s)) {
                try {
                    phoneticStrings.add(encoder.encode(t.getCoveredText()));
                }
                catch (EncoderException e) {
                    throw new TextClassificationException(e);
                }
            }
            String[] array = phoneticStrings.toArray(new String[phoneticStrings.size()]);

            for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) {
                phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
            }
        }
        return phoneticNgrams;
    }
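
    /**
     * Returns character ngrams over the complete document text.
     */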
    public static FrequencyDistribution<String> getDocumentCharacterNgrams(JCas jcas,
            boolean lowerCaseNgrams, int minN, int maxN)
    {
        FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();

        for (String charNgram : new CharacterNGramStringIterable(jcas.getDocumentText(), minN,
                maxN)) {
            if (lowerCaseNgrams) {
                charNgram = charNgram.toLowerCase();
            }
            charNgrams.inc(charNgram);
        }

        return charNgrams;
    }

    /**
     * Creates a frequency distribution of character ngrams over the span of an annotation. The
     * boundary parameters allow adding a marker character at the beginning and end of the
     * annotation span, e.g. to mark the 'begin of sequence' or 'end of sequence' of a span.
     * Provide e.g. a whitespace character if these markers are not needed.
     */
    public static FrequencyDistribution<String> getAnnotationCharacterNgrams(
            Annotation focusAnnotation, boolean lowerCaseNgrams, int minN, int maxN,
            char boundaryBegin, char boundaryEnd)
    {
        FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();

        for (String charNgram : new CharacterNGramStringIterable(boundaryBegin
                + focusAnnotation.getCoveredText() + boundaryEnd, minN, maxN)) {
            if (lowerCaseNgrams) {
                charNgram = charNgram.toLowerCase();
            }
            charNgrams.inc(charNgram);
        }

        return charNgrams;
    }

    /**
     * Checks whether an ngram (represented by a list of tokens) passes the stopword filter. The
     * ngram is filtered out a) if filterPartialMatches=true and it contains any stopword, or b)
     * if filterPartialMatches=false and it consists entirely of stopwords.
     *
     * @param tokenList
     *            the list of tokens in a single ngram
     * @param stopwords
     *            the set of stopwords used for filtering
     * @param filterPartialMatches
     *            whether ngrams where only some tokens are stopwords should also be filtered.
     *            For example, "United States of America" would be filtered, as it contains the
     *            stopword "of".
     * @return true if the ngram passes the stopword filter, false otherwise
     */
    public static boolean passesNgramFilter(List<String> tokenList, Set<String> stopwords,
            boolean filterPartialMatches)
    {
        List<String> filteredList = new ArrayList<String>();
        for (String ngram : tokenList) {
            if (!stopwords.contains(ngram)) {
                filteredList.add(ngram);
            }
        }

        if (filterPartialMatches) {
            return filteredList.size() == tokenList.size();
        }
        else {
            return filteredList.size() != 0;
        }
    }
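
    /**
     * Returns token skip ngrams over all sentences of the document, i.e. ngrams in which up to
     * skipN intermediate tokens may be skipped, with the same lower-casing and stopword filtering
     * as {@link #getDocumentNgrams}.
     */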
    public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN,
            Set<String> stopwords)
    {
        FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
        for (Sentence s : select(jcas, Sentence.class)) {
            for (List<String> ngram : new SkipNgramStringListIterable(
                    toText(selectCovered(Token.class, s)), minN, maxN, skipN)) {
                if (lowerCaseNGrams) {
                    ngram = lower(ngram);
                }
                if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                    String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                    documentNgrams.inc(ngramString);
                }
            }
        }
        return documentNgrams;
    }
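
    /**
     * Returns character skip ngrams for each token of the document. Each token's characters are
     * wrapped in "^" and "$" boundary markers before the skip ngrams are built.
     */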
    public static FrequencyDistribution<String> getCharacterSkipNgrams(JCas jcas,
            boolean lowerCaseNGrams, int minN, int maxN, int skipN)
    {
        FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();
        for (Token t : select(jcas, Token.class)) {
            String tokenText = t.getCoveredText();
            // Surround the token's characters with begin/end markers before building the skip
            // ngrams. (The original split("")-based copy relied on the leading empty string that
            // pre-Java-8 split semantics produced and dropped the first character on Java 8+.)
            String[] chars = new String[tokenText.length() + 2];
            chars[0] = "^";
            for (int i = 0; i < tokenText.length(); i++) {
                chars[i + 1] = String.valueOf(tokenText.charAt(i));
            }
            chars[chars.length - 1] = "$";
            for (List<String> ngram : new SkipNgramStringListIterable(chars, minN, maxN, skipN)) {
                if (lowerCaseNGrams) {
                    ngram = lower(ngram);
                }
                String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                charNgrams.inc(ngramString);
            }
        }
        return charNgrams;
    }
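
    /**
     * Returns a copy of the given ngram with all tokens lower-cased.
     */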
    public static List<String> lower(List<String> ngram)
    {
        List<String> newNgram = new ArrayList<String>();
        for (String token : ngram) {
            newNgram.add(token.toLowerCase());
        }
        return newNgram;
    }
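
    /**
     * Extracts the feature path values (e.g. token, lemma, or stem strings) of all annotations of
     * the given type that lie within the sentence boundaries.
     */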
    public static List<String> valuesToText(JCas jcas, Sentence s, String annotationClassName)
        throws TextClassificationException
    {
        List<String> texts = new ArrayList<String>();

        try {
            for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(jcas.getCas(),
                    annotationClassName)) {
                if (entry.getKey().getBegin() >= s.getBegin()
                        && entry.getKey().getEnd() <= s.getEnd()) {
                    texts.add(entry.getValue());
                }
            }
        }
        catch (FeaturePathException e) {
            throw new TextClassificationException(e);
        }
        return texts;
    }
}