
/*******************************************************************************
* Copyright 2015
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.tc.features.ngram.util;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import static org.apache.uima.fit.util.JCasUtil.toText;
import static org.dkpro.tc.core.Constants.NGRAM_GLUE;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
import org.apache.commons.codec.language.ColognePhonetic;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.CharacterNGramStringIterable;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable;
import org.dkpro.tc.api.exception.TextClassificationException;
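
/**
 * Utility methods for extracting token, POS, phonetic, character, and skip ngram frequency
 * distributions from a {@link JCas}.
 *
 * <p>
 * A minimal usage sketch, assuming a JCas that has already been segmented into {@link Sentence}
 * and {@link Token} annotations (the pipeline producing it is not shown here):
 * </p>
 *
 * <pre>{@code
 * Set<String> stopwords = Collections.emptySet();
 * FrequencyDistribution<String> fd = NGramUtils.getDocumentNgrams(
 *         jcas, true, false, 1, 3, stopwords);
 * for (String ngram : fd.getKeys()) {
 *     System.out.println(ngram + "\t" + fd.getCount(ngram));
 * }
 * }</pre>
 */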
public class NGramUtils
{
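    /**
     * Convenience method to return ngrams over the tokens covered by the given annotation when
     * there is no stopword list.
     */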
    public static FrequencyDistribution<String> getAnnotationNgrams(JCas jcas,
            Annotation focusAnnotation, boolean lowerCaseNGrams, boolean filterPartialMatches,
            int minN, int maxN)
    {
        Set<String> empty = Collections.emptySet();
        return getAnnotationNgrams(jcas, focusAnnotation, lowerCaseNGrams, filterPartialMatches,
                minN, maxN, empty);
    }
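
    /**
     * Returns token ngrams over the span of the given focus annotation. If the annotation covers
     * sentence annotations, the ngrams are extracted sentence-wise; otherwise they are built from
     * all tokens covered by the annotation.
     */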
    public static FrequencyDistribution<String> getAnnotationNgrams(JCas jcas,
            Annotation focusAnnotation, boolean lowerCaseNGrams, boolean filterPartialMatches,
            int minN, int maxN, Set<String> stopwords)
    {
        FrequencyDistribution<String> annoNgrams = new FrequencyDistribution<String>();

        // If the focusAnnotation contains sentence annotations, extract the ngrams sentence-wise;
        // if not, extract them from all tokens in the focusAnnotation.
        if (JCasUtil.selectCovered(jcas, Sentence.class, focusAnnotation).size() > 0) {
            for (Sentence s : selectCovered(jcas, Sentence.class, focusAnnotation)) {
                for (List<String> ngram : new NGramStringListIterable(
                        toText(selectCovered(Token.class, s)), minN, maxN)) {
                    if (lowerCaseNGrams) {
                        ngram = lower(ngram);
                    }
                    if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                        String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                        annoNgrams.inc(ngramString);
                    }
                }
            }
        }
        // FIXME the focus annotation branch doesn't make much sense
        else {
            for (List<String> ngram : new NGramStringListIterable(
                    toText(selectCovered(Token.class, focusAnnotation)), minN, maxN)) {
                if (lowerCaseNGrams) {
                    ngram = lower(ngram);
                }
                if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                    String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                    annoNgrams.inc(ngramString);
                }
            }
        }
        return annoNgrams;
    }

    /**
     * Convenience method to return document ngrams when there is no stopword list.
     *
     * @param jcas
     *            the JCas to extract ngrams from
     * @param lowerCaseNGrams
     *            whether the ngrams should be lower-cased
     * @param filterPartialMatches
     *            whether ngrams containing stopwords should be filtered entirely
     * @param minN
     *            minimum ngram length
     * @param maxN
     *            maximum ngram length
     * @return a frequency distribution of the document's token ngrams
     * @throws TextClassificationException
     *             if the ngrams cannot be extracted
     */
    public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN)
        throws TextClassificationException
    {
        Set<String> empty = Collections.emptySet();
        return getDocumentNgrams(jcas, lowerCaseNGrams, filterPartialMatches, minN, maxN, empty);
    }

    /**
     * Convenience method to return document ngrams over {@link Token}s.
     *
     * @param jcas
     *            the JCas to extract ngrams from
     * @param lowerCaseNGrams
     *            whether the ngrams should be lower-cased
     * @param filterPartialMatches
     *            whether ngrams containing stopwords should be filtered entirely
     * @param minN
     *            minimum ngram length
     * @param maxN
     *            maximum ngram length
     * @param stopwords
     *            the set of stopwords used for filtering
     * @return a frequency distribution of the document's token ngrams
     * @throws TextClassificationException
     *             if the ngrams cannot be extracted
     */
    public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN,
            Set<String> stopwords)
        throws TextClassificationException
    {
        return getDocumentNgrams(jcas, lowerCaseNGrams, filterPartialMatches, minN, maxN,
                stopwords, Token.class);
    }

    /**
     * Returns document ngrams over any annotation type that extends {@link Annotation}. Intended
     * for use with Lemma, Stem, etc.
     *
     * @param jcas
     *            the JCas to extract ngrams from
     * @param lowerCaseNGrams
     *            whether the ngrams should be lower-cased
     * @param filterPartialMatches
     *            whether ngrams containing stopwords should be filtered entirely
     * @param minN
     *            minimum ngram length
     * @param maxN
     *            maximum ngram length
     * @param stopwords
     *            the set of stopwords used for filtering
     * @param annotationClass
     *            annotation type of the ngram
     * @return a frequency distribution of the document's ngrams over the given annotation type
     * @throws TextClassificationException
     *             if the feature path values cannot be extracted
     */
    public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN,
            Set<String> stopwords, Class<? extends Annotation> annotationClass)
        throws TextClassificationException
    {
        FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
        for (Sentence s : select(jcas, Sentence.class)) {
            List<String> strings = valuesToText(jcas, s, annotationClass.getName());
            for (List<String> ngram : new NGramStringListIterable(strings, minN, maxN)) {
                if (lowerCaseNGrams) {
                    ngram = lower(ngram);
                }
                if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                    String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                    documentNgrams.inc(ngramString);
                }
            }
        }
        return documentNgrams;
    }
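
    /**
     * Returns POS-tag ngrams over all sentences of the document. If useCanonical is set, the
     * coarse-grained POS class (the simple name of the POS annotation type) is used instead of
     * the original tag value.
     */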
    public static FrequencyDistribution<String> getDocumentPosNgrams(JCas jcas, int minN, int maxN,
            boolean useCanonical)
    {
        FrequencyDistribution<String> posNgrams = new FrequencyDistribution<String>();
        for (Sentence s : select(jcas, Sentence.class)) {
            List<String> postagstrings = new ArrayList<String>();
            for (POS p : JCasUtil.selectCovered(jcas, POS.class, s)) {
                if (useCanonical) {
                    postagstrings.add(p.getClass().getSimpleName());
                }
                else {
                    postagstrings.add(p.getPosValue());
                }
            }
            String[] posarray = postagstrings.toArray(new String[postagstrings.size()]);
            for (List<String> ngram : new NGramStringListIterable(posarray, minN, maxN)) {
                posNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
            }
        }
        return posNgrams;
    }
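
    /**
     * Returns POS-tag ngrams over the span of the given focus annotation: sentence-wise if the
     * annotation covers sentence annotations, otherwise over all POS annotations it covers.
     */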
    public static FrequencyDistribution<String> getDocumentPosNgrams(JCas jcas,
            Annotation focusAnnotation, int minN, int maxN, boolean useCanonical)
    {
        FrequencyDistribution<String> posNgrams = new FrequencyDistribution<String>();
        if (JCasUtil.selectCovered(jcas, Sentence.class, focusAnnotation).size() > 0) {
            for (Sentence s : selectCovered(jcas, Sentence.class, focusAnnotation)) {
                List<String> postagstrings = new ArrayList<String>();
                for (POS p : JCasUtil.selectCovered(jcas, POS.class, s)) {
                    if (useCanonical) {
                        postagstrings.add(p.getClass().getSimpleName());
                    }
                    else {
                        postagstrings.add(p.getPosValue());
                    }
                }
                String[] posarray = postagstrings.toArray(new String[postagstrings.size()]);
                for (List<String> ngram : new NGramStringListIterable(posarray, minN, maxN)) {
                    posNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
                }
            }
        }
        else {
            List<String> postagstrings = new ArrayList<String>();
            for (POS p : selectCovered(POS.class, focusAnnotation)) {
                if (useCanonical) {
                    postagstrings.add(p.getClass().getSimpleName());
                }
                else {
                    postagstrings.add(p.getPosValue());
                }
            }
            String[] posarray = postagstrings.toArray(new String[postagstrings.size()]);
            for (List<String> ngram : new NGramStringListIterable(posarray, minN, maxN)) {
                posNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
            }
        }
        return posNgrams;
    }
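
    /**
     * Returns phonetically encoded token ngrams over all sentences of the document. Tokens are
     * encoded with Soundex for English ("en") and with the Cologne phonetics algorithm for German
     * ("de"); any other document language causes a {@link TextClassificationException}.
     */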
    public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, int minN,
            int maxN)
        throws TextClassificationException
    {
        StringEncoder encoder;
        String languageCode = jcas.getDocumentLanguage();

        if (languageCode.equals("en")) {
            encoder = new Soundex();
        }
        else if (languageCode.equals("de")) {
            encoder = new ColognePhonetic();
        }
        else {
            throw new TextClassificationException("Language code '" + languageCode
                    + "' not supported by phonetic ngrams FE.");
        }

        FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>();
        for (Sentence s : select(jcas, Sentence.class)) {
            List<String> phoneticStrings = new ArrayList<String>();
            for (Token t : JCasUtil.selectCovered(jcas, Token.class, s)) {
                try {
                    phoneticStrings.add(encoder.encode(t.getCoveredText()));
                }
                catch (EncoderException e) {
                    throw new TextClassificationException(e);
                }
            }
            String[] array = phoneticStrings.toArray(new String[phoneticStrings.size()]);

            for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) {
                phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
            }
        }
        return phoneticNgrams;
    }
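
    /**
     * Returns character ngrams over the complete document text.
     */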
    public static FrequencyDistribution<String> getDocumentCharacterNgrams(JCas jcas,
            boolean lowerCaseNgrams, int minN, int maxN)
    {
        FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();

        for (String charNgram : new CharacterNGramStringIterable(jcas.getDocumentText(), minN,
                maxN)) {
            if (lowerCaseNgrams) {
                charNgram = charNgram.toLowerCase();
            }
            charNgrams.inc(charNgram);
        }

        return charNgrams;
    }

    /**
     * Creates a frequency distribution of character ngrams over the span of an annotation. The
     * boundary parameters allow adding a marker character at the beginning and end of the
     * annotation span, e.g. to mark the 'begin of sequence' or 'end of sequence' of a span.
     * Provide e.g. a whitespace character if these markers are not needed.
     */
    public static FrequencyDistribution<String> getAnnotationCharacterNgrams(
            Annotation focusAnnotation, boolean lowerCaseNgrams, int minN, int maxN,
            char boundaryBegin, char boundaryEnd)
    {
        FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();

        for (String charNgram : new CharacterNGramStringIterable(boundaryBegin
                + focusAnnotation.getCoveredText() + boundaryEnd, minN, maxN)) {
            if (lowerCaseNgrams) {
                charNgram = charNgram.toLowerCase();
            }
            charNgrams.inc(charNgram);
        }

        return charNgrams;
    }

    /**
     * Checks whether an ngram (represented by a list of tokens) passes the stopword filter. The
     * ngram is filtered out a) if filterPartialMatches=true and it contains any stopword, or b)
     * if filterPartialMatches=false and it consists entirely of stopwords.
     *
     * @param tokenList
     *            the list of tokens in a single ngram
     * @param stopwords
     *            the set of stopwords used for filtering
     * @param filterPartialMatches
     *            whether ngrams where only some tokens are stopwords should also be filtered.
     *            For example, "United States of America" would be filtered, as it contains the
     *            stopword "of".
     * @return true if the ngram passes the stopword filter, false otherwise
     */
    public static boolean passesNgramFilter(List<String> tokenList, Set<String> stopwords,
            boolean filterPartialMatches)
    {
        List<String> filteredList = new ArrayList<String>();
        for (String ngram : tokenList) {
            if (!stopwords.contains(ngram)) {
                filteredList.add(ngram);
            }
        }

        if (filterPartialMatches) {
            return filteredList.size() == tokenList.size();
        }
        else {
            return filteredList.size() != 0;
        }
    }
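
    /**
     * Returns token skip ngrams over all sentences of the document, i.e. ngrams in which up to
     * skipN intermediate tokens may be skipped, with the same lower-casing and stopword filtering
     * as {@link #getDocumentNgrams}.
     */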
    public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN,
            Set<String> stopwords)
    {
        FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
        for (Sentence s : select(jcas, Sentence.class)) {
            for (List<String> ngram : new SkipNgramStringListIterable(
                    toText(selectCovered(Token.class, s)), minN, maxN, skipN)) {
                if (lowerCaseNGrams) {
                    ngram = lower(ngram);
                }
                if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                    String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                    documentNgrams.inc(ngramString);
                }
            }
        }
        return documentNgrams;
    }
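
    /**
     * Returns character skip ngrams for each token of the document. Each token's characters are
     * wrapped in "^" and "$" boundary markers before the skip ngrams are built.
     */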
    public static FrequencyDistribution<String> getCharacterSkipNgrams(JCas jcas,
            boolean lowerCaseNGrams, int minN, int maxN, int skipN)
    {
        FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();
        for (Token t : select(jcas, Token.class)) {
            String tokenText = t.getCoveredText();
            // Surround the token's characters with begin/end markers before building the skip
            // ngrams. (The original split("")-based copy relied on the leading empty string that
            // pre-Java-8 split semantics produced and dropped the first character on Java 8+.)
            String[] chars = new String[tokenText.length() + 2];
            chars[0] = "^";
            for (int i = 0; i < tokenText.length(); i++) {
                chars[i + 1] = String.valueOf(tokenText.charAt(i));
            }
            chars[chars.length - 1] = "$";
            for (List<String> ngram : new SkipNgramStringListIterable(chars, minN, maxN, skipN)) {
                if (lowerCaseNGrams) {
                    ngram = lower(ngram);
                }
                String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
                charNgrams.inc(ngramString);
            }
        }
        return charNgrams;
    }
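
    /**
     * Returns a copy of the given ngram with all tokens lower-cased.
     */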
    public static List<String> lower(List<String> ngram)
    {
        List<String> newNgram = new ArrayList<String>();
        for (String token : ngram) {
            newNgram.add(token.toLowerCase());
        }
        return newNgram;
    }
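
    /**
     * Extracts the feature path values (e.g. token, lemma, or stem strings) of all annotations of
     * the given type that lie within the sentence boundaries.
     */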
    public static List<String> valuesToText(JCas jcas, Sentence s, String annotationClassName)
        throws TextClassificationException
    {
        List<String> texts = new ArrayList<String>();

        try {
            for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(jcas.getCas(),
                    annotationClassName)) {
                if (entry.getKey().getBegin() >= s.getBegin()
                        && entry.getKey().getEnd() <= s.getEnd()) {
                    texts.add(entry.getValue());
                }
            }
        }
        catch (FeaturePathException e) {
            throw new TextClassificationException(e);
        }
        return texts;
    }
}