com.optimaize.langdetect.ngram.NgramExtractor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of language-detector Show documentation
Show all versions of language-detector Show documentation
Language Detection Library for Java.
package com.optimaize.langdetect.ngram;
import com.google.common.collect.ImmutableList;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.util.*;
/**
* Class for extracting n-grams out of a text.
*
* @author Fabian Kessler
*/
public class NgramExtractor {
/** Gram lengths to extract, in the order they were supplied; the constructor rejects an empty list. */
@NotNull
private final List<Integer> gramLengths = new ArrayList<>(4);
/** Optional n-gram filter; {@code null} when no filter was configured. */
@Nullable
private final NgramFilter filter;
/** Optional character added to the left and right of the text; {@code null} when no padding was configured. */
@Nullable
private Character textPadding;
/**
 * Creates an extractor for a single gram length, without filter and without text padding.
 *
 * @param gramLength the length of the n-grams to extract.
 * @return a new extractor.
 */
public static NgramExtractor gramLength(int gramLength) {
    final List<Integer> singleLength = ImmutableList.of(gramLength);
    return new NgramExtractor(singleLength, null, null);
}
/**
 * Creates an extractor for several gram lengths, without filter and without text padding.
 *
 * @param gramLength the lengths of the n-grams to extract; at least one is required.
 * @return a new extractor.
 * @throws IllegalArgumentException if no gram length is given.
 */
public static NgramExtractor gramLengths(Integer... gramLength) {
    final List<Integer> lengths = Arrays.asList(gramLength);
    return new NgramExtractor(lengths, null, null);
}
/**
 * Returns a new extractor that uses the given filter and keeps this extractor's gram lengths
 * and text padding. This instance is left unchanged.
 *
 * @param filter the filter to use.
 * @return a new extractor.
 */
public NgramExtractor filter(NgramFilter filter) {
    final Character currentPadding = this.textPadding;
    return new NgramExtractor(gramLengths, filter, currentPadding);
}
/**
 * Returns a new extractor that pads the text with the given character on the left and on the
 * right, so that border grams are produced. This instance is left unchanged.
 *
 * Example: with a space ' ' as textPadding, the text input "foo" becomes " foo ", which makes sure
 * that n-grams like " f" are created.
 *
 * If the text already has that character in that position (eg it starts with it), it is not added there.
 *
 * @param textPadding for example a space ' '.
 * @return a new extractor with the same gram lengths and filter as this one.
 */
public NgramExtractor textPadding(char textPadding) {
    final NgramFilter currentFilter = this.filter;
    return new NgramExtractor(gramLengths, currentFilter, textPadding);
}
/**
 * Creates an extractor. Private: instances are obtained through the static factory methods and
 * the fluent {@code filter}/{@code textPadding} methods.
 *
 * @param gramLengths the gram lengths to extract; the values are copied into this instance.
 * @param filter the filter to use, or {@code null} for none.
 * @param textPadding the padding character, or {@code null} for none.
 * @throws IllegalArgumentException if {@code gramLengths} is empty.
 */
private NgramExtractor(@NotNull List<Integer> gramLengths, @Nullable NgramFilter filter, @Nullable Character textPadding) {
    if (gramLengths.isEmpty()) {
        throw new IllegalArgumentException("At least one gram length is required");
    }
    // Copy instead of keeping the reference, so later changes to the caller's list have no effect here.
    this.gramLengths.addAll(gramLengths);
    this.filter = filter;
    this.textPadding = textPadding;
}
/**
 * Returns the gram lengths this extractor was configured with.
 *
 * @return an unmodifiable view of the gram lengths.
 */
public List<Integer> getGramLengths() {
    return Collections.unmodifiableList(gramLengths);
}
/**
* Creates the n-grams for a given text in the order they occur.
*
* Example: with a gram length of 2 (and no padding or filter), extractGrams("Foo bar") => [Fo,oo,o , b,ba,ar]
*
* @param text the text to extract the n-grams from.
* @return The grams, empty if the input was empty or if the text is too short for every configured gram length.
*/
@NotNull
public List extractGrams(@NotNull CharSequence text) {
text = applyPadding(text);
int len = text.length();
//the actual size will be totalNumGrams or less (filter)
int totalNumGrams = 0;
for (Integer gramLength : gramLengths) {
int num = len - (gramLength - 1);
if (num >= 1) { //yes can be negative
totalNumGrams += num;
}
}
if (totalNumGrams <= 0) {
return Collections.emptyList();
}
List grams = new ArrayList<>(totalNumGrams);
for (Integer gramLength : gramLengths) {
int numGrams = len - (gramLength -1);
if (numGrams >= 1) { //yes can be negative
for (int pos=0; pos extractCountedGrams(@NotNull CharSequence text) {
text = applyPadding(text);
int len = text.length();
int initialCapacity = 0;
for (Integer gramLength : gramLengths) {
initialCapacity += guessNumDistinctiveGrams(len, gramLength);
}
Map grams = new LinkedHashMap<>(initialCapacity);
for (Integer gramLength : gramLengths) {
_extractCounted(text, gramLength, len, grams);
}
return grams;
}
private void _extractCounted(CharSequence text, int gramLength, int len, Map grams) {
int endPos = len - (gramLength -1);
for (int pos=0; pos
© 2015 - 2025 Weber Informatics LLC | Privacy Policy