All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.optimaize.langdetect.ngram.NgramExtractor Maven / Gradle / Ivy

There is a newer version: 0.6
Show newest version
package com.optimaize.langdetect.ngram;

import com.google.common.collect.ImmutableList;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

import java.util.*;

/**
 * Class for extracting n-grams out of a text.
 *
 * @author Fabian Kessler
 */
public class NgramExtractor {

    /** The n-gram lengths to extract; filled once by the constructor, which rejects an empty list. */
    @NotNull
    private final List<Integer> gramLengths = new ArrayList<>(4);
    /** Optional n-gram filter; {@code null} when none was configured. */
    @Nullable
    private final NgramFilter filter;
    /** Optional padding character for the text borders; {@code null} when none was configured. */
    @Nullable
    private Character textPadding;

    /**
     * Creates an extractor that works with exactly one n-gram length.
     *
     * @param gramLength the single n-gram length to extract.
     * @return a new extractor with that length, no filter and no text padding.
     */
    public static NgramExtractor gramLength(int gramLength) {
        final List<Integer> singleLength = ImmutableList.of(gramLength);
        return new NgramExtractor(singleLength, null, null);
    }
    /**
     * Creates an extractor that works with several n-gram lengths at once.
     *
     * @param gramLength the n-gram lengths to extract; at least one must be given.
     * @return a new extractor with those lengths, no filter and no text padding.
     * @throws IllegalArgumentException if no length is passed (raised by the constructor).
     */
    public static NgramExtractor gramLengths(Integer... gramLength) {
        final List<Integer> lengths = Arrays.asList(gramLength);
        return new NgramExtractor(lengths, null, null);
    }

    /**
     * Returns a copy of this extractor that uses the given filter.
     * The gram lengths and the text padding are carried over; this instance is not modified.
     *
     * @param filter the filter for the new extractor.
     * @return a new extractor instance.
     */
    public NgramExtractor filter(NgramFilter filter) {
        final Character currentPadding = this.textPadding;
        return new NgramExtractor(this.gramLengths, filter, currentPadding);
    }

    /**
     * To ensure having border grams, this character is added to the left and right of the text.
     *
     * 

Example: when textPadding is a space ' ' then a text input "foo" becomes " foo ", ensuring that n-grams like " f" * are created.

* *

If the text already has such a character in that position (eg starts with), it is not added there.

* * @param textPadding for example a space ' '. */ public NgramExtractor textPadding(char textPadding) { return new NgramExtractor(this.gramLengths, this.filter, textPadding); } private NgramExtractor(@NotNull List gramLengths, @Nullable NgramFilter filter, @Nullable Character textPadding) { if (gramLengths.isEmpty()) throw new IllegalArgumentException(); this.gramLengths.addAll(gramLengths); this.filter = filter; this.textPadding = textPadding; } public List getGramLengths() { return Collections.unmodifiableList(gramLengths); } /** * Creates the n-grams for a given text in the order they occur. * *

 * <p>Example: with a gram length of 2, extractGrams("Foo bar") => [Fo,oo,o , b,ba,ar]</p>

* * @param text * @return The grams, empty if the input was empty or if none for that gramLength fits. */ @NotNull public List extractGrams(@NotNull CharSequence text) { text = applyPadding(text); int len = text.length(); //the actual size will be totalNumGrams or less (filter) int totalNumGrams = 0; for (Integer gramLength : gramLengths) { int num = len - (gramLength - 1); if (num >= 1) { //yes can be negative totalNumGrams += num; } } if (totalNumGrams <= 0) { return Collections.emptyList(); } List grams = new ArrayList<>(totalNumGrams); for (Integer gramLength : gramLengths) { int numGrams = len - (gramLength -1); if (numGrams >= 1) { //yes can be negative for (int pos=0; pos extractCountedGrams(@NotNull CharSequence text) { text = applyPadding(text); int len = text.length(); int initialCapacity = 0; for (Integer gramLength : gramLengths) { initialCapacity += guessNumDistinctiveGrams(len, gramLength); } Map grams = new LinkedHashMap<>(initialCapacity); for (Integer gramLength : gramLengths) { _extractCounted(text, gramLength, len, grams); } return grams; } private void _extractCounted(CharSequence text, int gramLength, int len, Map grams) { int endPos = len - (gramLength -1); for (int pos=0; pos




© 2015 - 2025 Weber Informatics LLC | Privacy Policy