com.yahoo.language.process.GramSplitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of linguistics Show documentation
There is a newer version: 8.441.21
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;


import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

/**
 * A class which splits consecutive word character sequences into overlapping character n-grams.
 * For example "en gul bille sang" split into 2-grams becomes
 * "en gu ul bi il ll le sa an ng", and split into 3-grams becomes "en gul bil ill lle san ang".
 * 
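 * This class is thread safe. The iterators it returns hold mutable state and are not thread safe;
 * each iterator should be used from a single thread.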
 *
 * @author bratseth
 */
public class GramSplitter {

    private final CharacterClasses characterClasses;

    /**
     * Creates a gram splitter.
     *
     * @param characterClasses the character classifier which is passed on to every iterator created by
     *                         {@link #split}; it is stored as-is, without a null check
     */
    public GramSplitter(CharacterClasses characterClasses) {
        this.characterClasses = characterClasses;
    }

    /**
     * Creates an iterator over the grams of size n found in the input. Each gram is represented as a
     * [start index, length] pair referring back into the input string.
     *
     * The grams are located while iterating rather than being collected into a list up front,
     * which keeps the memory use low when splitting large strings.
     *
     * @param input the string to split, must not be null
     * @param n     the gram size, must be at least 1
     * @return a read only iterator over the grams of the input
     * @throws NullPointerException     if input is null
     * @throws IllegalArgumentException if n is less than 1
     */
    public GramSplitterIterator split(String input, int n) {
        // The null check comes first so that a null input wins over an invalid gram size
        if (input == null) {
            throw new NullPointerException("input cannot be null");
        }
        if (n <= 0) {
            throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n);
        }
        return new GramSplitterIterator(input, n, characterClasses);
    }

    /**
     * A read only iterator over the grams of an input string.
     * Grams are located one at a time as the iterator is advanced.
     * Instances hold mutable, unsynchronized iteration state.
     */
    public static class GramSplitterIterator implements Iterator<Gram> {

        private final CharacterClasses characterClasses;

        /** Text to split */
        private final UnicodeString input;

        /** Gram size in code points */
        private final int n;

        /** Current position in the string */
        private int i = 0;

        /** Whether the last thing that happened was being on a separator (including the start of the string) */
        private boolean isFirstAfterSeparator = true;

        /** The next gram or null if not determined yet */
        private Gram nextGram = null;

        /**
         * Creates an iterator positioned at the start of the input.
         *
         * @param input            the text to split into grams
         * @param n                the gram size in code points
         * @param characterClasses decides which code points are letters, digits and symbols
         */
        public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) {
            this.input = new UnicodeString(input);
            this.n = n;
            this.characterClasses = characterClasses;
        }

        /** Returns whether there is another gram, locating and caching it if not already done. */
        @Override
        public boolean hasNext() {
            if (nextGram != null) return true;
            nextGram = findNext();
            return nextGram != null;
        }

        /**
         * Returns the next gram and advances past it.
         *
         * @throws NoSuchElementException if there are no more grams
         */
        @Override
        public Gram next() {
            Gram currentGram = nextGram;
            if (currentGram == null)
                currentGram = findNext();
            if (currentGram == null)
                throw new NoSuchElementException("No next gram at position " + i);
            nextGram = null;
            return currentGram;
        }

        /**
         * Advances to and returns the next gram, or null if the rest of the input contains none.
         *
         * A gram is emitted when it has the full size n, when it is the first gram after a separator
         * (so that tokens shorter than n still produce one gram), or when it is a single symbol.
         * Any other too-short candidate is the tail of a token which is already covered by the
         * previous full length gram, so it is skipped.
         */
        private Gram findNext() {
            while (true) {
                // Skip to next indexable character
                while (i < input.length() && !isIndexable(input.codePointAt(i))) {
                    i = input.next(i);
                    isFirstAfterSeparator = true;
                }
                if (i >= input.length()) return null; // no indexable characters

                // Take up to n code points, then cut the candidate where the token ends
                UnicodeString window = input.substring(i, n);
                UnicodeString gram = new UnicodeString(window.toString().substring(0, tokenEnd(window)));
                int gramSize = gram.codePointCount();

                boolean emit = gramSize == n // normal case: got a full length gram
                               || isFirstAfterSeparator
                               || (gramSize == 1 && characterClasses.isSymbol(gram.codePointAt(0)));
                if (emit) {
                    Gram g = new Gram(i, gramSize);
                    i = input.next(i); // slide one code point forward to get overlapping grams
                    isFirstAfterSeparator = false;
                    return g;
                }

                // Too short and not worth a gram of its own: jump past it and look again
                i = input.skip(gramSize, i);
                isFirstAfterSeparator = true;
            }
        }

        /** Returns whether the given code point can be part of a gram, i.e is a letter, digit or symbol. */
        private boolean isIndexable(int codepoint) {
            if (characterClasses.isLetterOrDigit(codepoint)) return true;
            if (characterClasses.isSymbol(codepoint)) return true;
            return false;
        }

        /** Given a string s starting by an indexable character, return the position where that token should end. */
        private int tokenEnd(UnicodeString s) {
            if (characterClasses.isSymbol(s.codePointAt(0)))
                return s.next(0); // symbols have length 1

            int i = 0;
            for (; i < s.length(); i = s.next(i)) {
                if ( ! characterClasses.isLetterOrDigit(s.codePointAt(i)))
                    return i;
            }
            return i;
        }

        /**
         * Not supported.
         *
         * @throws UnsupportedOperationException always, as this iterator is read only
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException("This iterator is read only");
        }

        /**
         * Convenience list which splits the remaining items in this iterator into a list of gram strings.
         * This consumes the iterator.
         *
         * @return an immutable list of extracted grams
         */
        public List<String> toExtractedList() {
            List<String> gramList = new ArrayList<>();
            while (hasNext())
                gramList.add(next().extractFrom(input));
            return Collections.unmodifiableList(gramList);
        }
    }

    /**
     * An immutable gram position: the index at which the gram starts in the input string,
     * and the number of code points it spans.
     */
    public static final class Gram {

        private final int start;
        private final int codePointCount;

        public Gram(int start, int codePointCount) {
            this.start = start;
            this.codePointCount = codePointCount;
        }

        /** Returns the index into the input string at which this gram starts */
        public int getStart() {
            return start;
        }

        /** Returns the length of this gram, counted in code points */
        public int getCodePointCount() {
            return codePointCount;
        }

        /** Returns the text this gram refers to in the given input string */
        public String extractFrom(String input) {
            UnicodeString wrapped = new UnicodeString(input);
            return extractFrom(wrapped);
        }

        /** Returns the text this gram refers to in the given input string */
        public String extractFrom(UnicodeString input) {
            UnicodeString text = input.substring(start, codePointCount);
            return text.toString();
        }

        /** Two grams are equal when they have the same start index and code point count */
        @Override
        public boolean equals(Object o) {
            return o instanceof Gram other
                   && start == other.start
                   && codePointCount == other.codePointCount;
        }

        @Override
        public int hashCode() {
            return 31 * start + codePointCount;
        }

    }

    /**
     * A string wrapper with some convenience methods for dealing with UTF-16 surrogate pairs
     * (a crime against humanity for which we'll be negatively impacted for at least the next million years).
     *
     * Indexes taken and returned by these methods are char positions in the wrapped string,
     * while counts are in code points, unless stated otherwise.
     */
    private static class UnicodeString {

        private final String s;

        public UnicodeString(String s) {
            this.s = s;
        }

        /**
         * Substring in code point space.
         *
         * @param start      the char position to start at
         * @param codePoints the number of code points to include; fewer are returned if the string
         *                   ends before that many are found
         * @return the substring starting at start and spanning at most the given number of code points
         */
        public UnicodeString substring(int start, int codePoints) {
            // A code point takes at most two chars, so when twice the requested count fits in the
            // remainder there is no need to count what is actually there. The product is computed
            // as a long: in int arithmetic it overflows for very large counts, which wrongly selected
            // this fast path and made offsetByCodePoints throw.
            int remainingChars = s.length() - start;
            int cps = 2L * codePoints <= remainingChars ? codePoints
                                                        : Math.min(codePoints, s.codePointCount(start, s.length()));
            return new UnicodeString(s.substring(start, s.offsetByCodePoints(start, cps)));
        }

        /** Returns the position count code points after start (which may be past the end of the string) */
        public int skip(int codePointCount, int start) {
            int index = start;
            for (int i = 0; i < codePointCount; i++) {
                index = next(index);
                if (index > s.length()) break;
            }
            return index;
        }

        /** Returns the index of the next code point after start (which may be past the end of the string) */
        public int next(int index) {
            int next = index + 1;
            // Step over the following low surrogate only when it completes a pair started at index.
            // An unpaired low surrogate is a code point of its own according to codePointAt and
            // offsetByCodePoints, so it must not be swallowed into the preceding character.
            if (next < s.length()
                && Character.isLowSurrogate(s.charAt(next))
                && Character.isHighSurrogate(s.charAt(index)))
                next++;
            return next;
        }

        /** Returns the number of positions (not code points) in this */
        public int length() { return s.length(); }

        /** Returns the number of code points in this */
        public int codePointCount() { return s.codePointCount(0, s.length()); }

        /** Returns the code point starting at the given char position */
        public int codePointAt(int index) {
            return s.codePointAt(index);
        }

        @Override
        public String toString() { return s; }

    }

}