All downloads are free. The search and download features use the official Maven repository.

com.yahoo.language.process.SpecialTokens Maven / Gradle / Ivy

There is a newer version: 8.441.21
Show newest version
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

import static com.yahoo.language.LinguisticsCase.toLowerCase;

/**
 * An immutable list of special tokens - strings which should override the normal tokenizer semantics
 * and be tokenized into a single token. Special tokens are case insensitive.
 *
 * @author bratseth
 */
public class SpecialTokens {

    private static final SpecialTokens empty = new SpecialTokens("(empty)", List.of());

    private final String name;

    /** The length of the longest token in this list, or 0 if the list is empty */
    private final int maximumLength;

    /** The tokens of this list, sorted by decreasing token length */
    private final List<Token> tokens;

    /** Immutable map from each token to its replacement */
    private final Map<String, String> tokenMap;

    /**
     * Creates a named list of special tokens.
     *
     * @param name the name of this special tokens list
     * @param tokens the tokens of this list; the given list is copied, not retained
     * @throws IllegalArgumentException if the length of a token changes when it is lower or upper cased
     * @throws IllegalStateException if two of the given tokens have the same token string
     *         (duplicate keys are rejected by {@code Collectors.toUnmodifiableMap})
     */
    public SpecialTokens(String name, List<Token> tokens) {
        // Validate eagerly: a stream().peek(...) without a terminal operation is never executed,
        // which would silently skip validation
        tokens.forEach(Token::validate);

        // Sort the longest tokens first (see Token.compareTo) such that tokenize returns the
        // longest special token matching at the start of the input
        List<Token> sortedTokens = new ArrayList<>(tokens);
        Collections.sort(sortedTokens);

        this.name = name;
        this.maximumLength = sortedTokens.stream().mapToInt(token -> token.token().length()).max().orElse(0);
        this.tokens = List.copyOf(sortedTokens);
        this.tokenMap = sortedTokens.stream().collect(Collectors.toUnmodifiableMap(Token::token, Token::replacement));
    }

    /** Returns the name of this special tokens list */
    public String name() {
        return name;
    }

    /**
     * Returns the tokens of this as an immutable map from token to replacement.
     * Tokens which do not have a replacement token map to themselves.
     */
    public Map<String, String> asMap() { return tokenMap; }

    /**
     * Returns the special token starting at the start of the given string, or null if no
     * special token starts at this string. Tokens are tried in order of decreasing length,
     * so the longest matching special token is returned.
     *
     * @param string the string to search for a special token at the start position
     * @param substring true to allow the special token to be followed by a character which does not
     *        mark the end of a token
     * @return the matching special token, or null if there is none
     */
    public Token tokenize(String string, boolean substring) {
        // XXX detonator pattern token.length may be != the length of the
        // matching data in string, ref caseIndependentLength(String)

        // Only the first maximumLength characters can be part of a special token,
        // so only that prefix is lowercased and matched against
        String input = toLowerCase(string.substring(0, Math.min(string.length(), maximumLength)));
        for (Token special : tokens) {
            if ( ! input.startsWith(special.token())) continue;

            int tokenLength = special.token().length();
            // The length equality test comes first: it guarantees that tokenEndsAt is only
            // reached when there is a character at tokenLength in the string
            if (string.length() == tokenLength || substring || tokenEndsAt(tokenLength, string))
                return special;
        }
        return null;
    }

    /**
     * Returns whether the character at the given position is something else than a letter or digit,
     * such that a token may end just before it.
     * The position must be less than the length of the string.
     */
    private boolean tokenEndsAt(int position, String string) {
        return !Character.isLetterOrDigit(string.charAt(position));
    }

    /** Returns the shared instance named "(empty)" which contains no tokens */
    public static SpecialTokens empty() { return empty; }

    /** An immutable special token */
    public static final class Token implements Comparable<Token> {

        private final String token;
        private final String replacement;

        /** Creates a special token */
        public Token(String token) {
            this(token, null);
        }

        /** Creates a special token which will be represented by the given replacement token */
        public Token(String token, String replacement) {
            this.token = toLowerCase(token);
            // A null or blank replacement means "no replacement": the token then represents itself
            if (replacement == null || replacement.trim().isEmpty())
                this.replacement = this.token;
            else
                this.replacement = toLowerCase(replacement);
        }

        /** Returns the special token */
        public String token() { return token; }

        /** Returns the token to replace occurrences of this by, which equals token() unless this has a replacement. */
        public String replacement() { return replacement; }

        /**
         * Orders tokens by decreasing token length: longer tokens sort before shorter ones.
         * Note that this ordering is not consistent with equals, as different tokens of the same
         * length compare as 0.
         */
        @Override
        public int compareTo(Token other) {
            return Integer.compare(other.token().length(), this.token().length());
        }

        /** Two tokens are equal if their token strings are equal, regardless of replacement */
        @Override
        public boolean equals(Object other) {
            if (other == this) return true;
            if ( ! (other instanceof Token)) return false;
            return Objects.equals(this.token, ((Token)other).token);
        }

        @Override
        public int hashCode() { return token.hashCode(); }

        @Override
        public String toString() {
            return "token '" + token + "'" + (replacement.equals(token) ? "" : " replacement '" + replacement + "'");
        }

        /**
         * Verifies that this token has the same length when lower cased and upper cased.
         *
         * @throws IllegalArgumentException if the length of the token changes with case
         */
        private void validate() {
            // XXX not fool proof length test, should test codepoint by codepoint for mixed case user input? not even that will necessarily be 100% robust...
            String asLow = toLowerCase(token);
            // TODO: Put along with the global toLowerCase
            String asHigh = token.toUpperCase(Locale.ENGLISH);
            if (asLow.length() != token.length() || asHigh.length() != token.length()) {
                throw new IllegalArgumentException("Special token '" + token + "' has case sensitive length. " +
                                                   "Please report this to the Vespa team.");
            }
        }

    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy