com.ibm.icu.impl.TextTrieMap Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support
There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 * ********************************************************************************
 * Copyright (C) 2007-2011, International Business Machines Corporation and others.
 * All Rights Reserved.
 * ********************************************************************************
 */
package com.ibm.icu.impl;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;

/**
 * TextTrieMap is a trie implementation for supporting
 * fast prefix match for the key.
 */
public class TextTrieMap {

    private Node _root = new Node();
    boolean _ignoreCase;

    public static class Output {
        public int matchLength;
        public boolean partialMatch;
    }

    /**
     * Constructs a TextTrieMap object.
     *
     * @param ignoreCase true to use simple case insensitive match
     */
    public TextTrieMap(boolean ignoreCase) {
        _ignoreCase = ignoreCase;
    }

    /**
     * Adds the text key and its associated object in this object.
     *
     * @param text The text.
     * @param val The value object associated with the text.
     */
    public TextTrieMap put(CharSequence text, V val) {
        CharIterator chitr = new CharIterator(text, 0, _ignoreCase);
        _root.add(chitr, val);
        return this;
    }

    /**
     * Gets an iterator of the objects associated with the
     * longest prefix matching string key.
     *
     * @param text The text to be matched with prefixes.
     * @return An iterator of the objects associated with
     * the longest prefix matching matching key, or null
     * if no matching entry is found.
     */
    public Iterator get(String text) {
        return get(text, 0);
    }

    /**
     * Gets an iterator of the objects associated with the
     * longest prefix matching string key starting at the
     * specified position.
     *
     * @param text The text to be matched with prefixes.
     * @param start The start index of of the text
     * @return An iterator of the objects associated with the
     * longest prefix matching matching key, or null if no
     * matching entry is found.
     */
    public Iterator get(CharSequence text, int start) {
        return get(text, start, null);
    }

    public Iterator get(CharSequence text, int start, Output output) {
        LongestMatchHandler handler = new LongestMatchHandler();
        find(text, start, handler, output);
        if (output != null) {
            output.matchLength = handler.getMatchLength();
        }
        return handler.getMatches();
    }

    public void find(CharSequence text, ResultHandler handler) {
        find(text, 0, handler, null);
    }

    public void find(CharSequence text, int offset, ResultHandler handler) {
        find(text, offset, handler, null);
    }

    private void find(CharSequence text, int offset, ResultHandler handler, Output output) {
        CharIterator chitr = new CharIterator(text, offset, _ignoreCase);
        find(_root, chitr, handler, output);
    }

    private synchronized void find(Node node, CharIterator chitr, ResultHandler handler, Output output) {
        Iterator values = node.values();
        if (values != null) {
            if (!handler.handlePrefixMatch(chitr.processedLength(), values)) {
                return;
            }
        }

        Node nextMatch = node.findMatch(chitr, output);
        if (nextMatch != null) {
            find(nextMatch, chitr, handler, output);
        }
    }

    public void putLeadCodePoints(UnicodeSet output) {
        _root.putLeadCodePoints(output);
    }

    public static class CharIterator implements Iterator {
        private boolean _ignoreCase;
        private CharSequence _text;
        private int _nextIdx;
        private int _startIdx;

        private Character _remainingChar;

        CharIterator(CharSequence text, int offset, boolean ignoreCase) {
            _text = text;
            _nextIdx = _startIdx = offset;
            _ignoreCase = ignoreCase;
        }

        /* (non-Javadoc)
         * @see java.util.Iterator#hasNext()
         */
        @Override
        public boolean hasNext() {
            if (_nextIdx == _text.length() && _remainingChar == null) {
                return false;
            }
            return true;
        }

        /* (non-Javadoc)
         * @see java.util.Iterator#next()
         */
        @Override
        public Character next() {
            if (_nextIdx == _text.length() && _remainingChar == null) {
                return null;
            }
            Character next;
            if (_remainingChar != null) {
                next = _remainingChar;
                _remainingChar = null;
            } else {
                if (_ignoreCase) {
                    int cp = UCharacter.foldCase(Character.codePointAt(_text, _nextIdx), true);
                    _nextIdx = _nextIdx + Character.charCount(cp);

                    char[] chars = Character.toChars(cp);
                    next = chars[0];
                    if (chars.length == 2) {
                        _remainingChar = chars[1];
                    }
                } else {
                    next = _text.charAt(_nextIdx);
                    _nextIdx++;
                }
            }
            return next;
        }

        /* (non-Javadoc)
         * @see java.util.Iterator#remove()
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException("remove() not supported");
        }

        public int nextIndex() {
            return _nextIdx;
        }

        public int processedLength() {
            if (_remainingChar != null) {
                throw new IllegalStateException("In the middle of surrogate pair");
            }
            return _nextIdx - _startIdx;
        }
    }

    /**
     * Callback handler for processing prefix matches used by
     * find method.
     */
    public interface ResultHandler {
        /**
         * Handles a prefix key match
         *
         * @param matchLength Matched key's length
         * @param values An iterator of the objects associated with the matched key
         * @return Return true to continue the search in the trie, false to quit.
         */
        public boolean handlePrefixMatch(int matchLength, Iterator values);
    }

    private static class LongestMatchHandler implements ResultHandler {
        private Iterator matches = null;
        private int length = 0;

        @Override
        public boolean handlePrefixMatch(int matchLength, Iterator values) {
            if (matchLength > length) {
                length = matchLength;
                matches = values;
            }
            return true;
        }

        public Iterator getMatches() {
            return matches;
        }

        public int getMatchLength() {
            return length;
        }
    }

    /**
     * Inner class representing a text node in the trie.
     */
    private class Node {
        private char[] _text;
        private List _values;
        private List _children;

        private Node() {
        }

        private Node(char[] text, List values, List children) {
            _text = text;
            _values = values;
            _children = children;
        }

        public int charCount() {
          return _text == null ? 0 : _text.length;
        }

        public Iterator values() {
            if (_values == null) {
                return null;
            }
            return _values.iterator();
        }

        public void add(CharIterator chitr, V value) {
            StringBuilder buf = new StringBuilder();
            while (chitr.hasNext()) {
                buf.append(chitr.next());
            }
            add(toCharArray(buf), 0, value);
        }

        public Node findMatch(CharIterator chitr, Output output) {
            if (_children == null) {
                return null;
            }
            if (!chitr.hasNext()) {
                if (output != null) {
                    output.partialMatch = true;
                }
                return null;
            }
            Node match = null;
            Character ch = chitr.next();
            for (Node child : _children) {
                if (ch < child._text[0]) {
                    break;
                }
                if (ch == child._text[0]) {
                    if (child.matchFollowing(chitr, output)) {
                        match = child;
                    }
                    break;
                }
            }
            return match;
        }

        public void putLeadCodePoints(UnicodeSet output) {
            if (_children == null) {
                return;
            }
            for (Node child : _children) {
                char c0 = child._text[0];
                if (!UCharacter.isHighSurrogate(c0)) {
                    output.add(c0);
                } else if (child.charCount() >= 2) {
                    output.add(Character.codePointAt(child._text, 0));
                } else if (child._children != null) {
                    // Construct all possible code points from grandchildren.
                    for (Node grandchild : child._children) {
                        char c1 = grandchild._text[0];
                        int cp = Character.toCodePoint(c0, c1);
                        output.add(cp);
                    }
                }
            }
        }

        private void add(char[] text, int offset, V value) {
            if (text.length == offset) {
                _values = addValue(_values, value);
                return;
            }

            if (_children == null) {
                _children = new LinkedList();
                Node child = new Node(subArray(text, offset), addValue(null, value), null);
                _children.add(child);
                return;
            }

            // walk through children
            ListIterator litr = _children.listIterator();
            while (litr.hasNext()) {
                Node next = litr.next();
                if (text[offset] < next._text[0]) {
                    litr.previous();
                    break;
                }
                if (text[offset] == next._text[0]) {
                    int matchLen = next.lenMatches(text, offset);
                    if (matchLen == next._text.length) {
                        // full match
                        next.add(text, offset + matchLen, value);
                    } else {
                        // partial match, create a branch
                        next.split(matchLen);
                        next.add(text, offset + matchLen, value);
                    }
                    return;
                }
            }
            // add a new child to this node
            litr.add(new Node(subArray(text, offset), addValue(null, value), null));
        }

        private boolean matchFollowing(CharIterator chitr, Output output) {
            boolean matched = true;
            int idx = 1;
            while (idx < _text.length) {
                if(!chitr.hasNext()) {
                    if (output != null) {
                        output.partialMatch = true;
                    }
                    matched = false;
                    break;
                }
                Character ch = chitr.next();
                if (ch != _text[idx]) {
                    matched = false;
                    break;
                }
                idx++;
            }
            return matched;
        }

        private int lenMatches(char[] text, int offset) {
            int textLen = text.length - offset;
            int limit = _text.length < textLen ? _text.length : textLen;
            int len = 0;
            while (len < limit) {
                if (_text[len] != text[offset + len]) {
                    break;
                }
                len++;
            }
            return len;
        }

        private void split(int offset) {
            // split the current node at the offset
            char[] childText = subArray(_text, offset);
            _text = subArray(_text, 0, offset);

            // add the Node representing after the offset as a child
            Node child = new Node(childText, _values, _children);
            _values = null;

            _children = new LinkedList();
            _children.add(child);
        }

        private List addValue(List list, V value) {
            if (list == null) {
                list = new LinkedList();
            }
            list.add(value);
            return list;
        }
    }

    private static char[] toCharArray(CharSequence text) {
        char[] array = new char[text.length()];
        for (int i = 0; i < array.length; i++) {
            array[i] = text.charAt(i);
        }
        return array;
    }

    private static char[] subArray(char[] array, int start) {
        if (start == 0) {
            return array;
        }
        char[] sub = new char[array.length - start];
        System.arraycopy(array, start, sub, 0, sub.length);
        return sub;
    }

    private static char[] subArray(char[] array, int start, int limit) {
        if (start == 0 && limit == array.length) {
            return array;
        }
        char[] sub = new char[limit - start];
        System.arraycopy(array, start, sub, 0, limit - start);
        return sub;
    }
}