com.ibm.icu.impl.TextTrieMap Maven / Gradle / Ivy
The newest version!
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
* ********************************************************************************
* Copyright (C) 2007-2011, International Business Machines Corporation and others.
* All Rights Reserved.
* ********************************************************************************
*/
package com.ibm.icu.impl;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
/**
* TextTrieMap is a trie implementation for supporting
* fast prefix match for the key.
*/
public class TextTrieMap {
private Node _root = new Node();
boolean _ignoreCase;
public static class Output {
public int matchLength;
public boolean partialMatch;
}
/**
* Constructs a TextTrieMap object.
*
* @param ignoreCase true to use simple case insensitive match
*/
public TextTrieMap(boolean ignoreCase) {
_ignoreCase = ignoreCase;
}
/**
* Adds the text key and its associated object in this object.
*
* @param text The text.
* @param val The value object associated with the text.
*/
public TextTrieMap put(CharSequence text, V val) {
CharIterator chitr = new CharIterator(text, 0, _ignoreCase);
_root.add(chitr, val);
return this;
}
/**
* Gets an iterator of the objects associated with the
* longest prefix matching string key.
*
* @param text The text to be matched with prefixes.
* @return An iterator of the objects associated with
* the longest prefix matching matching key, or null
* if no matching entry is found.
*/
public Iterator get(String text) {
return get(text, 0);
}
/**
* Gets an iterator of the objects associated with the
* longest prefix matching string key starting at the
* specified position.
*
* @param text The text to be matched with prefixes.
* @param start The start index of of the text
* @return An iterator of the objects associated with the
* longest prefix matching matching key, or null if no
* matching entry is found.
*/
public Iterator get(CharSequence text, int start) {
return get(text, start, null);
}
public Iterator get(CharSequence text, int start, Output output) {
LongestMatchHandler handler = new LongestMatchHandler();
find(text, start, handler, output);
if (output != null) {
output.matchLength = handler.getMatchLength();
}
return handler.getMatches();
}
public void find(CharSequence text, ResultHandler handler) {
find(text, 0, handler, null);
}
public void find(CharSequence text, int offset, ResultHandler handler) {
find(text, offset, handler, null);
}
private void find(CharSequence text, int offset, ResultHandler handler, Output output) {
CharIterator chitr = new CharIterator(text, offset, _ignoreCase);
find(_root, chitr, handler, output);
}
private synchronized void find(Node node, CharIterator chitr, ResultHandler handler, Output output) {
Iterator values = node.values();
if (values != null) {
if (!handler.handlePrefixMatch(chitr.processedLength(), values)) {
return;
}
}
Node nextMatch = node.findMatch(chitr, output);
if (nextMatch != null) {
find(nextMatch, chitr, handler, output);
}
}
public void putLeadCodePoints(UnicodeSet output) {
_root.putLeadCodePoints(output);
}
public static class CharIterator implements Iterator {
private boolean _ignoreCase;
private CharSequence _text;
private int _nextIdx;
private int _startIdx;
private Character _remainingChar;
CharIterator(CharSequence text, int offset, boolean ignoreCase) {
_text = text;
_nextIdx = _startIdx = offset;
_ignoreCase = ignoreCase;
}
/* (non-Javadoc)
* @see java.util.Iterator#hasNext()
*/
@Override
public boolean hasNext() {
if (_nextIdx == _text.length() && _remainingChar == null) {
return false;
}
return true;
}
/* (non-Javadoc)
* @see java.util.Iterator#next()
*/
@Override
public Character next() {
if (_nextIdx == _text.length() && _remainingChar == null) {
return null;
}
Character next;
if (_remainingChar != null) {
next = _remainingChar;
_remainingChar = null;
} else {
if (_ignoreCase) {
int cp = UCharacter.foldCase(Character.codePointAt(_text, _nextIdx), true);
_nextIdx = _nextIdx + Character.charCount(cp);
char[] chars = Character.toChars(cp);
next = chars[0];
if (chars.length == 2) {
_remainingChar = chars[1];
}
} else {
next = _text.charAt(_nextIdx);
_nextIdx++;
}
}
return next;
}
/* (non-Javadoc)
* @see java.util.Iterator#remove()
*/
@Override
public void remove() {
throw new UnsupportedOperationException("remove() not supported");
}
public int nextIndex() {
return _nextIdx;
}
public int processedLength() {
if (_remainingChar != null) {
throw new IllegalStateException("In the middle of surrogate pair");
}
return _nextIdx - _startIdx;
}
}
/**
* Callback handler for processing prefix matches used by
* find method.
*/
public interface ResultHandler {
/**
* Handles a prefix key match
*
* @param matchLength Matched key's length
* @param values An iterator of the objects associated with the matched key
* @return Return true to continue the search in the trie, false to quit.
*/
public boolean handlePrefixMatch(int matchLength, Iterator values);
}
private static class LongestMatchHandler implements ResultHandler {
private Iterator matches = null;
private int length = 0;
@Override
public boolean handlePrefixMatch(int matchLength, Iterator values) {
if (matchLength > length) {
length = matchLength;
matches = values;
}
return true;
}
public Iterator getMatches() {
return matches;
}
public int getMatchLength() {
return length;
}
}
/**
* Inner class representing a text node in the trie.
*/
private class Node {
private char[] _text;
private List _values;
private List _children;
private Node() {
}
private Node(char[] text, List values, List children) {
_text = text;
_values = values;
_children = children;
}
public int charCount() {
return _text == null ? 0 : _text.length;
}
public Iterator values() {
if (_values == null) {
return null;
}
return _values.iterator();
}
public void add(CharIterator chitr, V value) {
StringBuilder buf = new StringBuilder();
while (chitr.hasNext()) {
buf.append(chitr.next());
}
add(toCharArray(buf), 0, value);
}
public Node findMatch(CharIterator chitr, Output output) {
if (_children == null) {
return null;
}
if (!chitr.hasNext()) {
if (output != null) {
output.partialMatch = true;
}
return null;
}
Node match = null;
Character ch = chitr.next();
for (Node child : _children) {
if (ch < child._text[0]) {
break;
}
if (ch == child._text[0]) {
if (child.matchFollowing(chitr, output)) {
match = child;
}
break;
}
}
return match;
}
public void putLeadCodePoints(UnicodeSet output) {
if (_children == null) {
return;
}
for (Node child : _children) {
char c0 = child._text[0];
if (!UCharacter.isHighSurrogate(c0)) {
output.add(c0);
} else if (child.charCount() >= 2) {
output.add(Character.codePointAt(child._text, 0));
} else if (child._children != null) {
// Construct all possible code points from grandchildren.
for (Node grandchild : child._children) {
char c1 = grandchild._text[0];
int cp = Character.toCodePoint(c0, c1);
output.add(cp);
}
}
}
}
private void add(char[] text, int offset, V value) {
if (text.length == offset) {
_values = addValue(_values, value);
return;
}
if (_children == null) {
_children = new LinkedList();
Node child = new Node(subArray(text, offset), addValue(null, value), null);
_children.add(child);
return;
}
// walk through children
ListIterator litr = _children.listIterator();
while (litr.hasNext()) {
Node next = litr.next();
if (text[offset] < next._text[0]) {
litr.previous();
break;
}
if (text[offset] == next._text[0]) {
int matchLen = next.lenMatches(text, offset);
if (matchLen == next._text.length) {
// full match
next.add(text, offset + matchLen, value);
} else {
// partial match, create a branch
next.split(matchLen);
next.add(text, offset + matchLen, value);
}
return;
}
}
// add a new child to this node
litr.add(new Node(subArray(text, offset), addValue(null, value), null));
}
private boolean matchFollowing(CharIterator chitr, Output output) {
boolean matched = true;
int idx = 1;
while (idx < _text.length) {
if(!chitr.hasNext()) {
if (output != null) {
output.partialMatch = true;
}
matched = false;
break;
}
Character ch = chitr.next();
if (ch != _text[idx]) {
matched = false;
break;
}
idx++;
}
return matched;
}
private int lenMatches(char[] text, int offset) {
int textLen = text.length - offset;
int limit = _text.length < textLen ? _text.length : textLen;
int len = 0;
while (len < limit) {
if (_text[len] != text[offset + len]) {
break;
}
len++;
}
return len;
}
private void split(int offset) {
// split the current node at the offset
char[] childText = subArray(_text, offset);
_text = subArray(_text, 0, offset);
// add the Node representing after the offset as a child
Node child = new Node(childText, _values, _children);
_values = null;
_children = new LinkedList();
_children.add(child);
}
private List addValue(List list, V value) {
if (list == null) {
list = new LinkedList();
}
list.add(value);
return list;
}
}
private static char[] toCharArray(CharSequence text) {
char[] array = new char[text.length()];
for (int i = 0; i < array.length; i++) {
array[i] = text.charAt(i);
}
return array;
}
private static char[] subArray(char[] array, int start) {
if (start == 0) {
return array;
}
char[] sub = new char[array.length - start];
System.arraycopy(array, start, sub, 0, sub.length);
return sub;
}
private static char[] subArray(char[] array, int start, int limit) {
if (start == 0 && limit == array.length) {
return array;
}
char[] sub = new char[limit - start];
System.arraycopy(array, start, sub, 0, limit - start);
return sub;
}
}