com.ibm.icu.impl.TextTrieMap Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and globalization support.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
* ********************************************************************************
* Copyright (C) 2007-2011, International Business Machines Corporation and others.
* All Rights Reserved.
* ********************************************************************************
*/
package com.ibm.icu.impl;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
/**
 * TextTrieMap is a trie implementation for supporting
 * fast prefix match for the key.
 *
 * <p>Keys are stored as UTF-16 code units. Each trie node holds a run of
 * code units, and the children of a node are kept sorted by their first code
 * unit. When the map is constructed with {@code ignoreCase == true}, every
 * key and every looked-up text is passed through simple case folding before
 * it is compared.
 *
 * <p>Only the internal recursive lookup is {@code synchronized}; {@code put}
 * is not, so callers that mutate the map while other threads read it must
 * provide their own locking.
 *
 * @param <V> the type of the values associated with the text keys
 */
public class TextTrieMap<V> {

    /** Root of the trie; it carries no text of its own. */
    private final Node _root = new Node();

    /** True when keys and looked-up text are case folded before comparison. */
    boolean _ignoreCase;

    /**
     * Optional out-parameter for lookups.
     */
    public static class Output {
        /** Length, in UTF-16 code units of the input, of the longest key matched. */
        public int matchLength;
        /**
         * Set to true when the input text ran out while the trie still had
         * more characters to compare. It is never reset to false by this class.
         */
        public boolean partialMatch;
    }

    /**
     * Constructs a TextTrieMap object.
     *
     * @param ignoreCase true to use simple case insensitive match
     */
    public TextTrieMap(boolean ignoreCase) {
        _ignoreCase = ignoreCase;
    }

    /**
     * Adds the text key and its associated object in this object.
     *
     * @param text The text.
     * @param val The value object associated with the text.
     * @return this map, to allow call chaining.
     */
    public TextTrieMap<V> put(CharSequence text, V val) {
        CharIterator chitr = new CharIterator(text, 0, _ignoreCase);
        _root.add(chitr, val);
        return this;
    }

    /**
     * Gets an iterator of the objects associated with the
     * longest prefix matching string key.
     *
     * @param text The text to be matched with prefixes.
     * @return An iterator of the objects associated with
     * the longest prefix matching key, or null
     * if no matching entry is found.
     */
    public Iterator<V> get(String text) {
        return get(text, 0);
    }

    /**
     * Gets an iterator of the objects associated with the
     * longest prefix matching string key starting at the
     * specified position.
     *
     * @param text The text to be matched with prefixes.
     * @param start The start index of the text
     * @return An iterator of the objects associated with the
     * longest prefix matching key, or null if no
     * matching entry is found.
     */
    public Iterator<V> get(CharSequence text, int start) {
        return get(text, start, null);
    }

    /**
     * Gets an iterator of the objects associated with the longest prefix
     * matching key starting at the specified position, and reports details
     * of the match through {@code output}.
     *
     * @param text The text to be matched with prefixes.
     * @param start The start index of the text
     * @param output If not null, receives the length of the longest match in
     * {@code matchLength}; {@code partialMatch} is set to true when the text
     * ended before a trie comparison could finish.
     * @return An iterator of the objects associated with the longest prefix
     * matching key, or null if no matching entry is found.
     */
    public Iterator<V> get(CharSequence text, int start, Output output) {
        LongestMatchHandler<V> handler = new LongestMatchHandler<V>();
        find(text, start, handler, output);
        if (output != null) {
            output.matchLength = handler.getMatchLength();
        }
        return handler.getMatches();
    }

    /**
     * Reports every key that is a prefix of the text to the handler.
     *
     * @param text The text to be matched with prefixes.
     * @param handler Callback invoked for each prefix match.
     */
    public void find(CharSequence text, ResultHandler<V> handler) {
        find(text, 0, handler, null);
    }

    /**
     * Reports every key that is a prefix of the text, starting at the
     * given offset, to the handler.
     *
     * @param text The text to be matched with prefixes.
     * @param offset The start index of the text.
     * @param handler Callback invoked for each prefix match.
     */
    public void find(CharSequence text, int offset, ResultHandler<V> handler) {
        find(text, offset, handler, null);
    }

    private void find(CharSequence text, int offset, ResultHandler<V> handler, Output output) {
        CharIterator chitr = new CharIterator(text, offset, _ignoreCase);
        find(_root, chitr, handler, output);
    }

    /**
     * Walks down the trie from {@code node}, consuming characters from
     * {@code chitr}, and notifies the handler at every node that has values.
     * The walk stops when the handler returns false or no child matches.
     */
    private synchronized void find(Node node, CharIterator chitr, ResultHandler<V> handler,
            Output output) {
        Iterator<V> values = node.values();
        if (values != null) {
            if (!handler.handlePrefixMatch(chitr.processedLength(), values)) {
                return;
            }
        }

        Node nextMatch = node.findMatch(chitr, output);
        if (nextMatch != null) {
            find(nextMatch, chitr, handler, output);
        }
    }

    /**
     * Adds to {@code output} the first code point of every key in this map.
     *
     * @param output The set receiving the lead code points.
     */
    public void putLeadCodePoints(UnicodeSet output) {
        _root.putLeadCodePoints(output);
    }

    /**
     * Iterates over the UTF-16 code units of a text, optionally applying
     * simple case folding code point by code point.
     */
    public static class CharIterator implements Iterator<Character> {
        private final boolean _ignoreCase;
        private final CharSequence _text;
        private int _nextIdx;
        private final int _startIdx;
        /** Pending trail surrogate of a folded supplementary code point, or null. */
        private Character _remainingChar;

        CharIterator(CharSequence text, int offset, boolean ignoreCase) {
            _text = text;
            _startIdx = offset;
            _nextIdx = offset;
            _ignoreCase = ignoreCase;
        }

        /** Returns true when the text is consumed and no trail surrogate is pending. */
        private boolean exhausted() {
            return _nextIdx == _text.length() && _remainingChar == null;
        }

        /* (non-Javadoc)
         * @see java.util.Iterator#hasNext()
         */
        @Override
        public boolean hasNext() {
            return !exhausted();
        }

        /**
         * Returns the next (possibly case folded) UTF-16 code unit.
         *
         * @return the next code unit, or null when the text is exhausted.
         */
        @Override
        public Character next() {
            if (exhausted()) {
                return null;
            }

            // Second half of a supplementary code point produced by the previous call.
            if (_remainingChar != null) {
                Character pending = _remainingChar;
                _remainingChar = null;
                return pending;
            }

            if (!_ignoreCase) {
                return _text.charAt(_nextIdx++);
            }

            int cp = Character.codePointAt(_text, _nextIdx);
            // Advance by the length of the code point actually read from the
            // text, not by the length of its folded form, so the index stays
            // aligned with the input even if folding changes the UTF-16 length.
            _nextIdx += Character.charCount(cp);

            char[] folded = Character.toChars(UCharacter.foldCase(cp, true));
            if (folded.length == 2) {
                _remainingChar = folded[1];
            }
            return folded[0];
        }

        /* (non-Javadoc)
         * @see java.util.Iterator#remove()
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException("remove() not supported");
        }

        /**
         * @return the index in the text of the next code unit to be read.
         */
        public int nextIndex() {
            return _nextIdx;
        }

        /**
         * @return the number of code units of the text consumed so far.
         * @throws IllegalStateException if only the first half of a folded
         * surrogate pair has been returned.
         */
        public int processedLength() {
            if (_remainingChar != null) {
                throw new IllegalStateException("In the middle of surrogate pair");
            }
            return _nextIdx - _startIdx;
        }
    }

    /**
     * Callback handler for processing prefix matches used by
     * find method.
     *
     * @param <V> the type of the values stored in the map
     */
    public interface ResultHandler<V> {
        /**
         * Handles a prefix key match
         *
         * @param matchLength Matched key's length
         * @param values An iterator of the objects associated with the matched key
         * @return Return true to continue the search in the trie, false to quit.
         */
        public boolean handlePrefixMatch(int matchLength, Iterator<V> values);
    }

    /**
     * Handler that remembers the values of the longest match seen.
     * Because the initial length is 0 and the comparison is strict, a match
     * of length 0 is never recorded.
     */
    private static class LongestMatchHandler<V> implements ResultHandler<V> {
        private Iterator<V> matches = null;
        private int length = 0;

        @Override
        public boolean handlePrefixMatch(int matchLength, Iterator<V> values) {
            if (matchLength > length) {
                length = matchLength;
                matches = values;
            }
            return true;
        }

        public Iterator<V> getMatches() {
            return matches;
        }

        public int getMatchLength() {
            return length;
        }
    }

    /**
     * Inner class representing a text node in the trie.
     */
    private class Node {
        /** Code units on the edge leading to this node; null for the root. */
        private char[] _text;
        /** Values of the key ending at this node, or null if no key ends here. */
        private List<V> _values;
        /** Child nodes ordered by their first code unit, or null if none. */
        private List<Node> _children;

        private Node() {
        }

        private Node(char[] text, List<V> values, List<Node> children) {
            _text = text;
            _values = values;
            _children = children;
        }

        /** Returns the number of code units held by this node. */
        public int charCount() {
            return _text == null ? 0 : _text.length;
        }

        /** Returns an iterator over this node's values, or null if it has none. */
        public Iterator<V> values() {
            if (_values == null) {
                return null;
            }
            return _values.iterator();
        }

        /** Drains the iterator into a key and stores the value under it. */
        public void add(CharIterator chitr, V value) {
            StringBuilder buf = new StringBuilder();
            while (chitr.hasNext()) {
                buf.append(chitr.next().charValue());
            }
            add(toCharArray(buf), 0, value);
        }

        /**
         * Consumes characters from the iterator and returns the child whose
         * whole text matches them, or null if there is none. Sets
         * {@code output.partialMatch} when the input ends first.
         */
        public Node findMatch(CharIterator chitr, Output output) {
            if (_children == null) {
                return null;
            }
            if (!chitr.hasNext()) {
                if (output != null) {
                    output.partialMatch = true;
                }
                return null;
            }
            Node match = null;
            char ch = chitr.next();
            for (Node child : _children) {
                char lead = child._text[0];
                if (ch < lead) {
                    // Children are sorted, so no later child can match.
                    break;
                }
                if (ch == lead) {
                    if (child.matchFollowing(chitr, output)) {
                        match = child;
                    }
                    break;
                }
            }
            return match;
        }

        /** Adds the first code point of every key below this node to the set. */
        public void putLeadCodePoints(UnicodeSet output) {
            if (_children == null) {
                return;
            }
            for (Node child : _children) {
                char c0 = child._text[0];
                if (!UCharacter.isHighSurrogate(c0)) {
                    output.add(c0);
                } else if (child.charCount() >= 2) {
                    output.add(Character.codePointAt(child._text, 0));
                } else if (child._children != null) {
                    // Construct all possible code points from grandchildren.
                    for (Node grandchild : child._children) {
                        char c1 = grandchild._text[0];
                        int cp = Character.toCodePoint(c0, c1);
                        output.add(cp);
                    }
                }
            }
        }

        /**
         * Inserts the value under {@code text[offset..]} relative to this
         * node, splitting an existing child when only part of it matches.
         */
        private void add(char[] text, int offset, V value) {
            if (text.length == offset) {
                _values = addValue(_values, value);
                return;
            }
            if (_children == null) {
                _children = new LinkedList<Node>();
                Node child = new Node(subArray(text, offset), addValue(null, value), null);
                _children.add(child);
                return;
            }

            // walk through children
            ListIterator<Node> litr = _children.listIterator();
            while (litr.hasNext()) {
                Node next = litr.next();
                if (text[offset] < next._text[0]) {
                    // Step back so the new node is inserted before this child.
                    litr.previous();
                    break;
                }
                if (text[offset] == next._text[0]) {
                    int matchLen = next.lenMatches(text, offset);
                    if (matchLen != next._text.length) {
                        // partial match, create a branch
                        next.split(matchLen);
                    }
                    next.add(text, offset + matchLen, value);
                    return;
                }
            }

            // add a new child to this node
            litr.add(new Node(subArray(text, offset), addValue(null, value), null));
        }

        /**
         * Compares the rest of this node's text (after its first code unit)
         * with the upcoming characters of the iterator.
         */
        private boolean matchFollowing(CharIterator chitr, Output output) {
            for (int idx = 1; idx < _text.length; idx++) {
                if (!chitr.hasNext()) {
                    if (output != null) {
                        output.partialMatch = true;
                    }
                    return false;
                }
                char ch = chitr.next();
                if (ch != _text[idx]) {
                    return false;
                }
            }
            return true;
        }

        /** Returns the length of the common prefix of this node's text and text[offset..]. */
        private int lenMatches(char[] text, int offset) {
            int textLen = text.length - offset;
            int limit = _text.length < textLen ? _text.length : textLen;
            int len = 0;
            while (len < limit && _text[len] == text[offset + len]) {
                len++;
            }
            return len;
        }

        /**
         * Splits this node at the offset: this node keeps the leading part
         * and a single new child takes the tail, the values and the children.
         */
        private void split(int offset) {
            // split the current node at the offset
            char[] childText = subArray(_text, offset);
            _text = subArray(_text, 0, offset);

            // add the Node representing after the offset as a child
            Node child = new Node(childText, _values, _children);
            _values = null;

            _children = new LinkedList<Node>();
            _children.add(child);
        }

        /** Appends the value to the list, creating the list when it is null. */
        private List<V> addValue(List<V> list, V value) {
            if (list == null) {
                list = new LinkedList<V>();
            }
            list.add(value);
            return list;
        }
    }

    /** Copies the characters of the sequence into a new array. */
    private static char[] toCharArray(CharSequence text) {
        char[] array = new char[text.length()];
        for (int i = 0; i < array.length; i++) {
            array[i] = text.charAt(i);
        }
        return array;
    }

    /** Returns array[start..]; the same array instance is returned when start is 0. */
    private static char[] subArray(char[] array, int start) {
        if (start == 0) {
            return array;
        }
        char[] sub = new char[array.length - start];
        System.arraycopy(array, start, sub, 0, sub.length);
        return sub;
    }

    /** Returns array[start..limit); the same array instance is returned for the full range. */
    private static char[] subArray(char[] array, int start, int limit) {
        if (start == 0 && limit == array.length) {
            return array;
        }
        char[] sub = new char[limit - start];
        System.arraycopy(array, start, sub, 0, limit - start);
        return sub;
    }
}