// org.wicketstuff.datatable_autocomplete.trie.PatriciaTrie Maven / Gradle / Ivy
//
// Go to download
/*
 * 
 * ==============================================================================
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.wicketstuff.datatable_autocomplete.trie;

import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.wicket.IClusterable;

/**
 * @author mocleiri
 * 
 *         A Trie is a specialized search tree that is optimized for retrieval of data.
 * 
 *         This implementation is read-only and expects to load the data then minimize itself and be
 *         part of a singleton that returns the indexed data.
 * 
 *         A Patricia Trie is used to index words from left to right.
 * 
 *         A Suffix Tree, which is useful for any string matching, can be build on top of a Patricia
 *         Trie simply using a variant indexing method.
 * 
 *         An ITrieFilter can be used to filter additional fields within an indexed object when
 *         the list of matching words (objects) is being computed.
 * 
 * @see Radix tree
 * 
 *      It is suited for quick retrieval of prefix matches over large static datasets (100,000
 *      elements)
 * 
 *      This implementation will index an object C based on the word (String) that is extracted
 *      using the ITrieNodeConfiguration.getWord (C c) method.
 * 
 */
public class PatriciaTrie implements IClusterable, Trie
{

	/**
	 * 
	 */
	private static final long serialVersionUID = -6075870905379098868L;

	private TrieNode root = null;

	private ITrieConfiguration configuration = null;

	/**
	 * 
	 */
	public PatriciaTrie()
	{

		super();
	}


	/**
	 * 
	 */
	public PatriciaTrie(ITrieConfiguration configuration)
	{

		this.configuration = configuration;
		this.configuration.setTrie(this);

		this.root = configuration.createTrieNode(null, "", "");


	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see org.wicketstuff.datatable_autocomplete.trie.Trie#index(C)
	 */
	public void index(C value)
	{

		// traverse to the point where no match is found and then insert at that
		// point.

		if (configuration.isSuffixTree())
		{
			// suffix tree
			// for anystring match

			String word = configuration.getWord(value);

			int length = word.length();

			for (int i = 0; i < length; i++)
			{

				// index each substring of the word from the initial full word through to the last
// character.
				String subWord = word.substring(i);

				this.root.index(subWord, value);


			}
		}
		else
		{
			// prefix tree
			// for prefix match
			this.root.index(value);

		}


	}


	/*
	 * (non-Javadoc)
	 * 
	 * @see org.wicketstuff.datatable_autocomplete.trie.Trie#getWordList(java.lang.String)
	 */
	public List getWordList(String prefix)
	{

		return getWordList(prefix, configuration.getDefaultFilter(), -1);

	}

	// private List getWordList(TrieNode prefixNode) {
	//
	// return getWordList(prefixNode, configuration.getDefaultFilter(), -1);
	// }

	public PrefixTrieMatch find(String prefix, ITrieFilter filter)
	{

		return this.root.find(prefix, filter);
	}

	public List getWordList(String prefix, ITrieFilter filter, int limit)
	{

		PrefixTrieMatch prefixNodeMatch = this.root.find(prefix, filter);

		if (prefixNodeMatch == null)
			return new LinkedList();
		else
			return prefixNodeMatch.getWordList(limit);
	}


	/**
	 * Visit each TrieNode
	 * 
	 * @param v
	 */

	public void visit(ITrieNodeVisitor v)
	{

		this.root.visit(v);
	}

	/**
	 * Compresses the sparse nodes with only 1 branch; makes the Trie into a Patricia Trie which
	 * uses less space.
	 */
	public void simplifyIndex()
	{

		// the first simplification is to remove nodes that have only 1 branch.
		// we will basically have nodes that represent more than a single
		// character
		this.root.simplify();

		/*
		 * We visit each leaf then iterate over upward to mark the max length of each nodes sub
		 * tree.
		 */

		final List> leafNodeList = new LinkedList>();

		this.root.visit(new ITrieNodeVisitor()
		{

			public void visit(TrieNode element)
			{

				if (element.getOrderedNodeList().size() == 0)
					leafNodeList.add(element);

				for (TrieNode trieNode : element.getOrderedNodeList())
				{

					trieNode.visit(this);
				}

			}
		});

		for (TrieNode trieNode : leafNodeList)
		{

			TrieNode parentNode = trieNode.getParentNode();
			TrieNode currentNode = trieNode;

			while (parentNode != null)
			{

				// start at the bottom and work upwards

				int currentLength = currentNode.getCharacter().length();

				int currentMax = currentNode.getMaxChildStringLength() + currentLength;

				int maxParentLength = parentNode.getMaxChildStringLength();

				if (currentMax > maxParentLength)
				{
					parentNode.setMaxChildStringLength(currentMax);

				}

				currentNode = parentNode;
				parentNode = parentNode.getParentNode();

			}

		}

	}


	/**
	 * @return children of the root node.
	 */
	public int getChildren()
	{

		return root.getOrderedNodeList().size();
	}

	/**
	 * Return the size of the subtree for the prefix given. This avoids the need to get the list
	 * especially when the count is large.
	 * 
	 * @param prefix
	 * @return the number of elements in the subtree corresponding to the prefix given.
	 * 
	 */
	public int getPrefixMatchedElementCount(String prefix, final ITrieFilter nodeFilter)
	{

		PrefixTrieMatch match = root.find(prefix, nodeFilter);

		if (match == null)
			return 0;

		final AtomicInteger counter = new AtomicInteger(0);

		match.getNode().visit(new ITrieNodeVisitor()
		{

			public void visit(TrieNode node)
			{

				for (C value : node.getOrderedMatchList())
				{
					if (nodeFilter.isVisible(value))
					{
						counter.addAndGet(node.getTotalMatches());
					}
				}

			}
		});

		return counter.intValue();

	}

	/**
	 * 
	 * @return the total number of elements indexed by this trie.
	 * 
	 *         Note this can be an expensive call as each node in the trie is visited.
	 * 
	 */

	public int size()
	{

		final AtomicInteger counter = new AtomicInteger(0);

		// visit each node an aggregate the number of matches:
		root.visit(new ITrieNodeVisitor()
		{

			public void visit(TrieNode node)
			{

				counter.addAndGet(node.getTotalMatches());
			}
		});

		return counter.intValue();
	}

	/**
	 * @return the set of strings that map to the next nodes of the root node.
	 * @see org.wicketstuff.datatable_autocomplete.trie.TrieNode#getNextNodeCharacterSet()
	 */
	public Set getNextNodeCharacterSet()
	{
		/*
		 * This is really just to support the datatable-autocomplete-examples where we give a count
		 * of the matches for each first character contained in this set.
		 */
		return root.getNextNodeCharacterSet();
	}


	public List getWordList(String prefix, ITrieFilter filter)
	{
		return getWordList(prefix, filter, -1);
	}


	/*
	 * (non-Javadoc)
	 * 
	 * @see org.wicketstuff.datatable_autocomplete.trie.Trie#getWordList(java.lang.String, int)
	 */
	public List getWordList(String prefix, int limit)
	{
		return getWordList(prefix, null, limit);
	}


	/*
	 * (non-Javadoc)
	 * 
	 * @see org.wicketstuff.datatable_autocomplete.trie.Trie#postIndexing()
	 */
	public void postIndexing()
	{

		this.simplifyIndex();

	}


	/*
	 * (non-Javadoc)
	 * 
	 * @see org.wicketstuff.datatable_autocomplete.trie.Trie#preIndexing()
	 */
	public void preIndexing()
	{

	}


	public TrieNode getRoot()
	{
		return root;
	}


}