com.googlecode.concurrenttrees.solver.LCSubstringSolver Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of concurrent-trees Show documentation
Show all versions of concurrent-trees Show documentation
Concurrent Radix Trees and Concurrent Suffix Trees for Java.
The newest version!
/**
* Copyright 2012-2013 Niall Gallagher
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.googlecode.concurrenttrees.solver;
import com.googlecode.concurrenttrees.common.CharSequences;
import com.googlecode.concurrenttrees.radix.ConcurrentRadixTree;
import com.googlecode.concurrenttrees.radix.node.Node;
import com.googlecode.concurrenttrees.radix.node.NodeFactory;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
* Finds the longest common substring in a collection of documents.
* See Longest common substring problem.
*
* This class internally extends {@link ConcurrentRadixTree} and combines it with elements from
* {@link com.googlecode.concurrenttrees.suffix.ConcurrentSuffixTree}, but implements its own traversal algorithm.
*
* @author Niall Gallagher
*/
public class LCSubstringSolver {
class ConcurrentSuffixTreeImpl extends ConcurrentRadixTree {
public ConcurrentSuffixTreeImpl(NodeFactory nodeFactory) {
super(nodeFactory);
}
@Override
protected void acquireWriteLock() {
super.acquireWriteLock();
}
@Override
protected void releaseWriteLock() {
super.releaseWriteLock();
}
// Override to make accessible to outer class...
@Override
protected Iterable lazyTraverseDescendants(CharSequence startKey, Node startNode) {
return super.lazyTraverseDescendants(startKey, startNode);
}
/**
* The main algorithm to find the longest common substring.
*
* -
* Traverses all nodes in the suffix tree
*
* -
* For each node checks if the path from the root via edges to that node is
* longer than the longest common substring encountered so far (and so is a candidate)
*
* -
* Calls helper method {@link #subTreeReferencesAllOriginalDocuments(CharSequence, Node)},
* supplying the candidate node. That method returns true if nodes in the sub-tree descending from
* that node collectively references all of the original documents added to the solver
*
* -
* If the nodes in the sub-tree do collectively reference all documents, then
* the path from root to the current node is a substring of all documents
*
* -
* Updates the longest common substring encountered so far if the conditions above hold for the
* current node
*
* -
* Continues traversing the tree until all nodes have been checked
*
*
* Implementation note: Method {@link #subTreeReferencesAllOriginalDocuments(CharSequence, Node)} will
* stop traversal early if it finds all original documents early. This method currently does not apply a similar
* optimization, and will actually descend into and apply the same tests to branches which the helper method
* already indicated are dead-ends(!). Future work might be to use this knowledge, skip dead-end branches, but
* it would involve not using any of the traversal logic from superclasses and overriding it all here for this
* one use case.
*
* @return The longest common substring
*/
CharSequence getLongestCommonSubstring() {
Node root = suffixTree.getNode();
final CharSequence[] longestCommonSubstringSoFar = new CharSequence[] {""};
final int[] longestCommonSubstringSoFarLength = new int[] {0};
for (NodeKeyPair nodeKeyPair : lazyTraverseDescendants("", root)) {
if (nodeKeyPair.key.length() > longestCommonSubstringSoFarLength[0]
&& subTreeReferencesAllOriginalDocuments(nodeKeyPair.key, nodeKeyPair.node)) {
longestCommonSubstringSoFarLength[0] = nodeKeyPair.key.length();
longestCommonSubstringSoFar[0] = nodeKeyPair.key;
}
}
return longestCommonSubstringSoFar[0];
}
/**
* Returns true if the given node and its descendants collectively reference all original documents added to
* the solver.
*
* This method will traverse the entire sub-tree until it has encountered all of the original documents. If
* it encounters all of the original documents early, before exhausting all nodes, returns early.
*
* @param startKey The key associated with the start node (concatenation of edges from root leading to it)
* @param startNode The root of the sub-tree to traverse
* @return True if the given node and its descendants collectively reference all original documents added to
* the solver, false if the sub-tree does not reference all documents added to the solver
*/
boolean subTreeReferencesAllOriginalDocuments(CharSequence startKey, Node startNode) {
final Set documentsEncounteredSoFar = new HashSet(originalDocuments.size());
final boolean[] result = new boolean[] {false};
for (NodeKeyPair nodeKeyPair : lazyTraverseDescendants(startKey, startNode)) {
@SuppressWarnings({"unchecked"})
Set documentsReferencedByThisNode = (Set) nodeKeyPair.node.getValue();
if (documentsReferencedByThisNode != null) {
documentsEncounteredSoFar.addAll(documentsReferencedByThisNode);
if (documentsEncounteredSoFar.equals(originalDocuments)) {
// We have now found all of the original documents
// referenced from descendants of the start node...
result[0] = true;
// Stop traversal...
break;
}
}
}
return result[0];
}
}
final ConcurrentSuffixTreeImpl> suffixTree;
final Set originalDocuments;
/**
* Creates a new {@link LCSubstringSolver} which will use the given {@link NodeFactory} to create nodes.
*
* @param nodeFactory An object which creates {@link com.googlecode.concurrenttrees.radix.node.Node} objects
* on-demand, and which might return node implementations optimized for storing the values supplied to it for
* the creation of each node
*/
public LCSubstringSolver(NodeFactory nodeFactory) {
this.suffixTree = new ConcurrentSuffixTreeImpl>(nodeFactory);
this.originalDocuments = createSetForOriginalKeys();
}
/**
* Adds a {@link CharSequence} document to the solver.
*
* @param document The {@link CharSequence} to add to the solver
* @return True if the document was added, false if it was not because it had been added previously
*/
public boolean add(CharSequence document) {
if (document == null) {
throw new IllegalArgumentException("The document argument was null");
}
if (document.length() == 0) {
throw new IllegalArgumentException("The document argument was zero-length");
}
suffixTree.acquireWriteLock();
try {
// We convert to string (for now) due to lack of equals() and hashCode() support in CharSequence...
String documentString = CharSequences.toString(document);
// Put/replace value in set before we add suffixes to the tree...
boolean addedNew = originalDocuments.add(documentString);
if (!addedNew) {
// Key was not added as was already contained, no need to do anything, return false...
return false;
}
// Kew was added to set, now add to tree...
addSuffixesToRadixTree(documentString);
return true;
}
finally {
suffixTree.releaseWriteLock();
}
}
void addSuffixesToRadixTree(String keyAsString) {
Iterable suffixes = CharSequences.generateSuffixes(keyAsString);
for (CharSequence suffix : suffixes) {
Set originalKeyRefs = suffixTree.getValueForExactKey(suffix);
if (originalKeyRefs == null) {
originalKeyRefs = createSetForOriginalKeys();
suffixTree.put(suffix, originalKeyRefs);
}
originalKeyRefs.add(keyAsString);
}
}
/**
* Finds the longest common substring in the documents added to the solver so far.
*
* @return The longest common substring
*/
public CharSequence getLongestCommonSubstring() {
return suffixTree.getLongestCommonSubstring();
}
protected Set createSetForOriginalKeys() {
return Collections.newSetFromMap(new ConcurrentHashMap());
}
}