com.github.tmatek.zhangshasha.TreeDistance Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of zhang-shasha-java Show documentation
Show all versions of zhang-shasha-java Show documentation
A Java implementation of Zhang-Shasha algorithm for ordered tree distance calculation.
Calculates a series of transformations required to transform one tree into another. Every
transformation has an associated cost. The sum of costs of all transformations is minimal - the tree distance.
package com.github.tmatek.zhangshasha;
import java.util.*;
/**
* Utility class for calculating the tree distance between two tree structures.
*/
public final class TreeDistance {
// Cost assigned to deleting a node that has no parent (i.e. a root) while the forest distance table is
// filled, so that such a deletion is very unlikely to be picked as the cheapest operation.
private static final int HIGH_COST = 100000;
// Private and empty: the class only exposes static members and is not meant to be instantiated.
private TreeDistance() {
}
/**
 * Mutable {@code int} box. Used as the shared postorder id counter that is passed down the recursion in
 * {@code getPostorderIdentifiersRec}.
 */
private static class IntHolder {
// current value; never assigned explicitly on construction, so it starts at the field default of 0
public int value;
}
/**
 * Assigns a unique identifier to each tree node according to the postorder traversal of the tree structure.
 * Assumes that {@code node} is the root of the tree structure. All identifiers are in the range
 * [0, number of nodes in the tree); the node passed in is numbered last and therefore receives the
 * largest identifier.
 *
 * @param node the root node from which the postorder numbering starts
 * @return a mapping of tree nodes to postorder sequence ids
 */
public static ReversibleIdentityMap<TreeNode, Integer> getPostorderIdentifiers(TreeNode node) {
    ReversibleIdentityMap<TreeNode, Integer> postorderMap = new ReversibleIdentityMap<>();
    // a single counter object is shared by the whole recursion so ids are handed out consecutively
    getPostorderIdentifiersRec(node, postorderMap, new IntHolder());
    return postorderMap;
}
/**
 * Recursively assigns postorder identifiers to tree nodes: all children of {@code current} are numbered
 * first (in the order returned by {@code getChildren()}), then {@code current} itself.
 *
 * @param current the current tree node being processed
 * @param map a mapping of tree nodes to postorder sequence ids, filled in by this method
 * @param idSequence the counter of postorder sequence ids; incremented once per visited node
 */
private static void getPostorderIdentifiersRec(TreeNode current, Map<TreeNode, Integer> map,
                                               IntHolder idSequence) {
    for (TreeNode child : current.getChildren()) {
        getPostorderIdentifiersRec(child, map, idSequence);
    }
    // post-increment: the node gets the current counter value, the next node gets the following one
    map.put(current, idSequence.value++);
}
/**
 * Returns an array of leftmost leaf descendants for every node in the tree given by {@code root}. The
 * indexing of the returned array follows the postorder IDs given in {@code postorderIDs};
 * {@code result[0]} is the leftmost leaf descendant of the node with postorder ID zero.
 *
 * A leftmost leaf descendant of a node is found by following the leftmost branch from the node to a leaf.
 *
 * @param root the root node to start the search from
 * @param postorderIDs a mapping of tree nodes to postorder IDs
 * @return an array of leftmost leaf descendants, indexed by postorder ID
 */
public static TreeNode[] leftmostLeafDescendants(TreeNode root, Map<TreeNode, Integer> postorderIDs) {
    // the array is sized from the root's ID, i.e. this assumes the root carries the highest postorder ID
    // (which is what getPostorderIdentifiers produces)
    TreeNode[] lmld = new TreeNode[postorderIDs.get(root) + 1];
    leftmostLeafDescendantsRec(root, lmld, new ArrayList<TreeNode>(), postorderIDs);
    return lmld;
}
/**
 * Recursively finds the leftmost leaf descendants for all nodes in the tree.
 *
 * @param current the current node being processed
 * @param ref the array (indexed by postorder ID) in which to store leftmost leaf descendants
 * @param chain the inner nodes on the current leftmost branch, i.e. the nodes that will share the next
 *              leaf reached by always descending into the first child
 * @param postorderIDs a mapping of tree nodes to postorder IDs
 */
private static void leftmostLeafDescendantsRec(TreeNode current, TreeNode[] ref, List<TreeNode> chain,
                                               Map<TreeNode, Integer> postorderIDs) {
    if (current.getChildren().isEmpty()) {
        // leftmost descendant of a leaf is the leaf itself
        ref[postorderIDs.get(current)] = current;
        // every node on the chain reaches this leaf through its leftmost branch
        for (TreeNode ancestor : chain) {
            ref[postorderIDs.get(ancestor)] = current;
        }
        return;
    }

    chain.add(current);
    boolean leftmostChild = true;
    for (TreeNode child : current.getChildren()) {
        // only the first child continues the current leftmost branch; every other child starts a new one
        List<TreeNode> childChain = leftmostChild ? chain : new ArrayList<TreeNode>();
        leftmostLeafDescendantsRec(child, ref, childChain, postorderIDs);
        leftmostChild = false;
    }
}
/**
 * Returns an ordered list of keyroot nodes in the tree. A keyroot node is a node which either has a left
 * sibling or is the root of the tree. The keyroot nodes are ordered according to their postorder IDs.
 *
 * @param root the root node to start the search from
 * @param postorderIDs a mapping of tree nodes to postorder IDs
 * @return a list of keyroot nodes, sorted ascending by postorder ID
 */
public static List<TreeNode> getKeyroots(TreeNode root, Map<TreeNode, Integer> postorderIDs) {
    List<TreeNode> keyroots = new ArrayList<>();
    keyrootsRec(root, keyroots, new ArrayList<TreeNode>());
    // the recursion discovers keyroots in leaf order; callers need them in postorder ID order
    Collections.sort(keyroots, new PostorderComparator(postorderIDs));
    return keyroots;
}
/**
 * A comparator which sorts {@link TreeNode} objects ascending by their postorder IDs, as given by a
 * mapping. Both compared nodes must be present in the mapping.
 */
private static class PostorderComparator implements Comparator<TreeNode> {
    private final Map<TreeNode, Integer> postorderIDs;

    /**
     * @param postorderIDs a mapping of tree nodes to postorder IDs, consulted on every comparison
     */
    public PostorderComparator(Map<TreeNode, Integer> postorderIDs) {
        this.postorderIDs = postorderIDs;
    }

    @Override
    public int compare(TreeNode t1, TreeNode t2) {
        // Integer.compare instead of subtraction: same ordering, no reliance on the ID range
        return Integer.compare(this.postorderIDs.get(t1), this.postorderIDs.get(t2));
    }
}
/**
 * Recursively finds the keyroots starting from {@code current}. One keyroot is recorded per leaf: the
 * topmost node of the leftmost branch that ends in that leaf.
 *
 * @param current the current node being processed
 * @param ref the list in which to store keyroot nodes
 * @param chain the inner nodes on the current leftmost branch, topmost first
 */
private static void keyrootsRec(TreeNode current, List<TreeNode> ref, List<TreeNode> chain) {
    if (current.getChildren().isEmpty()) {
        // the first node in the chain is the keyroot; with an empty chain the leaf is its own keyroot
        ref.add(chain.isEmpty() ? current : chain.get(0));
        return;
    }

    chain.add(current);
    boolean leftmostChild = true;
    for (TreeNode child : current.getChildren()) {
        // only the first child continues the current leftmost branch; every other child starts a new one
        List<TreeNode> childChain = leftmostChild ? chain : new ArrayList<TreeNode>();
        keyrootsRec(child, ref, childChain);
        leftmostChild = false;
    }
}
/**
 * Computes the ordered tree distance between {@code t1} and {@code t2}. "Ordered" means that the order
 * of siblings matters. The result is the minimal sum of costs over all series of tree transformations
 * that turn one tree into the other.
 *
 * For further information see the paper by K. Zhang et al.:
 * Simple fast algorithms for the editing distance between trees and related problems
 *
 * @param t1 the first tree structure
 * @param t2 the second tree structure
 * @return the tree distance between {@code t1} and {@code t2}
 * @throws IllegalArgumentException if {@code t1} or {@code t2} is {@code null}.
 */
public static int treeDistanceZhangShasha(TreeNode t1, TreeNode t2) {
    // a null transformation list asks the shared implementation for the cost only
    final int distance = treeDistanceZhangShasha(t1, t2, null);
    return distance;
}
/**
 * Calculates the tree distance between tree {@code t1} and {@code t2}, taking into account that both
 * trees are ordered i.e. the order of siblings is important.
 * Returns a list of tree transformations required to transform tree {@code t1} to {@code t2}.
 * Every transformation has an associated cost. The sum of costs of all transformations is the tree distance
 * between {@code t1} and {@code t2}. The sum of costs is minimal.
 *
 * The list of transformations should be applied in the order returned, as according to K. Zhang et al.:
 * "To construct the sequence of editing operations, simply perform all the deletes indicated by the
 * mapping (i.e., all nodes in T having no lines attached to them are deleted), then all relabellings, then
 * all inserts."
 *
 * For further information see the paper by K. Zhang et al.:
 * Simple fast algorithms for the editing distance between trees and related problems
 *
 * @param t1 the first tree structure
 * @param t2 the second tree structure
 * @return a list of tree transformations required to transform the first tree into the second
 * @throws IllegalArgumentException if {@code t1} or {@code t2} is {@code null}.
 */
public static List<TreeTransformation> treeDistanceZhangShasha(EditableTreeNode t1, EditableTreeNode t2) {
    List<TreeTransformation> transformations = new ArrayList<>();
    // the three-argument overload fills (and sorts) the list; its int result is not needed here
    treeDistanceZhangShasha(t1, t2, transformations);
    return transformations;
}
/**
 * Shared implementation behind both public {@code treeDistanceZhangShasha} overloads.
 *
 * @param t1 the first tree structure
 * @param t2 the second tree structure
 * @param transformations if not {@code null}, receives the transformations that turn {@code t1} into
 *                        {@code t2}, sorted by their natural ordering; if {@code null}, only the cost is
 *                        computed
 * @return the tree distance between {@code t1} and {@code t2}
 * @throws IllegalArgumentException if {@code t1} or {@code t2} is {@code null}.
 */
private static int treeDistanceZhangShasha(TreeNode t1, TreeNode t2, List<TreeTransformation> transformations) {
    if (t1 == null || t2 == null) {
        throw new IllegalArgumentException("Both tree structures must not be null");
    }

    // prepare postorder numbering
    ReversibleIdentityMap<TreeNode, Integer> postorder1 = getPostorderIdentifiers(t1);
    ReversibleIdentityMap<TreeNode, Integer> postorder2 = getPostorderIdentifiers(t2);

    // prepare leftmost leaf descendants
    TreeNode[] lmld1 = leftmostLeafDescendants(t1, postorder1);
    TreeNode[] lmld2 = leftmostLeafDescendants(t2, postorder2);

    // prepare keyroots
    List<TreeNode> keyRoots1 = getKeyroots(t1, postorder1);
    List<TreeNode> keyRoots2 = getKeyroots(t2, postorder2);

    // tree distance table, indexed [postorder ID in t2][postorder ID in t1]; the roots hold the last IDs
    int rootId1 = postorder1.get(t1);
    int rootId2 = postorder2.get(t2);
    ForestTrail[][] treeDistance = new ForestTrail[rootId2 + 1][rootId1 + 1];

    // calculate tree distance: every pair of keyroots, in ascending postorder, so that the entries a
    // later pair depends on have already been filled in
    for (TreeNode keyRoot1 : keyRoots1) {
        for (TreeNode keyRoot2 : keyRoots2) {
            forestDistance(keyRoot1, keyRoot2, lmld1, lmld2, postorder1, postorder2, treeDistance);
        }
    }

    ForestTrail rootTrail = treeDistance[rootId2][rootId1];
    if (transformations != null) {
        applyForestTrails(rootTrail, transformations, new IdentityHashMap<TreeNode, TreeNode>());
        Collections.sort(transformations);
    }
    return rootTrail.getTotalCost();
}
/**
 * Transforms a series of {@link ForestTrail} objects to a list of {@link TreeTransformation} objects,
 * stored in {@code ref}. This is needed as {@link ForestTrail} objects are used internally for storing
 * forest distances, while {@link TreeTransformation} objects are public.
 *
 * @param current the current {@link ForestTrail} object in the series
 * @param ref the list in which to store {@link TreeTransformation} objects
 * @param matchedNodes a mapping from nodes of the target tree to either the node they were matched with
 *                     (rename) or the clone created for them (insert); filled in while backtracking
 */
private static void applyForestTrails(ForestTrail current, List<TreeTransformation> ref,
                                      IdentityHashMap<TreeNode, TreeNode> matchedNodes) {
    // a trail without a next state is the terminal "both forests empty" state: nothing to emit
    if (current.nextState == null) {
        return;
    }

    if (current.treeState != null) {
        // this trail only links a remaining forest with a separately solved subtree pair; it carries
        // no operation of its own
        applyForestTrails(current.nextState, ref, matchedNodes);
        applyForestTrails(current.treeState, ref, matchedNodes);
        return;
    }

    TreeTransformation t;
    switch (current.operation) {
        case OP_INSERT_NODE:
            // the inserted node is a clone, so the tree that current.first belongs to is not modified
            TreeNode clone = ((EditableTreeNode) current.first).cloneNode();
            matchedNodes.put(current.first, clone);
            if (current.second != null) {
                // current.second is the parent of current.first; insert below that parent's counterpart
                t = new TreeTransformation(current.operation, current.cost, clone,
                        matchedNodes.get(current.second));
                t.setPosition(current.first.getParent().positionOfChild(current.first));
                t.setChildrenCount(current.second.getChildren().size());
            } else {
                // no parent: the inserted node becomes a new root
                t = new TreeTransformation(current.operation, current.cost, clone);
            }
            break;
        case OP_DELETE_NODE:
            t = new TreeTransformation(current.operation, current.cost, current.first);
            break;
        default:
            // rename: remember which existing node stands in for current.second
            t = new TreeTransformation(current.operation, current.cost, current.first, current.second);
            matchedNodes.put(current.second, current.first);
    }
    ref.add(t);

    applyForestTrails(current.nextState, ref, matchedNodes);

    if (current.operation == TreeOperation.OP_INSERT_NODE) {
        // collected only after the rest of the trail was processed, so that matchedNodes already
        // contains the entries added by the later trail states
        List<TreeNode> descendants = new ArrayList<>();
        populateDescendants(current.first, matchedNodes, descendants);
        t.setDescendants(descendants);
    }
}
/**
 * Collects, for all descendants of tree node {@code cur} that have an entry in {@code map}, the node they
 * are mapped to. Descendants without an entry are skipped, but their own descendants are still visited.
 *
 * @param cur the current tree node being visited
 * @param map a mapping of tree nodes to their counterparts (clones or matched nodes)
 * @param ref the list in which to store the mapped descendants, in preorder
 */
private static void populateDescendants(TreeNode cur, IdentityHashMap<TreeNode, TreeNode> map,
                                        List<TreeNode> ref) {
    for (TreeNode child : cur.getChildren()) {
        if (map.containsKey(child)) {
            ref.add(map.get(child));
        }
        populateDescendants(child, map, ref);
    }
}
/**
 * One cell of the forest distance table, encoded as a single transformation plus links to the cells it
 * was derived from. Following the links backwards yields the series of transformations needed to
 * transform one tree into another.
 */
private static class ForestTrail {
    private TreeOperation operation;
    private int cost;
    private ForestTrail nextState;
    private ForestTrail treeState;
    private TreeNode first;
    private TreeNode second;

    /**
     * Creates the final forest trail state - the state where both trees are empty. It has no operation,
     * no nodes, no linked states and a cost of zero.
     */
    public ForestTrail() {
        // nothing to set: every field keeps its default value, which leaves the cost at 0
    }

    /**
     * Creates a trail for an operation that involves a single node.
     */
    public ForestTrail(TreeOperation operation, TreeNode first) {
        this.first = first;
        this.operation = operation;
        this.cost = first.getTransformationCost(operation, null);
    }

    /**
     * Creates a trail for an operation that involves two nodes.
     */
    public ForestTrail(TreeOperation operation, TreeNode first, TreeNode second) {
        this.first = first;
        this.second = second;
        this.operation = operation;
        this.cost = first.getTransformationCost(operation, second);
    }

    /**
     * Sums the cost of this trail and, recursively, of every state linked to it.
     */
    public int getTotalCost() {
        int total = this.cost;
        if (this.nextState != null) {
            total += this.nextState.getTotalCost();
        }
        if (this.treeState != null) {
            total += this.treeState.getTotalCost();
        }
        return total;
    }
}
/**
 * Fills the forest distance table for the subtrees rooted at {@code keyRoot1} and {@code keyRoot2} and
 * stores into {@code treeDist} the trail of every node pair whose forests are whole trees, i.e. whose
 * leftmost leaf descendants are those of the two keyroots.
 *
 * Tables are indexed {@code [second tree][first tree]}. In the local table, row/column 0 stands for the
 * empty forest and row/column {@code n} for the forest ending at postorder ID {@code leftmost + n - 1}.
 *
 * @param keyRoot1 keyroot in the first tree
 * @param keyRoot2 keyroot in the second tree
 * @param lmld1 leftmost leaf descendants of the first tree, indexed by postorder ID
 * @param lmld2 leftmost leaf descendants of the second tree, indexed by postorder ID
 * @param postorder1 postorder IDs of the first tree
 * @param postorder2 postorder IDs of the second tree
 * @param treeDist the tree distance table; read for already solved subtree pairs, written for new ones
 */
private static void forestDistance(TreeNode keyRoot1, TreeNode keyRoot2, TreeNode[] lmld1, TreeNode[] lmld2,
                                   ReversibleIdentityMap<TreeNode, Integer> postorder1,
                                   ReversibleIdentityMap<TreeNode, Integer> postorder2,
                                   ForestTrail[][] treeDist) {
    int kr1 = postorder1.get(keyRoot1);
    int kr2 = postorder2.get(keyRoot2);
    int lm1 = postorder1.get(lmld1[kr1]);
    int lm2 = postorder2.get(lmld2[kr2]);
    // number of nodes in the subtree, plus one slot for the empty forest
    int bound1 = kr1 - lm1 + 2;
    int bound2 = kr2 - lm2 + 2;

    // initialize forest distance table
    ForestTrail[][] forestDistance = new ForestTrail[bound2][bound1];
    forestDistance[0][0] = new ForestTrail();

    // first column: the first forest is empty, so every node of the second forest is inserted
    for (int i = 1, id2 = lm2; i < bound2; i++, id2++) {
        TreeNode t = postorder2.getInverse(id2);
        forestDistance[i][0] = new ForestTrail(TreeOperation.OP_INSERT_NODE, t, t.getParent());
        forestDistance[i][0].nextState = forestDistance[i - 1][0];
    }

    // first row: the second forest is empty, so every node of the first forest is deleted
    for (int j = 1, id1 = lm1; j < bound1; j++, id1++) {
        TreeNode t = postorder1.getInverse(id1);
        forestDistance[0][j] = new ForestTrail(TreeOperation.OP_DELETE_NODE, t);
        forestDistance[0][j].nextState = forestDistance[0][j - 1];
        // prevent removing the root node
        if (t.getParent() == null) {
            forestDistance[0][j].cost = HIGH_COST;
        }
    }

    // fill in the rest of forest distances
    for (int k = lm1, j = 1; k <= kr1; k++, j++) {
        TreeNode first = postorder1.getInverse(k);
        int firstLeftmost = postorder1.get(lmld1[k]);

        for (int l = lm2, i = 1; l <= kr2; l++, i++) {
            TreeNode second = postorder2.getInverse(l);
            int secondLeftmost = postorder2.get(lmld2[l]);

            ForestTrail insert = new ForestTrail(TreeOperation.OP_INSERT_NODE, second, second.getParent());
            insert.nextState = forestDistance[i - 1][j];

            ForestTrail delete = new ForestTrail(TreeOperation.OP_DELETE_NODE, first);
            delete.nextState = forestDistance[i][j - 1];
            // prevent removing the root node
            if (first.getParent() == null) {
                delete.cost = HIGH_COST;
            }

            ForestTrail rename = new ForestTrail(TreeOperation.OP_RENAME_NODE, first, second);
            // both forests are whole trees when they start at the keyroots' leftmost leaves
            boolean trees = firstLeftmost == lm1 && secondLeftmost == lm2;
            if (trees) {
                rename.nextState = forestDistance[i - 1][j - 1];
            } else {
                // Match the subtree rooted at 'first' against the subtree rooted at 'second' using the
                // already solved treeDist entry, plus the forests to the left of both subtrees. That
                // entry's total already covers the two subtree roots, and applyForestTrails emits no
                // operation for a trail with a treeState, so this link must not add a rename cost of
                // its own (doing so counted the rename twice and inflated the distance).
                rename.cost = 0;
                rename.treeState = treeDist[l][k];
                rename.nextState = forestDistance[secondLeftmost - lm2][firstLeftmost - lm1];
            }

            // getTotalCost() walks the whole trail, so evaluate each candidate exactly once
            int insertTotal = insert.getTotalCost();
            int deleteTotal = delete.getTotalCost();
            int renameTotal = rename.getTotalCost();

            // ties are resolved in favour of insert, then delete, then rename
            ForestTrail best;
            if (insertTotal <= deleteTotal && insertTotal <= renameTotal) {
                best = insert;
            } else if (deleteTotal <= renameTotal) {
                best = delete;
            } else {
                best = rename;
            }
            forestDistance[i][j] = best;

            if (trees) {
                treeDist[l][k] = best;
            }
        }
    }
}
/**
 * Transforms the tree given by the root node {@code root} using a list of {@code transformations}
 * obtained by the call to
 * {@link TreeDistance#treeDistanceZhangShasha(EditableTreeNode, EditableTreeNode)}. This operation does
 * not produce a copy of the original tree, but makes all modifications in-place.
 *
 * @param root the root of the tree being transformed
 * @param transformations a list of tree operations which will transform the tree, applied in list order
 * @return the root of the transformed tree; differs from {@code root} when a new root was inserted
 */
public static EditableTreeNode transformTree(EditableTreeNode root, List<TreeTransformation> transformations) {
    for (TreeTransformation t : transformations) {
        switch (t.getOperation()) {
            case OP_INSERT_NODE:
                if (t.getSecondNode() == null) {
                    // insert a new root node: the old root becomes its only child
                    EditableTreeNode inserted = (EditableTreeNode) t.getFirstNode();
                    inserted.addChildAt(root, 0);
                    root.setParent(inserted);
                    root = inserted;
                } else {
                    // insert a child and make demoted siblings its new children
                    EditableTreeNode parent = (EditableTreeNode) t.getSecondNode();
                    EditableTreeNode inserted = (EditableTreeNode) t.getFirstNode();

                    // children are moved under 'inserted' right away but removed from 'parent' only
                    // after the loop, so the child list is not modified while it is being iterated
                    List<TreeNode> toRemove = new ArrayList<>();
                    for (TreeNode child : parent.getChildren()) {
                        for (TreeNode desc : t.getDescendants()) {
                            if (desc == child) {
                                toRemove.add(child);
                                inserted.addChildAt(child, inserted.getChildren().size());
                                ((EditableTreeNode) child).setParent(inserted);
                            }
                        }
                    }
                    for (TreeNode child : toRemove) {
                        parent.deleteChild(child);
                    }

                    parent.addChildAt(inserted, Math.max(0,
                            parent.getChildren().size() - t.getChildrenCount() + 1 + t.getPosition()));
                    inserted.setParent(parent);
                }
                break;
            case OP_DELETE_NODE:
                // delete node from the tree, promoting its children
                TreeNode deleted = t.getFirstNode();
                EditableTreeNode formerParent = (EditableTreeNode) deleted.getParent();
                int position = formerParent.positionOfChild(deleted);
                // iterate backwards and always insert at the same position, which keeps the promoted
                // children in their original order
                for (int i = deleted.getChildren().size() - 1; i >= 0; i--) {
                    formerParent.addChildAt(deleted.getChildren().get(i), position);
                    ((EditableTreeNode) deleted.getChildren().get(i)).setParent(formerParent);
                }
                formerParent.deleteChild(deleted);
                break;
            default:
                // every remaining operation is treated as a rename of the first node to the second
                EditableTreeNode first = (EditableTreeNode) t.getFirstNode(),
                        second = (EditableTreeNode) t.getSecondNode();
                first.renameNodeTo(second);
        }
    }
    return root;
}
}