All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jgrapht.alg.similarity.ZhangShashaTreeEditDistance Maven / Gradle / Ivy

The newest version!
/*
 * (C) Copyright 2020-2023, by Semen Chudakov and Contributors.
 *
 * JGraphT : a free Java graph-theory library
 *
 * See the CONTRIBUTORS.md file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0, or the
 * GNU Lesser General Public License v2.1 or later
 * which is available at
 * http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html.
 *
 * SPDX-License-Identifier: EPL-2.0 OR LGPL-2.1-or-later
 */
package org.jgrapht.alg.similarity;

import org.jgrapht.Graph;
import org.jgrapht.GraphTests;
import org.jgrapht.Graphs;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.function.ToDoubleBiFunction;
import java.util.function.ToDoubleFunction;

/**
 * Dynamic programming algorithm for computing edit distance between trees.
 *
 * 

* The algorithm is originally described in Zhang, Kaizhong & Shasha, Dennis. (1989). Simple * Fast Algorithms for the Editing Distance Between Trees and Related Problems. SIAM J. Comput.. 18. * 1245-1262. 10.1137/0218082. * *

* The time complexity of the algorithm is $O(|T_1|\cdot|T_2|\cdot min(depth(T_1),leaves(T_1)) \cdot * min(depth(T_2),leaves(T_2)))$. Space complexity is $O(|T_1|\cdot |T_2|)$, where $|T_1|$ and * $|T_2|$ denote number of vertices in trees $T_1$ and $T_2$ correspondingly, $leaves()$ function * returns number of leaf vertices in a tree. * * *

* The tree edit distance problem is defined in a following way. Consider $2$ trees $T_1$ and $T_2$ * with root vertices $r_1$ and $r_2$ correspondingly. For those trees there are 3 elementary * modification actions: * *

    *
  • Remove a vertex $v$ from $T_1$.
  • *
  • Insert a vertex $v$ into $T_2$.
  • *
  • Change vertex $v_1$ in $T_1$ to vertex $v_2$ in $T_2$.
  • *
* * The algorithm assigns a cost to each of those operations which also depends on the vertices. The * problem is then to compute a sequence of edit operations which transforms $T_1$ into $T_2$ and * has a minimum cost over all such sequences. Here the cost of a sequence of edit operations is * defined as sum of costs of individual operations. * *

* The algorithm is based on a dynamic programming principle and assigns a label to each vertex in * the trees which is equal to its index in post-oder traversal. It also uses a notion of a keyroot * which is defined as a vertex in a tree which has a left sibling. Additionally a special $l()$ * function is introduced with returns for every vertex the index of its leftmost child wrt the * post-order traversal in the tree. * *

* Solving the tree edit problem distance is divided into computing edit distance for every pair of * subtrees rooted at vertices $v_1$ and $v_2$ where $v_1$ is a keyroot in the first tree and $v_2$ * is a keyroot in the second tree. * * @param graph vertex type * @param graph edge type * @author Semen Chudakov */ public class ZhangShashaTreeEditDistance { /** * First tree for which the distance is computed by this algorithm. */ private Graph tree1; /** * Root vertex of the {@code tree1}. */ private V root1; /** * Second tree for which the distance is computed by this algorithm. */ private Graph tree2; /** * Root vertex of the {@code tree2}. */ private V root2; /** * Function which computes cost of inserting a particular vertex into {@code tree2}. */ private ToDoubleFunction insertCost; /** * Function which computes cost of removing a particular vertex from {2code tree1}. */ private ToDoubleFunction removeCost; /** * Function which computes cost of changing a vertex $v1$ in {@code tree1} to vertex $v2$ in * {@code tree2}. */ private ToDoubleBiFunction changeCost; /** * Array with edit distances between subtrees of {@code tree1} and {@code tree2}. Formally, * $treeDistances[i][j]$ stores edit distance between subtree of {@code tree1} rooted at vertex * $i+1$ and subtree of {@code tree2} rooted at vertex $j+1$, where $i$ and $j$ are vertex * indices from the corresponding tree orderings. */ private double[][] treeDistances; /** * Array with lists of edit operations which transform subtrees of {@code tree1} into subtrees * {@code tree2}. Formally, editOperationLists[i][j]$ stores a list of edit operations which * transform subtree {@code tree1} rooted at vertex $i$ into subtree of {@code tree2} rooted at * vertex $j$, where $i$ and $j$ are vertex indices from the corresponding tree orderings. */ private List>>> editOperationLists; /** * Helper field which indicates whether the algorithm has already been executed for * {@code tree1} and {@code tree2}. */ private boolean algorithmExecuted; /** * Constructs an instance of the algorithm for the given {@code tree1}, {@code root1}, * {@code tree2} and {@code root2}. This constructor sets following default values for the * distance functions. The {@code insertCost} and {@code removeCost} always return $1.0$, the * {@code changeCost} return $0.0$ if vertices are equal and {@code 1.0} otherwise. * * @param tree1 a tree * @param root1 root vertex of {@code tree1} * @param tree2 a tree * @param root2 root vertex of {@code tree2} */ public ZhangShashaTreeEditDistance(Graph tree1, V root1, Graph tree2, V root2) { this(tree1, root1, tree2, root2, v -> 1.0, v -> 1.0, (v1, v2) -> { if (v1.equals(v2)) { return 0.0; } return 1.0; }); } /** * Constructs an instance of the algorithm for the given {@code tree1}, {@code root1}, * {@code tree2}, {@code root2}, {@code insertCost}, {@code removeCost} and {@code changeCost}. * * @param tree1 a tree * @param root1 root vertex of {@code tree1} * @param tree2 a tree * @param root2 root vertex of {@code tree2} * @param insertCost cost function for inserting a node into {@code tree1} * @param removeCost cost function for removing a node from {@code tree2} * @param changeCost cost function of changing a node in {@code tree1} to a node in * {@code tree2} */ public ZhangShashaTreeEditDistance( Graph tree1, V root1, Graph tree2, V root2, ToDoubleFunction insertCost, ToDoubleFunction removeCost, ToDoubleBiFunction changeCost) { this.tree1 = Objects.requireNonNull(tree1, "graph1 cannot be null!"); this.root1 = Objects.requireNonNull(root1, "root1 cannot be null!"); this.tree2 = Objects.requireNonNull(tree2, "graph2 cannot be null!"); this.root2 = Objects.requireNonNull(root2, "root2 cannot be null!"); this.insertCost = Objects.requireNonNull(insertCost, "insertCost cannot be null!"); this.removeCost = Objects.requireNonNull(removeCost, "removeCost cannot be null!"); this.changeCost = Objects.requireNonNull(changeCost, "changeCost cannot be null!"); if (!GraphTests.isTree(tree1)) { throw new IllegalArgumentException("graph1 must be a tree!"); } if (!GraphTests.isTree(tree2)) { throw new IllegalArgumentException("graph2 must be a tree!"); } int m = tree1.vertexSet().size(); int n = tree2.vertexSet().size(); treeDistances = new double[m][n]; editOperationLists = new ArrayList<>(m); for (int i = 0; i < m; ++i) { editOperationLists.add(new ArrayList<>(Collections.nCopies(n, null))); } } /** * Computes edit distance for {@code tree1} and {@code tree2}. * * @return edit distance between {@code tree1} and {@code tree2} */ public double getDistance() { lazyRunAlgorithm(); int m = tree1.vertexSet().size(); int n = tree2.vertexSet().size(); return treeDistances[m - 1][n - 1]; } /** * Computes a list of edit operations which transform {@code tree1} into {@code tree2}. * * @return list of edit operations */ public List> getEditOperationLists() { lazyRunAlgorithm(); int m = tree1.vertexSet().size(); int n = tree2.vertexSet().size(); return Collections.unmodifiableList(editOperationLists.get(m - 1).get(n - 1)); } /** * Performs lazy computations of this algorithm and stores cached data in {@code treeDistances} * and {@code editOperationList}. */ private void lazyRunAlgorithm() { if (!algorithmExecuted) { TreeOrdering ordering1 = new TreeOrdering(tree1, root1); TreeOrdering ordering2 = new TreeOrdering(tree2, root2); for (int keyroot1 : ordering1.keyroots) { for (Integer keyroot2 : ordering2.keyroots) { treeDistance(keyroot1, keyroot2, ordering1, ordering2); } } algorithmExecuted = true; } } /** * Computes edit distance and list of edit operations for vertex $v1$ from {@code tree1} which * has tree ordering index equal to $i$ and vertex $v2$ from {@code tree2} which has tree * ordering index equal to $j$. Both $v1$ and $v2$ must be keyroots in the corresponding trees. * * @param i ordering index of a keyroot in {@code tree1} * @param j ordering index of a keywoot in {@code tree2} * @param ordering1 ordering of {@code tree1} * @param ordering2 ordering of {@code tree2} */ private void treeDistance(int i, int j, TreeOrdering ordering1, TreeOrdering ordering2) { int li = ordering1.indexToLValueList.get(i); int lj = ordering2.indexToLValueList.get(j); int m = i - li + 2; int n = j - lj + 2; double[][] forestdist = new double[m][n]; List> cachedOperations = new ArrayList<>(m); for (int k = 0; k < m; ++k) { cachedOperations.add(new ArrayList<>(Collections.nCopies(n, null))); } int iOffset = li - 1; int jOffset = lj - 1; for (int i1 = li; i1 <= i; ++i1) { V i1Vertex = ordering1.indexToVertexList.get(i1); int iIndex = i1 - iOffset; forestdist[iIndex][0] = forestdist[iIndex - 1][0] + removeCost.applyAsDouble(i1Vertex); CacheEntry entry = new CacheEntry( iIndex - 1, 0, new EditOperation<>(OperationType.REMOVE, i1Vertex, null)); cachedOperations.get(iIndex).set(0, entry); } for (int j1 = lj; j1 <= j; ++j1) { V j1Vertex = ordering2.indexToVertexList.get(j1); int jIndex = j1 - jOffset; forestdist[0][jIndex] = forestdist[0][jIndex - 1] + removeCost.applyAsDouble(j1Vertex); CacheEntry entry = new CacheEntry( 0, jIndex - 1, new EditOperation<>(OperationType.INSERT, j1Vertex, null)); cachedOperations.get(0).set(jIndex, entry); } for (int i1 = li; i1 <= i; ++i1) { V i1Vertex = ordering1.indexToVertexList.get(i1); int li1 = ordering1.indexToLValueList.get(i1); for (int j1 = lj; j1 <= j; ++j1) { V j1Vertex = ordering2.indexToVertexList.get(j1); int lj1 = ordering2.indexToLValueList.get(j1); int iIndex = i1 - iOffset; int jIndex = j1 - jOffset; if (li1 == li && lj1 == lj) { double dist1 = forestdist[iIndex - 1][jIndex] + removeCost.applyAsDouble(i1Vertex); double dist2 = forestdist[iIndex][jIndex - 1] + insertCost.applyAsDouble(j1Vertex); double dist3 = forestdist[iIndex - 1][jIndex - 1] + changeCost.applyAsDouble(i1Vertex, j1Vertex); double result = Math.min(dist1, Math.min(dist2, dist3)); CacheEntry entry; if (result == dist1) { // remove operation entry = new CacheEntry( iIndex - 1, jIndex, new EditOperation<>(OperationType.REMOVE, i1Vertex, null)); } else if (result == dist2) { // insert operation entry = new CacheEntry( iIndex, jIndex - 1, new EditOperation<>(OperationType.INSERT, j1Vertex, null)); } else { // result == dist3 => change operation entry = new CacheEntry( iIndex - 1, jIndex - 1, new EditOperation<>(OperationType.CHANGE, i1Vertex, j1Vertex)); } cachedOperations.get(iIndex).set(jIndex, entry); forestdist[iIndex][jIndex] = result; treeDistances[i1 - 1][j1 - 1] = result; editOperationLists.get(i1 - 1).set( j1 - 1, restoreOperationsList(cachedOperations, iIndex, jIndex)); } else { int i2 = li1 - 1 - iOffset; int j2 = lj1 - 1 - jOffset; double dist1 = forestdist[iIndex - 1][jIndex] + removeCost.applyAsDouble(i1Vertex); double dist2 = forestdist[iIndex][jIndex - 1] + insertCost.applyAsDouble(j1Vertex); double dist3 = forestdist[i2][j2] + treeDistances[i1 - 1][j1 - 1]; double result = Math.min(dist1, Math.min(dist2, dist3)); forestdist[iIndex][jIndex] = result; CacheEntry entry; if (result == dist1) { entry = new CacheEntry( iIndex - 1, jIndex, new EditOperation<>(OperationType.REMOVE, i1Vertex, null)); } else if (result == dist2) { entry = new CacheEntry( iIndex, jIndex - 1, new EditOperation<>(OperationType.INSERT, j1Vertex, null)); } else { entry = new CacheEntry(i2, j2, null); entry.treeDistanceI = i1 - 1; entry.treeDistanceJ = j1 - 1; } cachedOperations.get(iIndex).set(jIndex, entry); } } } } /** * Restores list of edit operations which have been cached in {@code cachedOperations} during * the edit distance computation. Starting from a cache entry at index $(i,j)$. * * @param cachedOperations 2-dimensional list with cached operations * @param i starting operation index * @param j starting operation index * @return list of edit operations */ private List> restoreOperationsList( List> cachedOperations, int i, int j) { List> result = new ArrayList<>(); CacheEntry it = cachedOperations.get(i).get(j); while (it != null) { if (it.editOperation == null) { result.addAll(editOperationLists.get(it.treeDistanceI).get(it.treeDistanceJ)); } else { result.add(it.editOperation); } it = cachedOperations.get(it.cachePreviousPosI).get(it.cachePreviousPosJ); } return result; } /** * Auxiliary class which for computes keyroot vertices, tree ordering and $l()$ function for a * particular tree. * *

* A keyroot of a tree is a vertex which has a left sibling. Ordering of a tree assings an * integer index to every its vertex. Indices are assigned using post-order traversal. $l()$ * function for every vertex in a tree returns ordering index of its leftmost child. For leaf * vertex the function returns its own ordering index. */ private class TreeOrdering { /** * Underlying tree of this ordering. */ final Graph tree; /** * Root vertex of {@code tree}. */ final V treeRoot; /** * List of keyroots of {@code tree}. */ List keyroots; /** * List which at very position $i$ stores a vertex from {@code tree} which has ordering * index equal to $i$. */ List indexToVertexList; /** * List which at every position $i$ stores value of $l()$ function for a vertex from * {@code tree} whihc has ordering index equal to $i$. */ List indexToLValueList; /** * Ordering index to be assigned to the next traversed vertex. */ int currentIndex; /** * Constructs an instance of the tree ordering for the given {@code graph} and * {@code treeRoot}. * * @param tree a tree * @param treeRoot root vertex of {@code tree} */ public TreeOrdering(Graph tree, V treeRoot) { this.tree = tree; this.treeRoot = treeRoot; int numberOfVertices = tree.vertexSet().size(); keyroots = new ArrayList<>(); indexToVertexList = new ArrayList<>(Collections.nCopies(numberOfVertices + 1, null)); indexToLValueList = new ArrayList<>(Collections.nCopies(numberOfVertices + 1, null)); currentIndex = 1; computeKeyrootsAndMapping(treeRoot); } /** * Runs post-order DFS on {@code tree} starting at {@code treeRoot}. Assigns consecutive * integer index to every traversed vertex and computes keyroots for {@code tree}. * * @param treeRoot root vertex of {@code tree} */ private void computeKeyrootsAndMapping(V treeRoot) { List stack = new ArrayList<>(); stack.add(new StackEntry(treeRoot, true)); while (!stack.isEmpty()) { StackEntry entry = stack.get(stack.size() - 1); if (entry.state == 0) { if (stack.size() > 1) { entry.vParent = stack.get(stack.size() - 2).v; } entry.vChildIterator = Graphs.successorListOf(tree, entry.v).iterator(); entry.state = 1; } else if (entry.state == 1) { if (entry.vChildIterator.hasNext()) { entry.vChild = entry.vChildIterator.next(); if (entry.vParent == null || !entry.vChild.equals(entry.vParent)) { stack.add(new StackEntry(entry.vChild, entry.isKeyrootArg)); entry.state = 2; } } else { entry.state = 3; } } else if (entry.state == 2) { entry.isKeyrootArg = true; if (entry.lValue == -1) { entry.lValue = entry.lVChild; } entry.state = 1; } else if (entry.state == 3) { if (entry.lValue == -1) { entry.lValue = currentIndex; } if (entry.isKeyroot) { keyroots.add(currentIndex); } indexToVertexList.set(currentIndex, entry.v); indexToLValueList.set(currentIndex, entry.lValue); ++currentIndex; if (stack.size() > 1) { stack.get(stack.size() - 2).lVChild = entry.lValue; } stack.remove(stack.size() - 1); } } } /** * Auxiliary class which stores all needed variables to emulate recursive execution of DFS * algorithm in {@code computeKeyrootsAndMapping()} method. */ private class StackEntry { /** * A vertex from {@code tree}. */ V v; /** * Indites if {@code v} is a keyroot wrt {@code tree}. */ boolean isKeyroot; /** * Parent vertex of {@code v} in {@code tree} or $null$ if {@code v} is root of * {@code tree}. */ V vParent; /** * Indicates if the next vertex returned by {@code vChildIterator} will be a keyroot. */ boolean isKeyrootArg; /** * Value of the $l()$ function for {@code v}; */ int lValue; /** * Iterates over children of $v$ in {@code tree}. */ Iterator vChildIterator; /** * Current child vertex of {@code v}. */ V vChild; /** * Value of $l()$ function for {@code vChild}. */ int lVChild; /** * Auxiliary field which helps to identify which part of the recursive procedure should * be executed next for this stack entry. */ int state; /** * Constructs an instance of the stack entry for the given {@code v} and * {@code isKeyroot} * * @param v a vertex from {@code tree} * @param isKeyroot true iff {@code v} is a keyroot */ public StackEntry(V v, boolean isKeyroot) { this.v = v; this.isKeyroot = isKeyroot; this.lValue = -1; } } } /** * Represents elementary action which changes the structure of a tree. * * @param tree vertex type */ public static class EditOperation { /** * Type of this operation. */ private final OperationType type; /** * Vertex of a tree which is the first operand of this operations. */ private final V firstOperand; /** * Vertex of a tree which is a second operand of this operation. For * {@code OperationsType.INSERT} and {@code OperationsType.REMOVE} this field is null. */ private final V secondOperand; /** * Returns type of this operation. * * @return oeration type */ public OperationType getType() { return type; } /** * Returns first operand of this operation * * @return first operand */ public V getFirstOperand() { return firstOperand; } /** * Returns second operand of this operation. * * @return second operand */ public V getSecondOperand() { return secondOperand; } /** * Constructs an instance of edit operation for the given {@code type}, {@code firstOperand} * and {@code secondOperand}. * * @param type type of the operation * @param firstOperand first operand of the operation * @param secondOperand second operand of the operation */ public EditOperation(OperationType type, V firstOperand, V secondOperand) { this.type = type; this.firstOperand = firstOperand; this.secondOperand = secondOperand; } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; EditOperation editOperation = (EditOperation) o; if (type != editOperation.type) return false; if (!firstOperand.equals(editOperation.firstOperand)) return false; return secondOperand != null ? secondOperand.equals(editOperation.secondOperand) : editOperation.secondOperand == null; } @Override public int hashCode() { int result = type.hashCode(); result = 31 * result + firstOperand.hashCode(); result = 31 * result + (secondOperand != null ? secondOperand.hashCode() : 0); return result; } @Override public String toString() { if (type.equals(OperationType.INSERT) || type.equals(OperationType.REMOVE)) { return type + " " + firstOperand; } return type + " " + firstOperand + " -> " + secondOperand; } } /** * Type of an edit operation. */ public enum OperationType { /** * Indicates that an edit operation is inserting a vertex into a tree. */ INSERT, /** * Indicates that an edit operation is removing a vertex into a tree. */ REMOVE, /** * Indicates that an edit operation is changing a vertex in one tree to a vertex in another * three. */ CHANGE } /** * Auxiliary class which is used in {@code treeDistance()} function to store intermediate edit * operations during dynamic programming computation. */ private class CacheEntry { /** * Outer index of the previous entry which is part of the computed optimal solution. */ int cachePreviousPosI; /** * Inner index of the previous entry which is part of the computed optimal solution. */ int cachePreviousPosJ; /** * Edit operation stored in this entry. Is this field is $null$ this indicates that * operations from $editOperationLists[treeDistanceI][treeDistanceJ]$. */ EditOperation editOperation; /** * Outer index of an entry in $editOperationLists$ which should be taken in case * {@code editOperation} is $null$. */ int treeDistanceI; /** * Inner index of an entry in $editOperationLists$ which should be taken in case * {@code editOperation} is $null$. */ int treeDistanceJ; /** * Constructs an instance of entry for the given {@code cachePreviousPosI} * {@code cachePreviousPosJ} and {@code editOperation}. * * @param cachePreviousPosI outer index of the previous cache entry * @param cachePreviousPosJ inner index of the previous cache entry * @param editOperation edit operation of this entry */ public CacheEntry( int cachePreviousPosI, int cachePreviousPosJ, EditOperation editOperation) { this.cachePreviousPosI = cachePreviousPosI; this.cachePreviousPosJ = cachePreviousPosJ; this.editOperation = editOperation; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy