All downloads are free. Search and download functionality uses the official Maven repository.

eu.interedition.collatex.suffixarray.Traversals Maven / Gradle / Ivy

Go to download

A Java library for collating textual sources, for example, to produce an apparatus.

There is a newer version: 1.7.1
Show newest version
package eu.interedition.collatex.suffixarray;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;

/**
 * Suffix array traversal routines (emulating corresponding suffix tree traversals).
 *
 * @author Michał Nowak (Carrot Search)
 * @author Dawid Weiss (Carrot Search)
 */
public final class Traversals {
    /**
     * Visitor interface for post-order traversal methods in {@link Traversals}.
     */
    public interface IPostOrderVisitor {
        /**
         * Visits a node in the (virtual) suffix tree, labeled with <code>length</code>
         * objects starting at <code>start</code> in the input sequence.
         *
         * @param start  The node label's starting offset in the input sequence.
         * @param length The node label's length (number of symbols).
         * @param leaf   <code>true</code> if this node is a leaf.
         */
        void visitNode(int start, int length, boolean leaf);
    }

    /**
     * Visitor interface for post-order traversal methods that compute an aggregated value
     * during the traversal.
     *
     * @param <E> Type of the value aggregated over the nodes of the tree.
     */
    public interface IPostOrderComputingVisitor<E> {
        /**
         * Aggregate two values into the result. The aggregation function should be
         * symmetric, that is: <code>value1 + value2 = value2 + value1</code>.
         *
         * @param value1 The first value to combine.
         * @param value2 The second value to combine.
         * @return Returns the combination of both arguments.
         */
        E aggregate(E value1, E value2);

        /**
         * Compute the initial value for a leaf node.
         *
         * @param saIndex     Index of the leaf node in the suffix array.
         * @param symbolIndex The node label's starting offset in the input sequence.
         * @param length      The node label's length (number of symbols).
         * @return Returns the initial function value for the leaf node.
         */
        E leafValue(int saIndex, int symbolIndex, int length);

        /**
         * Visits a node in the (virtual) suffix tree, labeled with <code>length</code>
         * objects starting at <code>start</code> in the input sequence.
         *
         * @param start  The node label's starting offset in the input sequence.
         * @param length The node label's length (number of symbols).
         * @param leaf   <code>true</code> if this node is a leaf.
         * @param value  Aggregated value for all sub-nodes of the given node.
         */
        void visitNode(int start, int length, boolean leaf, E value);
    }

    /**
     * <p>
     * Post-order traversal of all branching nodes in a suffix tree (emulated using a
     * suffix array and the LCP array). Post-order traversal is also called <i>bottom-up
     * traversal</i>, that is child nodes are reported before parent nodes (and the root is
     * the last node to process).
     * <p>
     * The algorithm implemented here is from <i>Efficient Substring Traversal with Suffix
     * Arrays</i> by Toru Kasai, Hiroki Arimura and Setsuo Arikawa, Dept of Informatics,
     * Kyushu University, Japan.
     *
     * @param sequenceLength Input sequence length for the suffix array and LCP array.
     * @param sa             Suffix array.
     * @param lcp            Corresponding LCP array for a given suffix array.
     * @param visitor        Callback visitor.
     */
    public static void postorder(final int sequenceLength, int[] sa, int[] lcp,
                                 IPostOrderVisitor visitor) {
        assert sequenceLength <= sa.length && sequenceLength <= lcp.length
            : "Input sequence length larger than suffix array or the LCP.";

        // The stack holds (index, height) pairs; the height is always pushed last, so
        // peek() yields the height of the topmost node.
        final Deque<Integer> stack = new ArrayDeque<Integer>();

        // Push the stack bottom marker (sentinel).
        stack.push(-1);
        stack.push(-1);

        // Process every leaf. The extra iteration (i == sequenceLength) uses height -1,
        // which pops everything above the sentinel.
        int topHeight;
        for (int i = 0; i <= sequenceLength; i++) {
            final int h = (sequenceLength == i ? -1 : lcp[i]);

            while (true) {
                topHeight = stack.peek();
                if (topHeight <= h) {
                    break;
                }
                stack.pop();

                // Visit the node and remove it from the end of the stack.
                final int topIndex = stack.pop();
                // Leaves are stored as -(saIndex + 1), internal nodes as a non-negative index.
                final boolean leaf = (topIndex < 0);
                visitor.visitNode(sa[leaf ? -(topIndex + 1) : topIndex], topHeight, leaf);
            }

            if (topHeight < h) {
                stack.push(i);
                stack.push(h);
            }

            if (i < sequenceLength) {
                // Mark leaf nodes in the stack.
                stack.push(-(i + 1));
                stack.push(sequenceLength - sa[i]);
            }
        }
    }

    /**
     * <p>
     * Post-order traversal of all branching nodes in a suffix tree (emulated using a
     * suffix array and the LCP array). Post-order traversal is also called <i>bottom-up
     * traversal</i>, that is child nodes are reported before parent nodes (and the root is
     * the last node to process).
     * <p>
     * The algorithm implemented here is from <i>Efficient Substring Traversal with Suffix
     * Arrays</i> by Toru Kasai, Hiroki Arimura and Setsuo Arikawa, Dept of Informatics,
     * Kyushu University, Japan.
     *
     * @param <E>            Type of the value aggregated during the traversal.
     * @param sequenceLength Input sequence length for the suffix array and LCP array.
     * @param sa             Suffix array.
     * @param lcp            Corresponding LCP array for a given suffix array.
     * @param epsilon        "Zero" value (epsilon) for computations.
     * @param visitor        Callback visitor computing aggregate values when traversing the
     *                       tree.
     */
    public static <E> void postorder(final int sequenceLength, int[] sa, int[] lcp,
                                     E epsilon, IPostOrderComputingVisitor<E> visitor) {
        assert sequenceLength <= sa.length && sequenceLength <= lcp.length
            : "Input sequence length larger than suffix array or the LCP.";

        // The stack holds (index, height) pairs; the height is always pushed last, so
        // peek() yields the height of the topmost node.
        final Deque<Integer> stack = new ArrayDeque<Integer>();
        // One aggregated value per (index, height) pair on the stack. Kept in an ArrayList
        // rather than an ArrayDeque because ArrayDeque rejects null elements, and
        // visitor-supplied values are not guaranteed to be non-null.
        final ArrayList<E> values = new ArrayList<E>();

        // Push the stack bottom marker (sentinel).
        stack.push(-1);
        stack.push(-1);
        values.add(epsilon);

        // Process every leaf. The extra iteration (i == sequenceLength) uses height -1,
        // which pops everything above the sentinel.
        int topHeight;
        for (int i = 0; i <= sequenceLength; i++) {
            final int h = (sequenceLength == i ? -1 : lcp[i]);
            // Value accumulated over the nodes popped during this iteration.
            E ci = epsilon;

            while (true) {
                topHeight = stack.peek();
                if (topHeight <= h) {
                    break;
                }
                stack.pop();

                // Visit the node and remove it from the end of the stack.
                final E topValue = values.remove(values.size() - 1);
                final int topIndex = stack.pop();
                // Leaves are stored as -(saIndex + 1), internal nodes as a non-negative index.
                final boolean leaf = (topIndex < 0);

                ci = visitor.aggregate(topValue, ci);
                visitor.visitNode(sa[leaf ? -(topIndex + 1) : topIndex], topHeight, leaf, ci);
            }

            if (topHeight < h) {
                // A new internal node starts here; it inherits the value popped so far.
                stack.push(i);
                stack.push(h);
                values.add(ci);
            } else {
                // Same height as the node on top of the stack: fold the value into it.
                assert topHeight == h;
                final int index = values.size() - 1;
                values.set(index, visitor.aggregate(ci, values.get(index)));
            }

            if (i < sequenceLength) {
                // Mark leaf nodes in the stack.
                stack.push(-(i + 1));
                stack.push(sequenceLength - sa[i]);
                values.add(visitor.leafValue(i, sa[i], sequenceLength - sa[i]));
            }
        }
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy