All downloads are free. The search and download functionality uses the official Maven repository.

eu.interedition.collatex.suffixarray.Skew Maven / Gradle / Ivy

Go to download

A Java library for collating textual sources, for example, to produce an apparatus.

There is a newer version: 1.7.1
Show newest version
package eu.interedition.collatex.suffixarray;

import java.util.Arrays;

/**
 * 

* Straightforward reimplementation of the recursive algorithm given in: * J. Kärkkäinen and P. Sanders. Simple linear work suffix array construction. * In Proc. 13th International Conference on Automata, Languages and Programming, * Springer, 2003 * *

* This implementation is basically a translation of the C++ version given by Juha * Kärkkäinen and Peter Sanders. *

* The implementation of this algorithm makes some assumptions about the input. See * {@link #buildSuffixArray(int[], int, int)} for details. * * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) */ public final class Skew implements ISuffixArrayBuilder { /** * Lexicographic order for pairs. */ private final static boolean leq(int a1, int a2, int b1, int b2) { return (a1 < b1 || (a1 == b1 && a2 <= b2)); } /** * Lexicographic order for triples. */ private final static boolean leq(int a1, int a2, int a3, int b1, int b2, int b3) { return (a1 < b1 || (a1 == b1 && leq(a2, a3, b2, b3))); } /** * Stably sort indexes from src[0..n-1] to dst[0..n-1] with values in 0..K from v. A * constant offset of vi is added to indexes from src. */ private final static void radixPass(int[] src, int[] dst, int[] v, int vi, final int n, final int K, int start, int[] cnt) { // check counter array's size. assert cnt.length >= K + 1; Arrays.fill(cnt, 0, K + 1, 0); // count occurrences for (int i = 0; i < n; i++) cnt[v[start + vi + src[i]]]++; // exclusive prefix sums for (int i = 0, sum = 0; i <= K; i++) { final int t = cnt[i]; cnt[i] = sum; sum += t; } // sort for (int i = 0; i < n; i++) dst[cnt[v[start + vi + src[i]]]++] = src[i]; } /** * Find the suffix array SA of s[0..n-1] in {1..K}^n. require s[n] = s[n+1] = s[n+2] = * 0, n >= 2. 
*/ static final int[] suffixArray(int[] s, int[] SA, int n, final int K, int start, int[] cnt) { final int n0 = (n + 2) / 3, n1 = (n + 1) / 3, n2 = n / 3, n02 = n0 + n2; final int[] s12 = new int[n02 + 3]; s12[n02] = s12[n02 + 1] = s12[n02 + 2] = 0; final int[] SA12 = new int[n02 + 3]; SA12[n02] = SA12[n02 + 1] = SA12[n02 + 2] = 0; final int[] s0 = new int[n0]; final int[] SA0 = new int[n0]; /* * generate positions of mod 1 and mod 2 suffixes the "+(n0-n1)" adds a dummy mod * 1 suffix if n%3 == 1 */ for (int i = 0, j = 0; i < n + (n0 - n1); i++) if ((i % 3) != 0) s12[j++] = i; // lsb radix sort the mod 1 and mod 2 triples cnt = ensureSize(cnt, K + 1); radixPass(s12, SA12, s, +2, n02, K, start, cnt); radixPass(SA12, s12, s, +1, n02, K, start, cnt); radixPass(s12, SA12, s, +0, n02, K, start, cnt); // find lexicographic names of triples int name = 0, c0 = -1, c1 = -1, c2 = -1; for (int i = 0; i < n02; i++) { if (s[start + SA12[i]] != c0 || s[start + SA12[i] + 1] != c1 || s[start + SA12[i] + 2] != c2) { name++; c0 = s[start + SA12[i]]; c1 = s[start + SA12[i] + 1]; c2 = s[start + SA12[i] + 2]; } if ((SA12[i] % 3) == 1) { // left half s12[SA12[i] / 3] = name; } else { // right half s12[SA12[i] / 3 + n0] = name; } } // recurse if names are not yet unique if (name < n02) { cnt = suffixArray(s12, SA12, n02, name, start, cnt); // store unique names in s12 using the suffix array for (int i = 0; i < n02; i++) s12[SA12[i]] = i + 1; } else { // generate the suffix array of s12 directly for (int i = 0; i < n02; i++) SA12[s12[i] - 1] = i; } // stably sort the mod 0 suffixes from SA12 by their first character for (int i = 0, j = 0; i < n02; i++) if (SA12[i] < n0) s0[j++] = 3 * SA12[i]; radixPass(s0, SA0, s, 0, n0, K, start, cnt); // merge sorted SA0 suffixes and sorted SA12 suffixes for (int p = 0, t = n0 - n1, k = 0; k < n; k++) { // pos of current offset 12 suffix final int i = (SA12[t] < n0 ? 
SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2); // pos of current offset 0 suffix final int j = SA0[p]; if (SA12[t] < n0 ? leq(s[start + i], s12[SA12[t] + n0], s[start + j], s12[j / 3]) : leq(s[start + i], s[start + i + 1], s12[SA12[t] - n0 + 1], s[start + j], s[start + j + 1], s12[j / 3 + n0])) { // suffix from SA12 is smaller SA[k] = i; t++; if (t == n02) { // done --- only SA0 suffixes left for (k++; p < n0; p++, k++) SA[k] = SA0[p]; } } else { SA[k] = j; p++; if (p == n0) { // done --- only SA12 suffixes left for (k++; t < n02; t++, k++) { SA[k] = (SA12[t] < n0 ? SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2); } } } } return cnt; } /** * Ensure array is large enough or reallocate (no copying). */ private static final int[] ensureSize(int[] tab, int length) { if (tab.length < length) { tab = null; tab = new int[length]; } return tab; } /** * {@inheritDoc} *

* Additional constraints enforced by Karkkainen-Sanders algorithm: *

    *
  • non-negative (>0) symbols in the input (because of radix sort)
  • *
  • input.length >= start + length + 3 (to simplify * border cases)
  • *
  • length >= 2
  • *
*

* If the input contains zero or negative values, or has no extra trailing cells, * adapters can be used in the following way: *

*

     * return new {@link DensePositiveDecorator}(
     *      new {@link ExtraTrailingCellsDecorator}(
     *          new {@link Skew}(), 3));
     * 
* * @see ExtraTrailingCellsDecorator * @see DensePositiveDecorator */ @Override public int[] buildSuffixArray(int[] input, int start, int length) { Tools.assertAlways(input != null, "input must not be null"); Tools.assertAlways(length >= 2, "input length must be >= 2"); Tools.assertAlways(input.length >= start + length + 3, "no extra space after input end"); assert Tools.allPositive(input, start, length); final int alphabetSize = Tools.max(input, start, length); final int[] SA = new int[length + 3]; // Preserve the tail of the input (destroyed when constructing the array). final int[] tail = new int[3]; System.arraycopy(input, start + length, tail, 0, 3); Arrays.fill(input, start + length, start + length + 3, 0); suffixArray(input, SA, length, alphabetSize, start, new int[alphabetSize + 2]); // Reconstruct the input's tail. System.arraycopy(tail, 0, input, start + length, 3); return SA; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy