eu.interedition.collatex.suffixarray.BPR Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of collatex-core Show documentation
A Java library for collating textual sources, for example, to produce an apparatus.
There is a newer version: 1.7.1
package eu.interedition.collatex.suffixarray;

import java.util.Arrays;

/**
 * 
 * A straightforward reimplementation of the bucket pointer refinement algorithm given in:
 * 
 * Klaus-Bernd Schürmann, Suffix Arrays in Theory and Practice, Faculty of Technology of
 * Bielefeld University, Germany, 2007
 * 
 * 

 * This implementation is basically a translation of the C version given by Klaus-Bernd
 * Schürmann: http://bibiserv.techfak.uni-bielefeld.de/download/tools/bpr.html
 * 

 * The implementation of this algorithm makes some assumptions about the input. See
 * {@link #buildSuffixArray(int[], int, int)} for details.
 *
 * @author Michał Nowak (Carrot Search)
 * @author Dawid Weiss (Carrot Search)
 */
public class BPR implements ISuffixArrayBuilder {
    private final static class Alphabet {
        int size;
        int[] charArray;
        int[] alphaMapping;
        int[] charFreq;

        Alphabet(int[] thisString, int stringLength) {
            int tmpChar;
            size = 0;
            alphaMapping = new int[KBS_MAX_ALPHABET_SIZE];
            charFreq = new int[KBS_MAX_ALPHABET_SIZE];
            for (int i = 0; i < stringLength; i++) {
                tmpChar = thisString[i];
                Tools.assertAlways(tmpChar >= 0, "Input must be positive");
                if (charFreq[tmpChar] == 0) {
                    size++;
                }
                charFreq[tmpChar]++;
            }

            charArray = new int[size + 1];
            charArray[size] = 0;
            int k = 0;
            for (int i = 0; i < KBS_MAX_ALPHABET_SIZE; i++) {
                alphaMapping[i] = -1;
                if (charFreq[i] > 0) {
                    charArray[k] = i;
                    alphaMapping[i] = k;
                    k++;
                }
            }
            Tools.assertAlways(k == size, "k != size");
        }
    }

    public final static int KBS_MAX_ALPHABET_SIZE = 256;
    public final static int KBS_INSSORT_THRES_LEN = 15;
    public final static int KBS_STRING_EXTENSION_SIZE = 32;
    public final static int INSSORT_LIMIT = 15;

    /**
     * If true, {@link #buildSuffixArray(int[], int, int)} uses a copy of the
     * input so it is left intact.
     */
    private final boolean preserveInput;

    private int[] seq;
    private int length;
    private Alphabet alphabet;
    private int[] suffixArray;
    private int[] sufPtrMap;

    private int start;

    public BPR() {
        this(true);
    }

    public BPR(boolean preserveInput) {
        this.preserveInput = preserveInput;
    }

    /**
     * {@inheritDoc}
     * 

     * Additional constraints enforced by BPR algorithm:
     * 

     * input array must contain at least {@link #KBS_STRING_EXTENSION_SIZE} extra
     * cells
     * non-negative (≥0) symbols in the input
     * symbols limited by {@link #KBS_MAX_ALPHABET_SIZE} (<
     * KBS_MAX_ALPHABET_SIZE)
     * length ≥ 2
     * 
     * 
     */
    @Override
    public int[] buildSuffixArray(int[] input, int start, int length) {
        Tools.assertAlways(input != null, "input must not be null");
        Tools.assertAlways(input.length >= start + length + KBS_STRING_EXTENSION_SIZE,
            "input is too short");
        Tools.assertAlways(length >= 2, "input length must be >= 2");
        this.start = start;

        if (preserveInput) {

            seq = new int[length + KBS_STRING_EXTENSION_SIZE];
            this.start = 0;
            System.arraycopy(input, start, seq, 0, length);
        } else {
            seq = input;
        }

        this.alphabet = new Alphabet(seq, length);
        this.length = length;
        int alphaSize = alphabet.size;

        int q;
        if (alphaSize <= 9) {
            q = 7;
        } else if (9 < alphaSize && alphaSize <= 13) {
            q = 6;
        } else if (13 < alphaSize && alphaSize <= 21) {
            q = 5;
        } else if (21 < alphaSize && alphaSize <= 46) {
            q = 4;
        } else {
            q = 3;
        }

        kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(q);
        return suffixArray;
    }

    /**
     *
     */
    private void kbs_buildDstepUsePrePlusCopyFreqOrder_SuffixArray(int q) {
        int[] buckets = determine_Buckets_Sarray_Sptrmap(q);

        /* Sorting of all buckets */
        int mappedCharPtr = 0;
        int alphabetSize = alphabet.size;
        int bucketsInLevel3Bucket = kbs_power_Ulong(alphabetSize, q - 3);
        int bucketsInLevel2Bucket = bucketsInLevel3Bucket * alphabetSize;
        int bucketsInLevel1Bucket = bucketsInLevel2Bucket * alphabetSize;
        int[] alphaOrder = getCharWeightedOrder_Alphabet(buckets, bucketsInLevel2Bucket);
        int[] isNotSortedLevel1Char = new int[alphabetSize];
        Arrays.fill(isNotSortedLevel1Char, 1);

        /* Sort all level-1 buckets */
        int[] leftPtrList = new int[alphabetSize];
        int[] rightPtrList = new int[alphabetSize];
        int[] leftPtrList2 = new int[alphabetSize * alphabetSize];
        int[] rightPtrList2 = new int[alphabetSize * alphabetSize];
        int i;
        int j;
        int c1 = 0;
        for (i = 0; i < alphabetSize; i++) {
            c1 = alphaOrder[i];
            /* sort buckets cd to cz */
            for (j = i + 1; j < alphabetSize; j++) {
                int c2 = alphaOrder[j];
                int l;
                for (l = i; l < alphabetSize; l++) {
                    int c3 = alphaOrder[l];
                    int tmpUlong = c1 * bucketsInLevel1Bucket + c2
                        * bucketsInLevel2Bucket + c3 * bucketsInLevel3Bucket;
                    int k;
                    for (k = tmpUlong; k < tmpUlong + bucketsInLevel3Bucket; k++) {
                        int leftPtr = buckets[k];
                        int rightPtr = buckets[k + 1] - 1;
                        if (rightPtr - leftPtr > 0) {
                            if (rightPtr - leftPtr < INSSORT_LIMIT) insSortUpdateRecurse_SaBucket(
                                leftPtr, rightPtr, q, q);
                            else partitionUpdateRecurse_SaBucket(leftPtr, rightPtr, q, q);
                        }
                    }
                }
            }

            /* copy left buckets of cx */
            for (j = i; j < alphabetSize; j++) {
                int cp1 = alphaOrder[j];
                leftPtrList[cp1] = buckets[cp1 * bucketsInLevel1Bucket + c1
                    * bucketsInLevel2Bucket];
                int k;
                for (k = i + 1; k < alphabetSize; k++) {
                    int cp2 = alphaOrder[k];
                    leftPtrList2[cp2 * alphabetSize + cp1] = buckets[cp2
                        * bucketsInLevel1Bucket + cp1 * bucketsInLevel2Bucket + c1
                        * bucketsInLevel3Bucket];
                }
            }
            if (c1 == 0) {
                int cp1 = seq[start + mappedCharPtr + length - 1];
                int cp2 = seq[start + mappedCharPtr + length - 2];
                if (isNotSortedLevel1Char[cp1] != 0) {
                    leftPtrList[cp1]++;
                    leftPtrList2[cp1 * alphabetSize]++;
                    if (isNotSortedLevel1Char[cp2] != 0 && cp2 != c1) {
                        suffixArray[leftPtrList2[cp2 * alphabetSize + cp1]] = length - 2;
                        sufPtrMap[length - 2] = leftPtrList2[cp2 * alphabetSize + cp1];
                        leftPtrList2[cp2 * alphabetSize + cp1]++;
                    }
                }
            }

            int leftPtr = buckets[c1 * bucketsInLevel1Bucket];
            while (leftPtr < leftPtrList[c1]) {
                int cp1;
                int tmpUlong = suffixArray[leftPtr];
                if (tmpUlong != 0
                    && isNotSortedLevel1Char[cp1 = seq[start + mappedCharPtr + tmpUlong
                    - 1]] != 0) {
                    if (isNotSortedLevel1Char[seq[start + mappedCharPtr + tmpUlong + 1]] != 0) {
                        int tmpUlongPtr = leftPtrList[cp1];
                        sufPtrMap[tmpUlong - 1] = tmpUlongPtr;
                        suffixArray[tmpUlongPtr] = tmpUlong - 1;
                    }
                    leftPtrList[cp1]++;
                    int cp2;
                    if (tmpUlong > 1
                        && isNotSortedLevel1Char[cp2 = seq[start + mappedCharPtr
                        + tmpUlong - 2]] != 0 && cp2 != c1) {
                        int tmpUlongPtr = leftPtrList2[cp2 * alphabetSize + cp1]++;
                        sufPtrMap[tmpUlong - 2] = tmpUlongPtr;
                        suffixArray[tmpUlongPtr] = tmpUlong - 2;
                    }
                }
                leftPtr++;
            }

            /* copy right buckets of cx */
            for (j = i; j < alphabetSize; j++) {
                int cp1 = alphaOrder[j];
                int k;
                rightPtrList[cp1] = buckets[cp1 * bucketsInLevel1Bucket + (c1 + 1)
                    * bucketsInLevel2Bucket];
                for (k = i + 1; k < alphabetSize; k++) {
                    int cp2 = alphaOrder[k];
                    rightPtrList2[cp2 * alphabetSize + cp1] = buckets[cp2
                        * bucketsInLevel1Bucket + cp1 * bucketsInLevel2Bucket + (c1 + 1)
                        * bucketsInLevel3Bucket];
                }
            }
            int rightPtr = buckets[(c1 + 1) * bucketsInLevel1Bucket];
            while (leftPtr < rightPtr) {
                int cp1;
                rightPtr--;
                int tmpUlong = suffixArray[rightPtr];
                if (tmpUlong != 0
                    && isNotSortedLevel1Char[cp1 = seq[start + mappedCharPtr + tmpUlong
                    - 1]] != 0) {
                    rightPtrList[cp1]--;
                    if (isNotSortedLevel1Char[seq[start + mappedCharPtr + tmpUlong + 1]] != 0) {
                        int tmpUlongPtr = rightPtrList[cp1];
                        sufPtrMap[tmpUlong - 1] = tmpUlongPtr;
                        suffixArray[tmpUlongPtr] = tmpUlong - 1;
                    }
                    int cp2;
                    if (tmpUlong > 1
                        && isNotSortedLevel1Char[cp2 = seq[start + mappedCharPtr
                        + tmpUlong - 2]] != 0 && cp2 != c1) {
                        int tmpUlongPtr = --rightPtrList2[cp2 * alphabetSize + cp1];
                        sufPtrMap[tmpUlong - 2] = tmpUlongPtr;
                        suffixArray[tmpUlongPtr] = tmpUlong - 2;
                    }
                }
            }
            isNotSortedLevel1Char[c1] = 0;
        }

    }

    /**
     * Stably sorts a bucket at a refinement level regarding sort keys that are bucket
     * pointers in sufPtrMap with offset.
     *
     * @param leftPtr  points to the leftmost suffix of the current bucket.
     * @param rightPtr points to the rightmost suffix of the current bucket.
     * @param offset   is the length of the common prefix of the suffixes (a multiple of q).
     * @param q        is the initial prefix length used for the bucket sort. It also determines
     *                 the increase of offset.
     */
    private void insSortUpdateRecurse_SaBucket(int leftPtr, int rightPtr, int offset,
                                               int q) {
        int rightTmpPtr = leftPtr + 1;
        while (rightTmpPtr <= rightPtr) {
            int tempValue = suffixArray[rightTmpPtr];
            int tempHashValue = sufPtrMap[suffixArray[rightTmpPtr] + offset];
            int leftTmpPtr = rightTmpPtr;
            while (leftTmpPtr > leftPtr
                && sufPtrMap[suffixArray[leftTmpPtr - 1] + offset] > tempHashValue) {
                suffixArray[leftTmpPtr] = suffixArray[leftTmpPtr - 1];
                leftTmpPtr--;
            }
            suffixArray[leftTmpPtr] = tempValue;
            rightTmpPtr++;
        }
        updatePtrAndRefineBuckets_SaBucket(leftPtr, rightPtr, offset, q);
    }

    /**
     * The function determines the subbuckets after refining this bucket and recursively
     * calls the refinement function for the subbuckets.
     *
     * @param leftPtr  points to the leftmost suffix of the current bucket.
     * @param rightPtr points to the rightmost suffix of the current bucket.
     * @param offset   is the length of the common prefix of the suffixes (a multiple of q).
     * @param q        is the initial prefix length used for the bucket sort. It also determines
     *                 the increase of offset.
     */
    private void updatePtrAndRefineBuckets_SaBucket(int leftPtr, int rightPtr,
                                                    int offset, int q) {
        /*
         * for all buckets with resp. pointer > rightPtr determine buckets via setting
         * sufPtrMap
         */
        int leftIntervalPtr = rightPtr;
        int rightIntervalPtr = rightPtr;
        int tmpPtr;
        while (leftPtr <= leftIntervalPtr
            && rightPtr < (tmpPtr = sufPtrMap[suffixArray[leftIntervalPtr] + offset])) {
            do {
                sufPtrMap[suffixArray[leftIntervalPtr]] = rightIntervalPtr;
                leftIntervalPtr--;
            }
            while (leftPtr <= leftIntervalPtr
                && sufPtrMap[suffixArray[leftIntervalPtr] + offset] == tmpPtr);
            rightIntervalPtr = leftIntervalPtr;
        }

        /*
         * since the sufPtrMap for the suffixes between leftPtr and rightPtr might change
         * in previous 2 steps
         */

        /*
         * determine the bucket concerning suffixptr+offset between leftPtr and rightPtr
         * separately
         */
        rightIntervalPtr = leftIntervalPtr;
        while (leftPtr <= leftIntervalPtr
            && leftPtr <= sufPtrMap[suffixArray[leftIntervalPtr] + offset]
            && sufPtrMap[suffixArray[leftIntervalPtr] + offset] <= rightPtr) {
            sufPtrMap[suffixArray[leftIntervalPtr]] = rightIntervalPtr;
            leftIntervalPtr--;
        }

        /*
         * for all buckets with resp. pointer+offset < leftPtr determine buckets via
         * setting sufPtrMap
         */
        /*
         * start with rightIntervalPtr which indicates leftend-1 of bucket with resp.
         * pointer+offset between
         */
        /* leftPtr and rightPtr */
        int middleRightPtr = rightIntervalPtr;
        int middleLeftPtr = leftIntervalPtr;
        rightIntervalPtr = leftIntervalPtr;
        while (leftPtr <= leftIntervalPtr) {
            int tmpPtr2 = sufPtrMap[suffixArray[leftIntervalPtr] + offset];
            do {
                sufPtrMap[suffixArray[leftIntervalPtr]] = rightIntervalPtr;
                leftIntervalPtr--;
            }
            while (leftPtr <= leftIntervalPtr
                && sufPtrMap[suffixArray[leftIntervalPtr] + offset] == tmpPtr2);
            rightIntervalPtr = leftIntervalPtr;
        }

        int newOffset = offset + q;
        if (sufPtrMap[suffixArray[leftPtr]] == rightPtr) {
            newOffset = computeDiffDepthBucket_SaBucket(leftPtr, rightPtr, newOffset, q);
        }
        int leftTmpPtr = leftPtr;
        while (leftTmpPtr < middleLeftPtr) {
            int rightTmpPtr = sufPtrMap[suffixArray[leftTmpPtr]];
            int tmpLong = rightTmpPtr - leftTmpPtr;
            if (tmpLong > 0) {
                if (tmpLong == 1) {
                    computeBucketSize2_SaBucket(leftTmpPtr, rightTmpPtr, newOffset, q);
                    leftTmpPtr = rightTmpPtr + 1;
                    continue;
                }
                if (tmpLong == 2) {
                    computeBucketSize3_SaBucket(leftTmpPtr, rightTmpPtr, newOffset, q);
                    leftTmpPtr = rightTmpPtr + 1;
                    continue;
                }
                insSortUpdateRecurse_SaBucket(leftTmpPtr, rightTmpPtr, newOffset, q);
            }
            leftTmpPtr = rightTmpPtr + 1;
        }
        /* for buckets refering to this bucket, the offset can be doubled */
        if (middleRightPtr > middleLeftPtr + 1) {
            if (middleRightPtr - middleLeftPtr == 2) {
                computeBucketSize2_SaBucket(middleLeftPtr + 1, middleRightPtr, Math.max(
                    2 * offset, newOffset), q);
            } else {
                if (middleRightPtr - middleLeftPtr == 3) {
                    computeBucketSize3_SaBucket(middleLeftPtr + 1, middleRightPtr, Math
                        .max(2 * offset, newOffset), q);
                } else {
                    insSortUpdateRecurse_SaBucket(middleLeftPtr + 1, middleRightPtr, Math
                        .max(2 * offset, newOffset), q);
                }
            }
        }
        leftTmpPtr = middleRightPtr + 1;
        while (leftTmpPtr < rightPtr) {
            int rightTmpPtr = sufPtrMap[suffixArray[leftTmpPtr]];
            int tmpLong = rightTmpPtr - leftTmpPtr;
            if (tmpLong > 0) {
                if (tmpLong == 1) {
                    computeBucketSize2_SaBucket(leftTmpPtr, rightTmpPtr, newOffset, q);
                    leftTmpPtr = rightTmpPtr + 1;
                    continue;
                }
                if (tmpLong == 2) {
                    computeBucketSize3_SaBucket(leftTmpPtr, rightTmpPtr, newOffset, q);
                    leftTmpPtr = rightTmpPtr + 1;
                    continue;
                }
                insSortUpdateRecurse_SaBucket(leftTmpPtr, rightTmpPtr, newOffset, q);
            }
            leftTmpPtr = rightTmpPtr + 1;
        }

    }

    /**
     * Completely sorts buckets of size 3.
     *
     * @param leftPtr  points to the leftmost suffix of the current bucket.
     * @param rightPtr points to the rightmost suffix of the current bucket.
     * @param q        is the initial prefix length used for the bucket sort. It also determines
     *                 the increase of offset.
     * @param offset   is the length of the common prefix of the suffixes rounded down to a
     *                 multiple of q.
     */
    private void computeBucketSize3_SaBucket(int leftPtr, int rightPtr, int offset, int q) {
        int newOffset = offset;
        while (sufPtrMap[suffixArray[leftPtr] + newOffset] == sufPtrMap[suffixArray[leftPtr + 1]
            + newOffset]
            && sufPtrMap[suffixArray[leftPtr + 1] + newOffset] == sufPtrMap[suffixArray[rightPtr]
            + newOffset]) {
            newOffset += q;
        }
        if (sufPtrMap[suffixArray[leftPtr] + newOffset] > sufPtrMap[suffixArray[rightPtr]
            + newOffset]) {
            int swapTmp = suffixArray[leftPtr];
            suffixArray[leftPtr] = suffixArray[rightPtr];
            suffixArray[rightPtr] = swapTmp;
        }
        if (sufPtrMap[suffixArray[leftPtr] + newOffset] > sufPtrMap[suffixArray[leftPtr + 1]
            + newOffset]) {
            int swapTmp = suffixArray[leftPtr];
            suffixArray[leftPtr] = suffixArray[leftPtr + 1];
            suffixArray[leftPtr + 1] = swapTmp;
        }
        if (sufPtrMap[suffixArray[leftPtr + 1] + newOffset] > sufPtrMap[suffixArray[rightPtr]
            + newOffset]) {
            int swapTmp = suffixArray[rightPtr];
            suffixArray[rightPtr] = suffixArray[leftPtr + 1];
            suffixArray[leftPtr + 1] = swapTmp;
        }
        if (sufPtrMap[suffixArray[leftPtr] + newOffset] == sufPtrMap[suffixArray[leftPtr + 1]
            + newOffset]) {
            int suffix1 = suffixArray[leftPtr] + newOffset + q;
            int suffix2 = suffixArray[leftPtr + 1] + newOffset + q;
            while (sufPtrMap[suffix1] == sufPtrMap[suffix2]) {
                suffix1 += q;
                suffix2 += q;
            }
            if (sufPtrMap[suffix1] > sufPtrMap[suffix2]) {
                int tmpSwap = suffixArray[leftPtr];
                suffixArray[leftPtr] = suffixArray[leftPtr + 1];
                suffixArray[leftPtr + 1] = tmpSwap;
            }
            sufPtrMap[suffixArray[leftPtr]] = leftPtr;
            sufPtrMap[suffixArray[leftPtr + 1]] = leftPtr + 1;
            sufPtrMap[suffixArray[rightPtr]] = rightPtr;
            return;
        }
        if (sufPtrMap[suffixArray[leftPtr + 1] + newOffset] == sufPtrMap[suffixArray[rightPtr]
            + newOffset]) {
            sufPtrMap[suffixArray[leftPtr]] = leftPtr;
            int suffix1 = suffixArray[leftPtr + 1] + newOffset + q;
            int suffix2 = suffixArray[rightPtr] + newOffset + q;
            while (sufPtrMap[suffix1] == sufPtrMap[suffix2]) {
                suffix1 += q;
                suffix2 += q;
            }
            if (sufPtrMap[suffix1] > sufPtrMap[suffix2]) {
                int tmpSwap = suffixArray[rightPtr];
                suffixArray[rightPtr] = suffixArray[leftPtr + 1];
                suffixArray[leftPtr + 1] = tmpSwap;
            }
            sufPtrMap[suffixArray[leftPtr + 1]] = leftPtr + 1;
            sufPtrMap[suffixArray[rightPtr]] = rightPtr;
            return;
        }
        sufPtrMap[suffixArray[leftPtr]] = leftPtr;
        sufPtrMap[suffixArray[leftPtr + 1]] = (leftPtr + 1);
        sufPtrMap[suffixArray[rightPtr]] = rightPtr;
    }

    /**
     * Completely sorts buckets of size 2.
     *
     * @param leftPtr  points to the leftmost suffix of the current bucket.
     * @param rightPtr points to the rightmost suffix of the current bucket.
     * @param offset   is the length of the common prefix of the suffixes rounded down to a
     *                 multiple of q.
     * @param q        is the initial prefix length used for the bucket sort. It also determines
     *                 the increase of offset.
     */
    private void computeBucketSize2_SaBucket(int leftPtr, int rightPtr, int offset, int q) {
        int suffix1 = suffixArray[leftPtr] + offset;
        int suffix2 = suffixArray[rightPtr] + offset;
        while (sufPtrMap[suffix1] == sufPtrMap[suffix2]) {
            suffix1 += q;
            suffix2 += q;
        }
        if (sufPtrMap[suffix1] > sufPtrMap[suffix2]) {
            int tmpSwap = suffixArray[leftPtr];
            suffixArray[leftPtr] = suffixArray[rightPtr];
            suffixArray[rightPtr] = tmpSwap;
        }
        sufPtrMap[suffixArray[leftPtr]] = leftPtr;
        sufPtrMap[suffixArray[rightPtr]] = rightPtr;

    }

    /**
     * Computes about the LCP of all suffixes in this bucket. It will be the newoffset.
     *
     * @param leftPtr  points to the leftmost suffix of the current bucket.
     * @param rightPtr points to the rightmost suffix of the current bucket.
     * @param offset   is the length of the common prefix of the suffixes rounded down to a
     *                 multiple of q.
     * @param q        is the initial prefix length used for the bucket sort. It also determines
     *                 the increase of offset.
     * @return the LCP of suffixes in this bucket (newoffset).
     */
    private int computeDiffDepthBucket_SaBucket(int leftPtr, int rightPtr, int offset,
                                                int q) {
        int lcp = offset;
        while (true) {
            int runPtr = leftPtr;
            int a = suffixArray[rightPtr];
            int tmpPtr = sufPtrMap[a + lcp];
            while (runPtr < rightPtr) {
                if (sufPtrMap[suffixArray[runPtr] + lcp] != tmpPtr) {
                    return lcp;
                }
                runPtr++;
            }
            lcp += q;
        }
    }

    /**
     * Ternary partitioning of buckets with Lomuto's scheme. Subbuckets of size 2 and 3
     * are directly sorted and partitions smaller than a given threshold are sorted by
     * insertion sort.
     *
     * @param leftPtr  points to the leftmost position of the current bucket.
     * @param rightPtr points to the rightmost position of the current bucket.
     * @param offset   is the length of the common prefix of the suffixes (a multiple of q).
     * @param q        is the initial prefix length used for the bucket sort. It also determines
     *                 the increase of offset.
     */
    private void partitionUpdateRecurse_SaBucket(int leftPtr, int rightPtr, int offset,
                                                 int q) {
        int pivot;
        int tmpSize = rightPtr - leftPtr;
        if (tmpSize < 10000) {
            tmpSize = tmpSize / 4;
            pivot = sufPtrMap[suffixArray[leftPtr + tmpSize] + offset];
            int pivotb = sufPtrMap[suffixArray[leftPtr + 2 * tmpSize] + offset];
            int pivotc = sufPtrMap[suffixArray[rightPtr - tmpSize] + offset];
            int medNumber = medianOfThreeUlong(pivot, pivotb, pivotc);
            int pivotPtr = leftPtr + tmpSize;
            if (medNumber > 0) {
                pivotPtr = (medNumber == 1) ? (leftPtr + 2 * tmpSize)
                    : (rightPtr - tmpSize);
                pivot = (medNumber == 1) ? pivotb : pivotc;
            }
            int swapTmp = suffixArray[pivotPtr];
            suffixArray[pivotPtr] = suffixArray[leftPtr];
            suffixArray[leftPtr] = swapTmp;
        } else {
            int[] keyPtrList = new int[9];
            tmpSize = tmpSize / 10;
            int i;
            for (i = 0; i < 9; i++) {
                keyPtrList[i] = leftPtr + (i + 1) * tmpSize;
            }
            /* insertion sort */
            for (i = 1; i < 9; i++) {
                int tempValue = keyPtrList[i];
                int tempHashValue = sufPtrMap[suffixArray[tempValue] + offset];
                int j = i - 1;
                while (j >= 0
                    && sufPtrMap[suffixArray[keyPtrList[j]] + offset] > tempHashValue) {
                    keyPtrList[j + 1] = keyPtrList[j];
                    j--;
                }
                keyPtrList[j + 1] = tempValue;
            }
            int swapTmp = suffixArray[keyPtrList[4]];
            suffixArray[keyPtrList[4]] = suffixArray[leftPtr];
            suffixArray[leftPtr] = swapTmp;
            pivot = sufPtrMap[suffixArray[leftPtr] + offset];
        }

        int pivotRightPtr = leftPtr + 1;
        while (pivotRightPtr <= rightPtr
            && sufPtrMap[suffixArray[pivotRightPtr] + offset] == pivot) {
            ++pivotRightPtr;
        }
        int smallerPivotPtr = pivotRightPtr;
        while (smallerPivotPtr <= rightPtr
            && sufPtrMap[suffixArray[smallerPivotPtr] + offset] < pivot) {
            smallerPivotPtr++;
        }

        int frontPtr = smallerPivotPtr - 1;
        while (frontPtr++ < rightPtr) {
            int sortkey = sufPtrMap[suffixArray[frontPtr] + offset];
            if (sortkey <= pivot) {
                int swapTmp = suffixArray[frontPtr];
                suffixArray[frontPtr] = suffixArray[smallerPivotPtr];
                suffixArray[smallerPivotPtr] = swapTmp;
                if (sortkey == pivot) {
                    suffixArray[smallerPivotPtr] = suffixArray[pivotRightPtr];
                    suffixArray[pivotRightPtr++] = swapTmp;
                }
                smallerPivotPtr++;
            }
        }
        /* vector swap the pivot elements */
        int numberSmaller = smallerPivotPtr - pivotRightPtr;
        if (numberSmaller > 0) {
            int swapsize = Math.min((pivotRightPtr - leftPtr), numberSmaller);
            int pivotRightTmpPtr = leftPtr + swapsize - 1;
            vectorSwap(leftPtr, pivotRightTmpPtr, smallerPivotPtr - 1);

            /* recursively sort < partition */
            if (numberSmaller == 1) {
                sufPtrMap[suffixArray[leftPtr]] = leftPtr;
            } else {
                if (numberSmaller == 2) {
                    computeBucketSize2_SaBucket(leftPtr, leftPtr + 1, offset, q);
                } else {
                    if (numberSmaller == 3) computeBucketSize3_SaBucket(leftPtr,
                        leftPtr + 2, offset, q);
                    else partitionUpdateRecurse_SaBucket(leftPtr, leftPtr + numberSmaller
                        - 1, offset, q);
                }
            }
        }

        /* update pivots and recursively sort = partition */
        int leftTmpPtr = leftPtr + numberSmaller;
        smallerPivotPtr--;
        if (leftTmpPtr == smallerPivotPtr) {
            sufPtrMap[suffixArray[leftTmpPtr]] = leftTmpPtr;
            if (leftTmpPtr == rightPtr) return;
        } else {
            int newOffset = (pivot == rightPtr) ? (2 * offset) : offset + q;
            if (leftTmpPtr + 1 == smallerPivotPtr) {
                computeBucketSize2_SaBucket(leftTmpPtr, smallerPivotPtr, newOffset, q);
                if (rightPtr == smallerPivotPtr) return;
            } else {
                if (leftTmpPtr + 2 == smallerPivotPtr) {
                    computeBucketSize3_SaBucket(leftTmpPtr, smallerPivotPtr, newOffset, q);
                    if (rightPtr == smallerPivotPtr) return;
                } else {
                    if (rightPtr == smallerPivotPtr) {
                        newOffset = computeDiffDepthBucket_SaBucket(leftPtr
                            + numberSmaller, rightPtr, newOffset, q);
                        partitionUpdateRecurse_SaBucket(leftTmpPtr, rightPtr, newOffset,
                            q);
                        return;
                    }
                    while (leftTmpPtr <= smallerPivotPtr) {
                        sufPtrMap[suffixArray[leftTmpPtr]] = smallerPivotPtr;
                        leftTmpPtr++;
                    }
                    if (smallerPivotPtr < leftPtr + numberSmaller + INSSORT_LIMIT) {
                        insSortUpdateRecurse_SaBucket(leftPtr + numberSmaller,
                            smallerPivotPtr, newOffset, q);
                    } else partitionUpdateRecurse_SaBucket(leftPtr + numberSmaller,
                        smallerPivotPtr, newOffset, q);
                }
            }
        }

        /* recursively sort > partition */
        ++smallerPivotPtr;
        if (smallerPivotPtr == rightPtr) {
            sufPtrMap[suffixArray[rightPtr]] = rightPtr;
            return;
        }
        if (smallerPivotPtr + 1 == rightPtr) {
            computeBucketSize2_SaBucket(smallerPivotPtr, rightPtr, offset, q);
            return;
        }
        if (smallerPivotPtr + 2 == rightPtr) {
            computeBucketSize3_SaBucket(smallerPivotPtr, rightPtr, offset, q);
            return;
        }
        partitionUpdateRecurse_SaBucket(smallerPivotPtr, rightPtr, offset, q);

    }

    /**
     * @param leftPtr    points to the leftmost suffix of the first swap space.
     * @param rightPtr   points to the rightmost suffix of the first swap space.
     * @param swapEndPtr points to the leftmost suffix of the second swap space.
     */
    private void vectorSwap(int leftPtr, int rightPtr, int swapEndPtr) {
        int swapTmp = suffixArray[swapEndPtr];
        while (leftPtr < rightPtr) {
            suffixArray[swapEndPtr] = suffixArray[rightPtr];
            swapEndPtr--;
            suffixArray[rightPtr] = suffixArray[swapEndPtr];
            rightPtr--;
        }
        suffixArray[swapEndPtr] = suffixArray[leftPtr];
        suffixArray[leftPtr] = swapTmp;

    }

    /**
     * Sorts the alphabet concerning some weight concerning cc bucket size and alphabet
     * frequency Only works for mapped string with alphabet [0,alphaSize]
     *
     * @param buckets               - the bucket table
     * @param bucketsInLevel2Bucket - number of subbuckets of level-2 buckets
     * @return the order of the alphabet according to the weight on buckets with same
     * first and second character
     */
    private int[] getCharWeightedOrder_Alphabet(int[] buckets, int bucketsInLevel2Bucket) {
        int alphabetSize = alphabet.size;
        int[] charWeight = new int[alphabetSize];
        int tmpBucketFactor = bucketsInLevel2Bucket * (alphabetSize + 1);
        int i;
        for (i = 0; i < alphabetSize; i++) {
            charWeight[i] = alphabet.charFreq[i];
            charWeight[i] -= buckets[i * tmpBucketFactor + bucketsInLevel2Bucket]
                - buckets[i * tmpBucketFactor];
        }

        int[] targetCharArray = new int[alphabetSize + 1];
        for (i = 0; i < alphabetSize; i++) {
            targetCharArray[i] = i;
        }
        for (i = 1; i < alphabet.size; i++) {
            int tmpWeight = charWeight[i];
            int j = i;
            while (j > 0 && tmpWeight < charWeight[targetCharArray[j - 1]]) {
                targetCharArray[j] = targetCharArray[j - 1];
                j--;
            }
            targetCharArray[j] = i;
        }
        return targetCharArray;
    }

    /**
     * Constructs all buckets w.r.t. q-gram size q, the up to prefix q sorted suffix
     * array, and the bucket-pointer table.
     *
     * @param q size of q-gram.
     * @return Buckets containing pointers into the suffix array.
     */
    private int[] determine_Buckets_Sarray_Sptrmap(int q) {

        if (kbs_getExp_Ulong(2, alphabet.size) >= 0) {
            return determinePower2Alpha_Buckets_Sarray_Sptrmap(q);
        } else {
            return determineAll_Buckets_Sarray_Sptrmap(q);
        }
    }

    /**
     * Constructs all buckets w.r.t. q-gram size q, the up to prefix q sorted suffix
     * array, and the bucket-pointer table.
     *
     * @param q size of q-gram.
     * @return Buckets containing pointers into the suffix array.
     * @see #determine_Buckets_Sarray_Sptrmap
     */
    private int[] determineAll_Buckets_Sarray_Sptrmap(int q) {
        int[] buckets = determineAll_Buckets_Sarray(q);
        int strLen = length;
        sufPtrMap = new int[strLen + 2 * q + 1];

        /* computation of first hashvalue */
        int alphabetSize = alphabet.size;
        int mappedUcharArray = 0;
        int tempPower = 1;
        int hashCode = 0;
        int i;
        for (i = q - 1; i >= 0; i--) {
            hashCode += seq[start + mappedUcharArray + i] * tempPower;
            tempPower *= alphabetSize;
        }
        int tempModulo = kbs_power_Ulong(alphabetSize, q - 1);
        mappedUcharArray += q;
        int j;
        for (j = 0; j < strLen - 1; j++) {
            sufPtrMap[j] = (buckets[hashCode + 1]) - 1;
            hashCode -= (seq[start + mappedUcharArray - q]) * tempModulo;
            hashCode *= alphabetSize;
            hashCode += seq[start + mappedUcharArray];
            mappedUcharArray++;
        }
        sufPtrMap[j] = buckets[hashCode];
        /* set the values in sufPtrMap[strLen..strLen+2*d] to [-1, -2, ..., -2*d] */
        int beginPtr = -1;
        for (j = strLen; j <= strLen + 2 * q; j++) {
            sufPtrMap[j] = beginPtr--;
        }
        return buckets;
    }

    /**
     * Constructs all buckets w.r.t. q-gram size and the up to prefix q sorted suffix
     * array. Call determine_Buckets_Sarray(const Kbs_Ustring *const ustring, register
     * const Kbs_Ulong q, Kbs_Ulong **suffixArrayPtr) instead
     *
     * @param q size of q-gram.
     * @return Buckets containing pointers into the suffix array.
     * @see #determine_Buckets_Sarray_Sptrmap(int)
     */
    private int[] determineAll_Buckets_Sarray(int q) {

        int strLen = length;
        int alphabetSize = alphabet.size;
        int numberBuckets = kbs_power_Ulong(alphabetSize, q);
        int[] buckets = new int[numberBuckets + 1];
        for (int i = 0; i < q; i++) {
            seq[start + length + i] = alphabet.charArray[0];
        }
        for (int i = 0; i < KBS_STRING_EXTENSION_SIZE - q; i++) {
            seq[start + length + i + q] = 0;
        }
        /* computation of first hashvalue */
        int[] alphaMap = alphabet.alphaMapping;
        int mappedUcharArray = 0;
        int hashCode = 0;
        int tempPower = 1;
        int i;
        for (i = q - 1; i >= 0; i--) {
            hashCode += (seq[start + mappedUcharArray + i] = alphaMap[seq[start
                + mappedUcharArray + i]])
                * tempPower;
            tempPower *= alphabetSize;
        }
        int firstHashCode = hashCode;
        /* computation of the size of buckets */
        int tempModulo = kbs_power_Ulong(alphabetSize, q - 1);
        mappedUcharArray += q;
        buckets[hashCode]++;
        int j;
        for (j = 1; j < strLen; j++) {
            hashCode -= (seq[start + mappedUcharArray - q]) * tempModulo;
            hashCode *= alphabetSize;
            hashCode += seq[start + mappedUcharArray] = alphaMap[seq[start
                + mappedUcharArray]];
            mappedUcharArray++;
            buckets[hashCode]++;
        }
        /* update the alphabet for mapped string */
        for (j = 0; j < alphabetSize; j++) {
            alphabet.charFreq[j] = alphabet.charFreq[alphabet.charArray[j]];
            alphabet.charArray[j] = j;
            alphaMap[j] = j;
        }
        for (; j < KBS_MAX_ALPHABET_SIZE; j++) {
            alphaMap[j] = -1;
        }

        this.suffixArray = new int[strLen + 1];
        /* computation of the bucket pointers, pointers into the suffix array */
        for (j = 1; j <= numberBuckets; j++) {
            buckets[j] = buckets[j - 1] + buckets[j];
        }

        /* computation of the suffix array (buckets that are copied later are left out) */
        int[] charRank = getCharWeightedRank_Alphabet(buckets, q);

        mappedUcharArray = q;
        hashCode = firstHashCode;
        for (j = 0; j < strLen - 1; j++) {
            int c1;
            buckets[hashCode]--;
            if ((c1 = charRank[seq[start + mappedUcharArray - q]]) < charRank[seq[start
                + mappedUcharArray + 1 - q]]
                && c1 <= charRank[seq[start + mappedUcharArray + 2 - q]]) suffixArray[buckets[hashCode]] = j;
            hashCode -= (seq[start + mappedUcharArray - q]) * tempModulo;
            hashCode *= alphabetSize;
            hashCode += seq[start + mappedUcharArray];
            mappedUcharArray++;
        }

        buckets[hashCode]--;
        suffixArray[buckets[hashCode]] = strLen - 1;

        buckets[numberBuckets] = strLen;
        return buckets;
    }

    /**
     * Constructs all buckets w.r.t. q-gram size q, the up to prefix length q sorted
     * suffix array, and the bucket-pointer table.
     *
     * @param q size of q-gram.
     * @return Buckets containing pointers into the suffix array.
     * @see #determine_Buckets_Sarray_Sptrmap
     */
    private int[] determinePower2Alpha_Buckets_Sarray_Sptrmap(int q) {
        int strLen = length;
        int exp2 = kbs_getExp_Ulong(2, alphabet.size);
        if (exp2 < 0) {
            throw new RuntimeException("value out of bounds");
        }
        int[] buckets = determinePower2Alpha_Buckets_Sarray(q);
        this.sufPtrMap = new int[strLen + 2 * q + 1];
        int mappedUcharArray = 0;
        int hashCode = 0;
        int j;
        for (j = 0; j < q; j++) {
            hashCode = hashCode << exp2;
            hashCode += seq[start + mappedUcharArray + j];
        }
        int tempModulo = 0;
        tempModulo = ~tempModulo;
        tempModulo = tempModulo << (exp2 * (q - 1));
        tempModulo = ~tempModulo;
        mappedUcharArray += q;
        for (j = 0; j < strLen - 1; j++) {
            sufPtrMap[j] = (buckets[hashCode + 1]) - 1;
            hashCode = hashCode & tempModulo;
            hashCode = hashCode << exp2;
            hashCode = hashCode | seq[start + mappedUcharArray];
            mappedUcharArray++;
        }
        sufPtrMap[j] = buckets[hashCode];
        int beginPtr = -1;
        for (j = strLen; j <= strLen + 2 * q; j++) {
            sufPtrMap[j] = beginPtr--;
        }

        return buckets;
    }

    private int kbs_power_Ulong(int base, int exp) {
        int p;
        if (exp == 0) {
            return 1;
        }
        if (exp == 1) {
            return base;
        }
        if (base == 4) {
            if (exp > 15) {
                throw new RuntimeException();
            }
            return 4 << (2 * (exp - 1));
        }
        p = 1;
        for (; exp > 0; --exp) {
            p = p * base;
        }
        return p;
    }

    /**
     * Constructs all buckets w.r.t. q-gram size q and the up to prefix q sorted suffix
     * array. Precondition: ustring->alphabet->alphaSize = 2^x for some x; otherwise, call
     * determine_Buckets_Sarray.
     *
     * @param q size of q-gram.
     * @return Buckets containing pointers into the suffix array.
     * @see #determine_Buckets_Sarray_Sptrmap(int)
     */
    private int[] determinePower2Alpha_Buckets_Sarray(int q) {
        int exp2 = kbs_getExp_Ulong(2, alphabet.size);
        int strLen = length;
        int mappedUcharArray = 0;
        for (int i = 0; i < q; i++) {
            seq[start + length + i] = alphabet.charArray[0];
        }
        for (int i = length + q; i < length + KBS_STRING_EXTENSION_SIZE - q; i++) {
            seq[start + i] = 0;
        }
        int numberBuckets = kbs_power_Ulong(alphabet.size, q);
        int[] buckets = new int[numberBuckets + 1];
        int hashCode = 0;
        for (int j = 0; j < q; j++) {
            hashCode = hashCode << exp2;
            hashCode += (seq[start + mappedUcharArray + j] = alphabet.alphaMapping[seq[start
                + mappedUcharArray + j]]);
        }
        int firstHashCode = hashCode;

        int tempModulo = 0;
        tempModulo = ~tempModulo;
        tempModulo = tempModulo << (exp2 * (q - 1));
        tempModulo = ~tempModulo;
        mappedUcharArray += q;
        buckets[hashCode]++;

        for (int j = 1; j < strLen; j++) {
            hashCode = hashCode & tempModulo;
            hashCode = hashCode << exp2;
            hashCode = hashCode
                | (seq[start + mappedUcharArray] = alphabet.alphaMapping[seq[start
                + mappedUcharArray]]);
            mappedUcharArray++;
            buckets[hashCode]++;
        }

        /* update the alphabet for mapped string */
        int j;
        for (j = 0; j < alphabet.size; j++) {
            alphabet.charFreq[j] = alphabet.charFreq[alphabet.charArray[j]];
            alphabet.charArray[j] = j;
            alphabet.alphaMapping[j] = j;
        }
        for (; j < KBS_MAX_ALPHABET_SIZE; j++) {
            alphabet.alphaMapping[j] = -1;
        }

        this.suffixArray = new int[strLen + 1];

        /* computation of the bucket pointers, pointers into the suffix array */
        for (j = 1; j <= numberBuckets; j++) {
            buckets[j] = buckets[j - 1] + buckets[j];
        }

        /* computation of the suffix array */
        int[] charRank = getCharWeightedRank_Alphabet(buckets, q);
        mappedUcharArray = q;
        hashCode = firstHashCode;
        for (j = 0; j < strLen - 1; j++) {
            int c1;
            buckets[hashCode]--;
            if ((c1 = charRank[seq[start + mappedUcharArray - q]]) < charRank[seq[start
                + mappedUcharArray + 1 - q]]
                && (c1 <= charRank[seq[start + mappedUcharArray + 2 - q]])) suffixArray[buckets[hashCode]] = j;
            hashCode = hashCode & tempModulo;
            hashCode = hashCode << exp2;
            hashCode = hashCode | (seq[start + mappedUcharArray]);

            mappedUcharArray++;
        }
        buckets[hashCode]--;
        suffixArray[buckets[hashCode]] = strLen - 1;

        buckets[numberBuckets] = strLen;
        return buckets;
    }

    /**
     * Sorts the alphabet regarding some weight according to cc bucket size and alphabet
     * frequency Only works for mapped string with alphabet [0,alphaSize]
     *
     * @param buckets - the bucket table
     * @param q       - the initial q-gram size
     * @return the rank of each character
     */
    private int[] getCharWeightedRank_Alphabet(int[] buckets, int q) {

        int alphabetSize = alphabet.size;
        int[] charWeight = new int[alphabetSize];
        int bucketsInLevel2Bucket = kbs_power_Ulong(alphabetSize, q - 2);
        int tmpBucketFactor = bucketsInLevel2Bucket * (alphabetSize + 1);
        int i;
        charWeight[0] = alphabet.charFreq[0];
        charWeight[0] -= buckets[bucketsInLevel2Bucket - 1];
        for (i = 1; i < alphabetSize - 1; i++) {
            charWeight[i] = alphabet.charFreq[i];
            charWeight[i] -= buckets[i * tmpBucketFactor + bucketsInLevel2Bucket - 1]
                - buckets[i * tmpBucketFactor - 1];
        }
        charWeight[alphabetSize - 1] = alphabet.charFreq[i];
        charWeight[alphabetSize - 1] -= buckets[(alphabetSize - 1) * tmpBucketFactor
            + bucketsInLevel2Bucket - 1]
            - buckets[(alphabetSize - 1) * tmpBucketFactor - 1];

        int[] targetCharArray = new int[alphabetSize];
        for (i = 0; i < alphabetSize; i++) {
            targetCharArray[i] = i;
        }
        /* insertion sort by charWeight */
        for (i = 1; i < alphabet.size; i++) {
            int tmpWeight = charWeight[i];
            int j = i;
            while (j > 0 && tmpWeight < charWeight[targetCharArray[j - 1]]) {
                targetCharArray[j] = targetCharArray[j - 1];
                j--;
            }
            targetCharArray[j] = i;
        }
        int[] charRank = new int[alphabetSize + 1];
        for (i = 0; i < alphabetSize; i++) {
            charRank[targetCharArray[i]] = i;
        }
        return charRank;
    }

    /**
     *
     */
    private int kbs_getExp_Ulong(int base, int value) {
        int exp = 0;
        int tmpValue = 1;
        while (tmpValue < value) {
            tmpValue *= base;
            exp++;
        }
        if (tmpValue == value) {
            return exp;
        } else {
            return -1;
        }

    }

    /**
     * @param a first key
     * @param b second key
     * @param c third key
     * @return 0 if a is the median, 1 if b is the median, 2 if c is the median.
     */
    private int medianOfThreeUlong(int a, int b, int c) {
        if (a == b || a == c) {
            return 0;
        }
        if (b == c) {
            return 2;
        }
        return a < b ? (b < c ? 1 : (a < c ? 2 : 0)) : (b > c ? 1 : (a < c ? 0 : 2));
    }
}