org.apache.tools.bzip2.BlockSort Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tajin-all Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.tools.bzip2;

import java.util.BitSet;

/**
 * Encapsulates the Burrows-Wheeler sorting algorithm needed by {@link
 * CBZip2OutputStream}.
 *
 * This class is based on a Java port of Julian Seward's
 * blocksort.c in his libbzip2
 *
 * The Burrows-Wheeler transform is a reversible transform of the
 * original data that is supposed to group similiar bytes close to
 * each other.  The idea is to sort all permutations of the input and
 * only keep the last byte of each permutation.  E.g. for "Commons
 * Compress" you'd get:
 *
 *  *  CompressCommons
 * Commons Compress
 * CompressCommons 
 * essCommons Compr
 * mmons CompressCo
 * mons CompressCom
 * mpressCommons Co
 * ns CompressCommo
 * ommons CompressC
 * ompressCommons C
 * ons CompressComm
 * pressCommons Com
 * ressCommons Comp
 * s CompressCommon
 * sCommons Compres
 * ssCommons Compre
 * 
 *
 * Which results in a new text "ss romooCCmmpnse", in adition the
 * index of the first line that contained the original text is kept -
 * in this case it is 1.  The idea is that in a long English text all
 * permutations that start with "he" are likely suffixes of a "the" and
 * thus they end in "t" leading to a larger block of "t"s that can
 * better be compressed by the subsequent Move-to-Front, run-length
 * und Huffman encoding steps.
 *
 * For more information see for example:
 * 
 *   Burrows,
 *   M. and Wheeler, D.: A Block-sorting Lossless Data Compression
 *   Algorithm
 *   Manber, U. and
 *   Myers, G.: Suffix arrays: A new method for on-line string
 *   searches
 *   Bentley,
 *   J.L. and Sedgewick, R.: Fast Algorithms for Sorting and Searching
 *   Strings
 * 
 *
 * @NotThreadSafe
 */
class BlockSort {

    /*
     * Some of the constructs used in the C code cannot be ported
     * literally to Java - for example macros, unsigned types.  Some
     * code has been hand-tuned to improve performance.  In order to
     * avoid memory pressure some structures are reused for several
     * blocks and some memory is even shared between sorting and the
     * MTF stage even though either algorithm uses it for its own
     * purpose.
     *
     * Comments preserved from the actual C code are prefixed with
     * "LBZ2:".
     */

    /*
     * 2012-05-20 Stefan Bodewig:
     *
     * This class seems to mix several revisions of libbzip2's code.
     * The mainSort function and those used by it look closer to the
     * 0.9.5 version but show some variations introduced later.  At
     * the same time the logic of Compress 1.4 to randomize the block
     * on bad input has been dropped after libbzip2 0.9.0 and replaced
     * by a fallback sorting algorithm.
     *
     * I've added the fallbackSort function of 1.0.6 and tried to
     * integrate it with the existing code without touching too much.
     * I've also removed the now unused randomization code.
     */

    /*
     * LBZ2: If you are ever unlucky/improbable enough to get a stack
     * overflow whilst sorting, increase the following constant and
     * try again. In practice I have never seen the stack go above 27
     * elems, so the following limit seems very generous.
     */
    private static final int QSORT_STACK_SIZE = 1000;

    private static final int FALLBACK_QSORT_STACK_SIZE = 100;

    private static final int STACK_SIZE =
        QSORT_STACK_SIZE < FALLBACK_QSORT_STACK_SIZE
        ? FALLBACK_QSORT_STACK_SIZE : QSORT_STACK_SIZE;

    /*
     * Used when sorting. If too many long comparisons happen, we stop sorting,
     * and use fallbackSort instead.
     */
    private int workDone;
    private int workLimit;
    private boolean firstAttempt;

    private final int[] stack_ll = new int[STACK_SIZE]; // 4000 byte
    private final int[] stack_hh = new int[STACK_SIZE]; // 4000 byte
    private final int[] stack_dd = new int[QSORT_STACK_SIZE]; // 4000 byte

    private final int[] mainSort_runningOrder = new int[256]; // 1024 byte
    private final int[] mainSort_copy = new int[256]; // 1024 byte
    private final boolean[] mainSort_bigDone = new boolean[256]; // 256 byte

    private final int[] ftab = new int[65537]; // 262148 byte

    /**
     * Array instance identical to Data's sfmap, both are used only
     * temporarily and indepently, so we do not need to allocate
     * additional memory.
     */
    private final char[] quadrant;

    BlockSort(final CBZip2OutputStream.Data data) {
        this.quadrant = data.sfmap;
    }

    void blockSort(final CBZip2OutputStream.Data data, final int last) {
        this.workLimit = WORK_FACTOR * last;
        this.workDone = 0;
        this.firstAttempt = true;

        if (last + 1 < 10000) {
            fallbackSort(data, last);
        } else {
            mainSort(data, last);

            if (this.firstAttempt && (this.workDone > this.workLimit)) {
                fallbackSort(data, last);
            }
        }

        final int[] fmap = data.fmap;
        data.origPtr = -1;
        for (int i = 0; i <= last; i++) {
            if (fmap[i] == 0) {
                data.origPtr = i;
                break;
            }
        }

        // assert (data.origPtr != -1) : data.origPtr;
    }

    /**
     * Adapt fallbackSort to the expected interface of the rest of the
     * code, in particular deal with the fact that block starts at
     * offset 1 (in libbzip2 1.0.6 it starts at 0).
     */
    final void fallbackSort(final CBZip2OutputStream.Data data,
                            final int last) {
        data.block[0] = data.block[last + 1];
        fallbackSort(data.fmap, data.block, last + 1);
        for (int i = 0; i < last + 1; i++) {
            --data.fmap[i];
        }
        for (int i = 0; i < last + 1; i++) {
            if (data.fmap[i] == -1) {
                data.fmap[i] = last;
                break;
            }
        }
    }

/*---------------------------------------------*/

/*---------------------------------------------*/
/*--- LBZ2: Fallback O(N log(N)^2) sorting        ---*/
/*--- algorithm, for repetitive blocks      ---*/
/*---------------------------------------------*/

    /*
     * This is the fallback sorting algorithm libbzip2 1.0.6 uses for
     * repetitive or very short inputs.
     *
     * The idea is inspired by Manber-Myers string suffix sorting
     * algorithm.  First a bucket sort places each permutation of the
     * block into a bucket based on its first byte.  Permutations are
     * represented by pointers to their first character kept in
     * (partially) sorted order inside the array ftab.
     *
     * The next step visits all buckets in order and performs a
     * quicksort on all permutations of the bucket based on the index
     * of the bucket the second byte of the permutation belongs to,
     * thereby forming new buckets.  When arrived here the
     * permutations are sorted up to the second character and we have
     * buckets of permutations that are identical up to two
     * characters.
     *
     * Repeat the step of quicksorting each bucket, now based on the
     * bucket holding the sequence of the third and forth character
     * leading to four byte buckets.  Repeat this doubling of bucket
     * sizes until all buckets only contain single permutations or the
     * bucket size exceeds the block size.
     *
     * I.e.
     *
     * "abraba" form three buckets for the chars "a", "b", and "r" in
     * the first step with
     *
     * fmap = { 'a:' 5, 3, 0, 'b:' 4, 1, 'r', 2 }
     *
     * when looking at the bucket of "a"s the second characters are in
     * the buckets that start with fmap-index 0 (rolled over), 3 and 3
     * respectively, forming two new buckets "aa" and "ab", so we get
     *
     * fmap = { 'aa:' 5, 'ab:' 3, 0, 'ba:' 4, 'br': 1, 'ra:' 2 }
     *
     * since the last bucket only contained a single item it didn't
     * have to be sorted at all.
     *
     * There now is just one bucket with more than one permutation
     * that remains to be sorted.  For the permutation that starts
     * with index 3 the third and forth char are in bucket 'aa' at
     * index 0 and for the one starting at block index 0 they are in
     * bucket 'ra' with sort index 5.  The fully sorted order then becomes.
     *
     * fmap = { 5, 3, 0, 4, 1, 2 }
     * 
     */

    /**
     * @param fmap points to the index of the starting point of a
     *        permutation inside the block of data in the current
     *        partially sorted order
     * @param eclass points from the index of a character inside the
     *        block to the first index in fmap that contains the
     *        bucket of its suffix that is sorted in this step.
     * @param lo lower boundary of the fmap-interval to be sorted 
     * @param hi upper boundary of the fmap-interval to be sorted 
     */
    private void fallbackSimpleSort(int[] fmap, 
                                    int[] eclass, 
                                    int lo, 
                                    int hi) {
        if (lo == hi) {
            return;
        }

        int j;
        if (hi - lo > 3) {
            for (int i = hi - 4; i >= lo; i--) {
                int tmp = fmap[i];
                int ec_tmp = eclass[tmp];
                for (j = i + 4; j <= hi && ec_tmp > eclass[fmap[j]];
                     j += 4) {
                    fmap[j - 4] = fmap[j];
                }
                fmap[j - 4] = tmp;
            }
        }

        for (int i = hi - 1; i >= lo; i--) {
            int tmp = fmap[i];
            int ec_tmp = eclass[tmp];
            for (j = i + 1; j <= hi && ec_tmp > eclass[fmap[j]]; j++) {
                fmap[j - 1] = fmap[j];
            }
            fmap[j-1] = tmp;
        }
    }

    private static final int FALLBACK_QSORT_SMALL_THRESH = 10;

    /**
     * swaps two values in fmap
     */
    private void fswap(int[] fmap, int zz1, int zz2) {
        int zztmp = fmap[zz1];
        fmap[zz1] = fmap[zz2];
        fmap[zz2] = zztmp;
    }

    /**
     * swaps two intervals starting at yyp1 and yyp2 of length yyn inside fmap.
     */
    private void fvswap(int[] fmap, int yyp1, int yyp2, int yyn) {
        while (yyn > 0) {
            fswap(fmap, yyp1, yyp2);
            yyp1++; yyp2++; yyn--;
        }
    }

    private int fmin(int a, int b) {
        return a < b ? a : b;
    }

    private void fpush(int sp, int lz, int hz) {
        stack_ll[sp] = lz;
        stack_hh[sp] = hz;
    }

    private int[] fpop(int sp) {
        return new int[] { stack_ll[sp], stack_hh[sp] };
    }

    /**
     * @param fmap points to the index of the starting point of a
     *        permutation inside the block of data in the current
     *        partially sorted order
     * @param eclass points from the index of a character inside the
     *        block to the first index in fmap that contains the
     *        bucket of its suffix that is sorted in this step.
     * @param loSt lower boundary of the fmap-interval to be sorted 
     * @param hiSt upper boundary of the fmap-interval to be sorted 
     */
    private void fallbackQSort3(int[] fmap, 
                                int[] eclass, 
                                int loSt, 
                                int hiSt) {
        int lo, unLo, ltLo, hi, unHi, gtHi, n;

        long r = 0;
        int sp = 0;
        fpush(sp++, loSt, hiSt);

        while (sp > 0) {
            int[] s = fpop(--sp);
            lo = s[0]; hi = s[1];

            if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
                fallbackSimpleSort(fmap, eclass, lo, hi);
                continue;
            }

            /* LBZ2: Random partitioning.  Median of 3 sometimes fails to
               avoid bad cases.  Median of 9 seems to help but 
               looks rather expensive.  This too seems to work but
               is cheaper.  Guidance for the magic constants 
               7621 and 32768 is taken from Sedgewick's algorithms
               book, chapter 35.
            */
            r = ((r * 7621) + 1) % 32768;
            long r3 = r % 3, med;
            if (r3 == 0) {
                med = eclass[fmap[lo]]; 
            } else if (r3 == 1) {
                med = eclass[fmap[(lo + hi) >>> 1]];
            } else {
                med = eclass[fmap[hi]];
            }

            unLo = ltLo = lo;
            unHi = gtHi = hi;

            // looks like the ternary partition attributed to Wegner
            // in the cited Sedgewick paper
            while (true) {
                while (true) {
                    if (unLo > unHi) {
                        break;
                    }
                    n = eclass[fmap[unLo]] - (int) med;
                    if (n == 0) { 
                        fswap(fmap, unLo, ltLo); 
                        ltLo++; unLo++; 
                        continue; 
                    }
                    if (n > 0) {
                        break;
                    }
                    unLo++;
                }
                while (true) {
                    if (unLo > unHi) {
                        break;
                    }
                    n = eclass[fmap[unHi]] - (int) med;
                    if (n == 0) {
                        fswap(fmap, unHi, gtHi); 
                        gtHi--; unHi--; 
                        continue; 
                    }
                    if (n < 0) {
                        break;
                    }
                    unHi--;
                }
                if (unLo > unHi) {
                    break;
                }
                fswap(fmap, unLo, unHi); unLo++; unHi--;
            }

            if (gtHi < ltLo) {
                continue;
            }

            n = fmin(ltLo - lo, unLo - ltLo);
            fvswap(fmap, lo, unLo - n, n);
            int m = fmin(hi - gtHi, gtHi - unHi);
            fvswap(fmap, unHi + 1, hi - m + 1, m);

            n = lo + unLo - ltLo - 1;
            m = hi - (gtHi - unHi) + 1;

            if (n - lo > hi - m) {
                fpush(sp++, lo, n);
                fpush(sp++, m, hi);
            } else {
                fpush(sp++, m, hi);
                fpush(sp++, lo, n);
            }
        }
    }


/*---------------------------------------------*/

    private int[] eclass;

    private int[] getEclass() {
        return eclass == null
            ? (eclass = new int[quadrant.length / 2]) : eclass;
    }

    /*
     * The C code uses an array of ints (each int holding 32 flags) to
     * represents the bucket-start flags (bhtab).  It also contains
     * optimizations to skip over 32 consecutively set or
     * consecutively unset bits on word boundaries at once.  For now
     * I've chosen to use the simpler but potentially slower code
     * using BitSet - also in the hope that using the BitSet#nextXXX
     * methods may be fast enough.
     */

    /**
     * @param fmap points to the index of the starting point of a
     *        permutation inside the block of data in the current
     *        partially sorted order
     * @param block the original data
     * @param nblock size of the block
     * @param off offset of first byte to sort in block
     */
    final void fallbackSort(int[] fmap, byte[] block, int nblock) {
        final int[] ftab = new int[257];
        int H, i, j, k, l, r, cc, cc1;
        int nNotDone;
        int nBhtab;
        final int[] eclass = getEclass();

        for (i = 0; i < nblock; i++) {
            eclass[i] = 0;
        }
        /*--
          LBZ2: Initial 1-char radix sort to generate
          initial fmap and initial BH bits.
          --*/
        for (i = 0; i < nblock; i++) {
            ftab[block[i] & 0xff]++;
        }
        for (i = 1; i < 257;    i++) {
            ftab[i] += ftab[i - 1];
        }

        for (i = 0; i < nblock; i++) {
            j = block[i] & 0xff;
            k = ftab[j] - 1;
            ftab[j] = k;
            fmap[k] = i;
        }

        nBhtab = 64 + nblock;
        BitSet bhtab = new BitSet(nBhtab);
        for (i = 0; i < 256; i++) {
            bhtab.set(ftab[i]);
        }

        /*--
          LBZ2: Inductively refine the buckets.  Kind-of an
          "exponential radix sort" (!), inspired by the
          Manber-Myers suffix array construction algorithm.
          --*/

        /*-- LBZ2: set sentinel bits for block-end detection --*/
        for (i = 0; i < 32; i++) { 
            bhtab.set(nblock + 2 * i);
            bhtab.clear(nblock + 2 * i + 1);
        }

        /*-- LBZ2: the log(N) loop --*/
        H = 1;
        while (true) {

            j = 0;
            for (i = 0; i < nblock; i++) {
                if (bhtab.get(i)) {
                    j = i;
                }
                k = fmap[i] - H;
                if (k < 0) {
                    k += nblock;
                }
                eclass[k] = j;
            }

            nNotDone = 0;
            r = -1;
            while (true) {

                /*-- LBZ2: find the next non-singleton bucket --*/
                k = r + 1;
                k = bhtab.nextClearBit(k);
                l = k - 1;
                if (l >= nblock) {
                    break;
                }
                k = bhtab.nextSetBit(k + 1);
                r = k - 1;
                if (r >= nblock) {
                    break;
                }

                /*-- LBZ2: now [l, r] bracket current bucket --*/
                if (r > l) {
                    nNotDone += (r - l + 1);
                    fallbackQSort3(fmap, eclass, l, r);

                    /*-- LBZ2: scan bucket and generate header bits-- */
                    cc = -1;
                    for (i = l; i <= r; i++) {
                        cc1 = eclass[fmap[i]];
                        if (cc != cc1) {
                            bhtab.set(i);
                            cc = cc1;
                        }
                    }
                }
            }

            H *= 2;
            if (H > nblock || nNotDone == 0) {
                break;
            }
        }
    }

/*---------------------------------------------*/

    /*
     * LBZ2: Knuth's increments seem to work better than Incerpi-Sedgewick here.
     * Possibly because the number of elems to sort is usually small, typically
     * <= 20.
     */
    private static final int[] INCS = { 1, 4, 13, 40, 121, 364, 1093, 3280,
                                        9841, 29524, 88573, 265720, 797161,
                                        2391484 };

    /**
     * This is the most hammered method of this class.
     *
     * 
     * This is the version using unrolled loops. Normally I never use such ones
     * in Java code. The unrolling has shown a noticable performance improvement
     * on JRE 1.4.2 (Linux i586 / HotSpot Client). Of course it depends on the
     * JIT compiler of the vm.
     * 
     */
    private boolean mainSimpleSort(final CBZip2OutputStream.Data dataShadow,
                                   final int lo, final int hi, final int d,
                                   final int lastShadow) {
        final int bigN = hi - lo + 1;
        if (bigN < 2) {
            return this.firstAttempt && (this.workDone > this.workLimit);
        }

        int hp = 0;
        while (INCS[hp] < bigN) {
            hp++;
        }

        final int[] fmap = dataShadow.fmap;
        final char[] quadrant = this.quadrant;
        final byte[] block = dataShadow.block;
        final int lastPlus1 = lastShadow + 1;
        final boolean firstAttemptShadow = this.firstAttempt;
        final int workLimitShadow = this.workLimit;
        int workDoneShadow = this.workDone;

        // Following block contains unrolled code which could be shortened by
        // coding it in additional loops.

        HP: while (--hp >= 0) {
            final int h = INCS[hp];
            final int mj = lo + h - 1;

            for (int i = lo + h; i <= hi;) {
                // copy
                for (int k = 3; (i <= hi) && (--k >= 0); i++) {
                    final int v = fmap[i];
                    final int vd = v + d;
                    int j = i;

                    // for (int a;
                    // (j > mj) && mainGtU((a = fmap[j - h]) + d, vd,
                    // block, quadrant, lastShadow);
                    // j -= h) {
                    // fmap[j] = a;
                    // }
                    //
                    // unrolled version:

                    // start inline mainGTU
                    boolean onceRunned = false;
                    int a = 0;

                    HAMMER: while (true) {
                        if (onceRunned) {
                            fmap[j] = a;
                            if ((j -= h) <= mj) {
                                break HAMMER;
                            }
                        } else {
                            onceRunned = true;
                        }

                        a = fmap[j - h];
                        int i1 = a + d;
                        int i2 = vd;

                        // following could be done in a loop, but
                        // unrolled it for performance:
                        if (block[i1 + 1] == block[i2 + 1]) {
                            if (block[i1 + 2] == block[i2 + 2]) {
                                if (block[i1 + 3] == block[i2 + 3]) {
                                    if (block[i1 + 4] == block[i2 + 4]) {
                                        if (block[i1 + 5] == block[i2 + 5]) {
                                            if (block[(i1 += 6)] == block[(i2 += 6)]) {
                                                int x = lastShadow;
                                                X: while (x > 0) {
                                                    x -= 4;

                                                    if (block[i1 + 1] == block[i2 + 1]) {
                                                        if (quadrant[i1] == quadrant[i2]) {
                                                            if (block[i1 + 2] == block[i2 + 2]) {
                                                                if (quadrant[i1 + 1] == quadrant[i2 + 1]) {
                                                                    if (block[i1 + 3] == block[i2 + 3]) {
                                                                        if (quadrant[i1 + 2] == quadrant[i2 + 2]) {
                                                                            if (block[i1 + 4] == block[i2 + 4]) {
                                                                                if (quadrant[i1 + 3] == quadrant[i2 + 3]) {
                                                                                    if ((i1 += 4) >= lastPlus1) {
                                                                                        i1 -= lastPlus1;
                                                                                    }
                                                                                    if ((i2 += 4) >= lastPlus1) {
                                                                                        i2 -= lastPlus1;
                                                                                    }
                                                                                    workDoneShadow++;
                                                                                    continue X;
                                                                                } else if ((quadrant[i1 + 3] > quadrant[i2 + 3])) {
                                                                                    continue HAMMER;
                                                                                } else {
                                                                                    break HAMMER;
                                                                                }
                                                                            } else if ((block[i1 + 4] & 0xff) > (block[i2 + 4] & 0xff)) {
                                                                                continue HAMMER;
                                                                            } else {
                                                                                break HAMMER;
                                                                            }
                                                                        } else if ((quadrant[i1 + 2] > quadrant[i2 + 2])) {
                                                                            continue HAMMER;
                                                                        } else {
                                                                            break HAMMER;
                                                                        }
                                                                    } else if ((block[i1 + 3] & 0xff) > (block[i2 + 3] & 0xff)) {
                                                                        continue HAMMER;
                                                                    } else {
                                                                        break HAMMER;
                                                                    }
                                                                } else if ((quadrant[i1 + 1] > quadrant[i2 + 1])) {
                                                                    continue HAMMER;
                                                                } else {
                                                                    break HAMMER;
                                                                }
                                                            } else if ((block[i1 + 2] & 0xff) > (block[i2 + 2] & 0xff)) {
                                                                continue HAMMER;
                                                            } else {
                                                                break HAMMER;
                                                            }
                                                        } else if ((quadrant[i1] > quadrant[i2])) {
                                                            continue HAMMER;
                                                        } else {
                                                            break HAMMER;
                                                        }
                                                    } else if ((block[i1 + 1] & 0xff) > (block[i2 + 1] & 0xff)) {
                                                        continue HAMMER;
                                                    } else {
                                                        break HAMMER;
                                                    }

                                                }
                                                break HAMMER;
                                            } // while x > 0
                                            else {
                                                if ((block[i1] & 0xff) > (block[i2] & 0xff)) {
                                                    continue HAMMER;
                                                } else {
                                                    break HAMMER;
                                                }
                                            }
                                        } else if ((block[i1 + 5] & 0xff) > (block[i2 + 5] & 0xff)) {
                                            continue HAMMER;
                                        } else {
                                            break HAMMER;
                                        }
                                    } else if ((block[i1 + 4] & 0xff) > (block[i2 + 4] & 0xff)) {
                                        continue HAMMER;
                                    } else {
                                        break HAMMER;
                                    }
                                } else if ((block[i1 + 3] & 0xff) > (block[i2 + 3] & 0xff)) {
                                    continue HAMMER;
                                } else {
                                    break HAMMER;
                                }
                            } else if ((block[i1 + 2] & 0xff) > (block[i2 + 2] & 0xff)) {
                                continue HAMMER;
                            } else {
                                break HAMMER;
                            }
                        } else if ((block[i1 + 1] & 0xff) > (block[i2 + 1] & 0xff)) {
                            continue HAMMER;
                        } else {
                            break HAMMER;
                        }

                    } // HAMMER
                    // end inline mainGTU

                    fmap[j] = v;
                }

                if (firstAttemptShadow && (i <= hi)
                    && (workDoneShadow > workLimitShadow)) {
                    break HP;
                }
            }
        }

        this.workDone = workDoneShadow;
        return firstAttemptShadow && (workDoneShadow > workLimitShadow);
    }

/*--
   LBZ2: The following is an implementation of
   an elegant 3-way quicksort for strings,
   described in a paper "Fast Algorithms for
   Sorting and Searching Strings", by Robert
   Sedgewick and Jon L. Bentley.
--*/

    private static void vswap(int[] fmap, int p1, int p2, int n) {
        n += p1;
        while (p1 < n) {
            int t = fmap[p1];
            fmap[p1++] = fmap[p2];
            fmap[p2++] = t;
        }
    }

    private static byte med3(byte a, byte b, byte c) {
        return (a < b) ? (b < c ? b : a < c ? c : a) : (b > c ? b : a > c ? c
                                                        : a);
    }

    private static final int SMALL_THRESH = 20;
    private static final int DEPTH_THRESH = 10;
    private static final int WORK_FACTOR = 30;

    /**
     * Method "mainQSort3", file "blocksort.c", BZip2 1.0.2
     */
    private void mainQSort3(final CBZip2OutputStream.Data dataShadow,
                            final int loSt, final int hiSt, final int dSt,
                            final int last) {
        final int[] stack_ll = this.stack_ll;
        final int[] stack_hh = this.stack_hh;
        final int[] stack_dd = this.stack_dd;
        final int[] fmap = dataShadow.fmap;
        final byte[] block = dataShadow.block;

        stack_ll[0] = loSt;
        stack_hh[0] = hiSt;
        stack_dd[0] = dSt;

        for (int sp = 1; --sp >= 0;) {
            final int lo = stack_ll[sp];
            final int hi = stack_hh[sp];
            final int d = stack_dd[sp];

            if ((hi - lo < SMALL_THRESH) || (d > DEPTH_THRESH)) {
                if (mainSimpleSort(dataShadow, lo, hi, d, last)) {
                    return;
                }
            } else {
                final int d1 = d + 1;
                final int med = med3(block[fmap[lo] + d1],
                                     block[fmap[hi] + d1], block[fmap[(lo + hi) >>> 1] + d1]) & 0xff;

                int unLo = lo;
                int unHi = hi;
                int ltLo = lo;
                int gtHi = hi;

                while (true) {
                    while (unLo <= unHi) {
                        final int n = (block[fmap[unLo] + d1] & 0xff)
                            - med;
                        if (n == 0) {
                            final int temp = fmap[unLo];
                            fmap[unLo++] = fmap[ltLo];
                            fmap[ltLo++] = temp;
                        } else if (n < 0) {
                            unLo++;
                        } else {
                            break;
                        }
                    }

                    while (unLo <= unHi) {
                        final int n = (block[fmap[unHi] + d1] & 0xff)
                            - med;
                        if (n == 0) {
                            final int temp = fmap[unHi];
                            fmap[unHi--] = fmap[gtHi];
                            fmap[gtHi--] = temp;
                        } else if (n > 0) {
                            unHi--;
                        } else {
                            break;
                        }
                    }

                    if (unLo <= unHi) {
                        final int temp = fmap[unLo];
                        fmap[unLo++] = fmap[unHi];
                        fmap[unHi--] = temp;
                    } else {
                        break;
                    }
                }

                if (gtHi < ltLo) {
                    stack_ll[sp] = lo;
                    stack_hh[sp] = hi;
                    stack_dd[sp] = d1;
                    sp++;
                } else {
                    int n = ((ltLo - lo) < (unLo - ltLo)) ? (ltLo - lo)
                        : (unLo - ltLo);
                    vswap(fmap, lo, unLo - n, n);
                    int m = ((hi - gtHi) < (gtHi - unHi)) ? (hi - gtHi)
                        : (gtHi - unHi);
                    vswap(fmap, unLo, hi - m + 1, m);

                    n = lo + unLo - ltLo - 1;
                    m = hi - (gtHi - unHi) + 1;

                    stack_ll[sp] = lo;
                    stack_hh[sp] = n;
                    stack_dd[sp] = d;
                    sp++;

                    stack_ll[sp] = n + 1;
                    stack_hh[sp] = m - 1;
                    stack_dd[sp] = d1;
                    sp++;

                    stack_ll[sp] = m;
                    stack_hh[sp] = hi;
                    stack_dd[sp] = d;
                    sp++;
                }
            }
        }
    }

    private static final int SETMASK = (1 << 21);
    private static final int CLEARMASK = (~SETMASK);

    final void mainSort(final CBZip2OutputStream.Data dataShadow,
                        final int lastShadow) {
        final int[] runningOrder = this.mainSort_runningOrder;
        final int[] copy = this.mainSort_copy;
        final boolean[] bigDone = this.mainSort_bigDone;
        final int[] ftab = this.ftab;
        final byte[] block = dataShadow.block;
        final int[] fmap = dataShadow.fmap;
        final char[] quadrant = this.quadrant;
        final int workLimitShadow = this.workLimit;
        final boolean firstAttemptShadow = this.firstAttempt;

        // LBZ2: Set up the 2-byte frequency table
        for (int i = 65537; --i >= 0;) {
            ftab[i] = 0;
        }

        /*
         * In the various block-sized structures, live data runs from 0 to
         * last+NUM_OVERSHOOT_BYTES inclusive. First, set up the overshoot area
         * for block.
         */
        for (int i = 0; i < BZip2Constants.NUM_OVERSHOOT_BYTES; i++) {
            block[lastShadow + i + 2] = block[(i % (lastShadow + 1)) + 1];
        }
        for (int i = lastShadow + BZip2Constants.NUM_OVERSHOOT_BYTES +1; --i >= 0;) {
            quadrant[i] = 0;
        }
        block[0] = block[lastShadow + 1];

        // LBZ2: Complete the initial radix sort:

        int c1 = block[0] & 0xff;
        for (int i = 0; i <= lastShadow; i++) {
            final int c2 = block[i + 1] & 0xff;
            ftab[(c1 << 8) + c2]++;
            c1 = c2;
        }

        for (int i = 1; i <= 65536; i++) {
            ftab[i] += ftab[i - 1];
        }

        c1 = block[1] & 0xff;
        for (int i = 0; i < lastShadow; i++) {
            final int c2 = block[i + 2] & 0xff;
            fmap[--ftab[(c1 << 8) + c2]] = i;
            c1 = c2;
        }

        fmap[--ftab[((block[lastShadow + 1] & 0xff) << 8) + (block[1] & 0xff)]] = lastShadow;

        /*
         * LBZ2: Now ftab contains the first loc of every small bucket. Calculate the
         * running order, from smallest to largest big bucket.
         */
        for (int i = 256; --i >= 0;) {
            bigDone[i] = false;
            runningOrder[i] = i;
        }

        for (int h = 364; h != 1;) {
            h /= 3;
            for (int i = h; i <= 255; i++) {
                final int vv = runningOrder[i];
                final int a = ftab[(vv + 1) << 8] - ftab[vv << 8];
                final int b = h - 1;
                int j = i;
                for (int ro = runningOrder[j - h]; (ftab[(ro + 1) << 8] - ftab[ro << 8]) > a; ro = runningOrder[j
                                                                                                                - h]) {
                    runningOrder[j] = ro;
                    j -= h;
                    if (j <= b) {
                        break;
                    }
                }
                runningOrder[j] = vv;
            }
        }

        /*
         * LBZ2: The main sorting loop.
         */
        for (int i = 0; i <= 255; i++) {
            /*
             * LBZ2: Process big buckets, starting with the least full.
             */
            final int ss = runningOrder[i];

            // Step 1:
            /*
             * LBZ2: Complete the big bucket [ss] by quicksorting any unsorted small
             * buckets [ss, j]. Hopefully previous pointer-scanning phases have
             * already completed many of the small buckets [ss, j], so we don't
             * have to sort them at all.
             */
            for (int j = 0; j <= 255; j++) {
                final int sb = (ss << 8) + j;
                final int ftab_sb = ftab[sb];
                if ((ftab_sb & SETMASK) != SETMASK) {
                    final int lo = ftab_sb & CLEARMASK;
                    final int hi = (ftab[sb + 1] & CLEARMASK) - 1;
                    if (hi > lo) {
                        mainQSort3(dataShadow, lo, hi, 2, lastShadow);
                        if (firstAttemptShadow
                            && (this.workDone > workLimitShadow)) {
                            return;
                        }
                    }
                    ftab[sb] = ftab_sb | SETMASK;
                }
            }

            // Step 2:
            // LBZ2: Now scan this big bucket so as to synthesise the
            // sorted order for small buckets [t, ss] for all t != ss.

            for (int j = 0; j <= 255; j++) {
                copy[j] = ftab[(j << 8) + ss] & CLEARMASK;
            }

            for (int j = ftab[ss << 8] & CLEARMASK, hj = (ftab[(ss + 1) << 8] & CLEARMASK); j < hj; j++) {
                final int fmap_j = fmap[j];
                c1 = block[fmap_j] & 0xff;
                if (!bigDone[c1]) {
                    fmap[copy[c1]] = (fmap_j == 0) ? lastShadow : (fmap_j - 1);
                    copy[c1]++;
                }
            }

            for (int j = 256; --j >= 0;) {
                ftab[(j << 8) + ss] |= SETMASK;
            }

            // Step 3:
            /*
             * LBZ2: The ss big bucket is now done. Record this fact, and update the
             * quadrant descriptors. Remember to update quadrants in the
             * overshoot area too, if necessary. The "if (i < 255)" test merely
             * skips this updating for the last bucket processed, since updating
             * for the last bucket is pointless.
             */
            bigDone[ss] = true;

            if (i < 255) {
                final int bbStart = ftab[ss << 8] & CLEARMASK;
                final int bbSize = (ftab[(ss + 1) << 8] & CLEARMASK) - bbStart;
                int shifts = 0;

                while ((bbSize >> shifts) > 65534) {
                    shifts++;
                }

                for (int j = 0; j < bbSize; j++) {
                    final int a2update = fmap[bbStart + j];
                    final char qVal = (char) (j >> shifts);
                    quadrant[a2update] = qVal;
                    if (a2update < BZip2Constants.NUM_OVERSHOOT_BYTES) {
                        quadrant[a2update + lastShadow + 1] = qVal;
                    }
                }
            }

        }
    }

}