org.apache.commons.compress.harmony.pack200.BHSDCodec Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of commons-compress Show documentation
Apache Commons Compress defines an API for working with compression and archive formats. These include bzip2, gzip, pack200, LZMA, XZ, Snappy, traditional Unix Compress, DEFLATE, DEFLATE64, LZ4, Brotli, Zstandard and ar, cpio, jar, tar, zip, dump, 7z, arj.
There is a newer version: 1.27.1
Show newest version
/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.apache.commons.compress.harmony.pack200;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.compress.utils.ExactMath;

/**
 * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" encoding mechanism. It uses a
 * variable-length encoding and a modified sign representation such that small numbers are represented as a single byte, whilst larger numbers take more bytes
 * to encode. The number may be signed or unsigned; if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's
 * complement. The Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. So a delta encoding of
 * the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute value of a coded integer to fall outside of the 'small number'
 * range, whilst still being encoded as a single byte.
 * 
 * A BHSD codec is configured with four parameters:
 * 
 * 
 * B
 * The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as
 * itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).
 * H
 * The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by {@code H^ⁿ}. So the number 1234
 * may be represented as the sequence 4 3 2 1 with a radix (H) of 10. Note that other permutations are also possible; 43 2 1 will also encode 1234. The
 * co-parameter L is defined as 256-H. This is important because only the last value in a sequence may be < L; all prior values must be > L.
 * S
 * Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's
 * complement)
 * D
 * Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding of 1 indicates that values are
 * cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one
 * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. If the codec is a non-delta encoding, then the
 * value is ignored if passed. If the codec is a delta encoding, it is a run-time error to call the value without the extra parameter, and the previous value
 * should be returned. (It was designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for each use.)
 * 
 * 
 * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted (1,256,0,0) or (1,256). The {@link #toString()}
 * method prints out the condensed form of the encoding. Often, the last character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B
 * value. Those that start with U ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the word Delta
 * ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used.
 * 
 */
public final class BHSDCodec extends Codec {

    /**
     * The maximum number of bytes in each coding word. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as itself, aka
     * {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).
     */
    private final int b;

    /**
     * Whether delta encoding is used (0=false,1=true).
     */
    private final int d;

    /**
     * The radix of the encoding.
     */
    private final int h;

    /**
     * The co-parameter of h; 256-h.
     */
    private final int l;

    /**
     * Represents signed numbers or not, 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's complement).
     */
    private final int s;

    private long cardinality;

    private final long smallest;

    private final long largest;

    /**
     * radix^i powers
     */
    private final long[] powers;

    /**
     * Constructs an unsigned, non-delta Codec with the given B and H values.
     *
     * @param b the maximum number of bytes that a value can be encoded as [1..5]
     * @param h the radix of the encoding [1..256]
     */
    public BHSDCodec(final int b, final int h) {
        this(b, h, 0, 0);
    }

    /**
     * Constructs a non-delta Codec with the given B, H and S values.
     *
     * @param b the maximum number of bytes that a value can be encoded as [1..5]
     * @param h the radix of the encoding [1..256]
     * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
     */
    public BHSDCodec(final int b, final int h, final int s) {
        this(b, h, s, 0);
    }

    /**
     * Constructs a Codec with the given B, H, S and D values.
     *
     * @param b the maximum number of bytes that a value can be encoded as [1..5]
     * @param h the radix of the encoding [1..256]
     * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
     * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta)
     */
    public BHSDCodec(final int b, final int h, final int s, final int d) {
        if (b < 1 || b > 5) {
            throw new IllegalArgumentException("1<=b<=5");
        }
        if (h < 1 || h > 256) {
            throw new IllegalArgumentException("1<=h<=256");
        }
        if (s < 0 || s > 2) {
            throw new IllegalArgumentException("0<=s<=2");
        }
        if (d < 0 || d > 1) {
            throw new IllegalArgumentException("0<=d<=1");
        }
        if (b == 1 && h != 256) {
            throw new IllegalArgumentException("b=1 -> h=256");
        }
        if (h == 256 && b == 5) {
            throw new IllegalArgumentException("h=256 -> b!=5");
        }
        this.b = b;
        this.h = h;
        this.s = s;
        this.d = d;
        this.l = 256 - h;
        if (h == 1) {
            cardinality = b * 255 + 1;
        } else {
            cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b));
        }
        smallest = calculateSmallest();
        largest = calculateLargest();

        powers = new long[b];
        Arrays.setAll(powers, c -> (long) Math.pow(h, c));
    }

    private long calculateLargest() {
        long result;
        // TODO This can probably be optimized into a better mathematical statement
        if (d == 1) {
            return new BHSDCodec(b, h).largest();
        }
        switch (s) {
        case 0:
            result = cardinality() - 1;
            break;
        case 1:
            result = cardinality() / 2 - 1;
            break;
        case 2:
            result = 3L * cardinality() / 4 - 1;
            break;
        default:
            throw new Error("Unknown s value");
        }
        return Math.min((s == 0 ? (long) Integer.MAX_VALUE << 1 : Integer.MAX_VALUE) - 1, result);
    }

    private long calculateSmallest() {
        long result;
        if (d == 1 || !isSigned()) {
            if (cardinality >= 4294967296L) { // 2^32
                result = Integer.MIN_VALUE;
            } else {
                result = 0;
            }
        } else {
            result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s));
        }
        return result;
    }

    /**
     * Returns the cardinality of this codec; that is, the number of distinct values that it can contain.
     *
     * @return the cardinality of this codec
     */
    public long cardinality() {
        return cardinality;
    }

    @Override
    public int decode(final InputStream in) throws IOException, Pack200Exception {
        if (d != 0) {
            throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error");
        }
        return decode(in, 0);
    }

    @Override
    public int decode(final InputStream in, final long last) throws IOException, Pack200Exception {
        int n = 0;
        long z = 0;
        long x = 0;

        do {
            x = in.read();
            lastBandLength++;
            z += x * powers[n];
            n++;
        } while (x >= l && n < b);

        if (x == -1) {
            throw new EOFException("End of stream reached whilst decoding");
        }

        if (isSigned()) {
            final int u = (1 << s) - 1;
            if ((z & u) == u) {
                z = z >>> s ^ -1L;
            } else {
                z -= z >>> s;
            }
        }
        // This algorithm does the same thing, but is probably slower. Leaving
        // in for now for readability
        // if (isSigned()) {
        // long u = z;
        // long twoPowS = (long) Math.pow(2, s);
        // double twoPowSMinusOne = twoPowS-1;
        // if (u % twoPowS < twoPowSMinusOne) {
        // if (cardinality < Math.pow(2, 32)) {
        // z = (long) (u - (Math.floor(u/ twoPowS)));
        // } else {
        // z = cast32((long) (u - (Math.floor(u/ twoPowS))));
        // }
        // } else {
        // z = (long) (-Math.floor(u/ twoPowS) - 1);
        // }
        // }
        if (isDelta()) {
            z += last;
        }
        return (int) z;
    }

    // private long cast32(long u) {
    // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) -
    // Math.pow(2, 31));
    // return u;
    // }

    @Override
    public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception {
        final int[] band = super.decodeInts(n, in);
        if (isDelta()) {
            for (int i = 0; i < band.length; i++) {
                while (band[i] > largest) {
                    band[i] -= cardinality;
                }
                while (band[i] < smallest) {
                    band[i] = ExactMath.add(band[i], cardinality);
                }
            }
        }
        return band;
    }

    @Override
    public int[] decodeInts(final int n, final InputStream in, final int firstValue) throws IOException, Pack200Exception {
        final int[] band = super.decodeInts(n, in, firstValue);
        if (isDelta()) {
            for (int i = 0; i < band.length; i++) {
                while (band[i] > largest) {
                    band[i] -= cardinality;
                }
                while (band[i] < smallest) {
                    band[i] = ExactMath.add(band[i], cardinality);
                }
            }
        }
        return band;
    }

    @Override
    public byte[] encode(final int value) throws Pack200Exception {
        return encode(value, 0);
    }

    @Override
    public byte[] encode(final int value, final int last) throws Pack200Exception {
        if (!encodes(value)) {
            throw new Pack200Exception("The codec " + this + " does not encode the value " + value);
        }

        long z = value;
        if (isDelta()) {
            z -= last;
        }
        if (isSigned()) {
            if (z < Integer.MIN_VALUE) {
                z += 4294967296L;
            } else if (z > Integer.MAX_VALUE) {
                z -= 4294967296L;
            }
            if (z < 0) {
                z = (-z << s) - 1;
            } else if (s == 1) {
                z = z << s;
            } else {
                z += (z - z % 3) / 3;
            }
        } else if (z < 0) {
            // Need to use integer overflow here to represent negatives.
            // 4294967296L is the 1 << 32.
            z += Math.min(cardinality, 4294967296L);
        }
        if (z < 0) {
            throw new Pack200Exception("unable to encode");
        }

        final List byteList = new ArrayList<>();
        for (int n = 0; n < b; n++) {
            long byteN;
            if (z < l) {
                byteN = z;
            } else {
                byteN = z % h;
                while (byteN < l) {
                    byteN += h;
                }
            }
            byteList.add(Byte.valueOf((byte) byteN));
            if (byteN < l) {
                break;
            }
            z -= byteN;
            z /= h;
        }
        final byte[] bytes = new byte[byteList.size()];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = byteList.get(i).byteValue();
        }
        return bytes;
    }

    /**
     * True if this encoding can code the given value
     *
     * @param value the value to check
     * @return {@code true} if the encoding can encode this value
     */
    public boolean encodes(final long value) {
        return value >= smallest && value <= largest;
    }

    @Override
    public boolean equals(final Object o) {
        if (o instanceof BHSDCodec) {
            final BHSDCodec codec = (BHSDCodec) o;
            return codec.b == b && codec.h == h && codec.s == s && codec.d == d;
        }
        return false;
    }

    /**
     * Gets the B.
     *
     * @return the b
     */
    public int getB() {
        return b;
    }

    /**
     * Gets the H.
     *
     * @return the h
     */
    public int getH() {
        return h;
    }

    /**
     * Gets the L.
     *
     * @return the l
     */
    public int getL() {
        return l;
    }

    /**
     * Gets the S.
     *
     * @return the s
     */
    public int getS() {
        return s;
    }

    @Override
    public int hashCode() {
        return ((b * 37 + h) * 37 + s) * 37 + d;
    }

    /**
     * Returns true if this codec is a delta codec
     *
     * @return true if this codec is a delta codec
     */
    public boolean isDelta() {
        return d != 0;
    }

    /**
     * Returns true if this codec is a signed codec
     *
     * @return true if this codec is a signed codec
     */
    public boolean isSigned() {
        return s != 0;
    }

    /**
     * Returns the largest value that this codec can represent.
     *
     * @return the largest value that this codec can represent.
     */
    public long largest() {
        return largest;
    }

    /**
     * Returns the smallest value that this codec can represent.
     *
     * @return the smallest value that this codec can represent.
     */
    public long smallest() {
        return smallest;
    }

    /**
     * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown.
     */
    @Override
    public String toString() {
        final StringBuilder buffer = new StringBuilder(11);
        buffer.append('(');
        buffer.append(b);
        buffer.append(',');
        buffer.append(h);
        if (s != 0 || d != 0) {
            buffer.append(',');
            buffer.append(s);
        }
        if (d != 0) {
            buffer.append(',');
            buffer.append(d);
        }
        buffer.append(')');
        return buffer.toString();
    }
}