All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.util.packed.PackedInts Maven / Gradle / Ivy

The newest version!
/*
 * COPIED FROM APACHE LUCENE 4.7.2
 *
 * Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
 *
 * (see https://issues.apache.org/jira/browse/OAK-10786 for details)
 */

package org.apache.lucene.util.packed;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * Simplistic compression for array of unsigned long values.
 * Each value is >= 0 and <= a specified maximum value.  The
 * values are stored as packed ints, with each value
 * consuming a fixed number of bits.
 *
 * @lucene.internal
 */
public class PackedInts {

  /**
   * At most 700% memory overhead, always select a direct implementation.
   */
  public static final float FASTEST = 7f;

  /**
   * At most 50% memory overhead, always select a reasonably fast implementation.
   */
  public static final float FAST = 0.5f;

  /**
   * At most 20% memory overhead.
   */
  public static final float DEFAULT = 0.2f;

  /**
   * No memory overhead at all, but the returned implementation may be slow.
   */
  public static final float COMPACT = 0f;

  /**
   * Default amount of memory to use for bulk operations.
   */
  public static final int DEFAULT_BUFFER_SIZE = 1024; // 1K

  /** Codec name passed to {@link CodecUtil} when the stream header is written and checked. */
  public final static String CODEC_NAME = "PackedInts";
  /** Oldest version accepted by {@link #checkVersion(int)}. */
  public final static int VERSION_START = 0; // PackedInts were long-aligned
  /**
   * First version for which {@link Format#PACKED} computes its byte count
   * rounded up to whole bytes instead of whole longs.
   */
  public final static int VERSION_BYTE_ALIGNED = 1;
  /** Newest version accepted by {@link #checkVersion(int)}; also the version written in headers. */
  public final static int VERSION_CURRENT = VERSION_BYTE_ALIGNED;

  /**
   * Validates a packed-ints version number.
   *
   * @param version the version number to validate
   * @throws IllegalArgumentException if {@code version} is lower than
   *         {@link #VERSION_START} or greater than {@link #VERSION_CURRENT}
   */
  public static void checkVersion(int version) {
    final boolean tooOld = version < VERSION_START;
    if (tooOld) {
      throw new IllegalArgumentException("Version is too old, should be at least " + VERSION_START + " (got " + version + ")");
    }

    final boolean tooNew = version > VERSION_CURRENT;
    if (tooNew) {
      throw new IllegalArgumentException("Version is too new, should be at most " + VERSION_CURRENT + " (got " + version + ")");
    }
  }

  /**
   * A format to write packed ints.
   *
   * <p>The base implementations of {@link #byteCount(int, int, int)} and
   * {@link #longCount(int, int, int)} are defined in terms of each other, so
   * every constant overrides one of them; a constant overriding neither would
   * recurse without end.
   *
   * @lucene.internal
   */
  public enum Format {
    /**
     * Compact format, all bits are written contiguously.
     */
    PACKED(0) {

      /**
       * Versions older than {@link PackedInts#VERSION_BYTE_ALIGNED} round the
       * stream up to a whole number of longs, newer versions round it up to
       * a whole number of bytes.
       */
      @Override
      public long byteCount(int packedIntsVersion, int valueCount, int bitsPerValue) {
        if (packedIntsVersion < VERSION_BYTE_ALIGNED) {
          return 8L * (long) Math.ceil((double) valueCount * bitsPerValue / 64);
        } else {
          return (long) Math.ceil((double) valueCount * bitsPerValue / 8);
        }
      }

    },

    /**
     * A format that may insert padding bits to improve encoding and decoding
     * speed. Since this format doesn't support all possible bits per value, you
     * should never use it directly, but rather use
     * {@link PackedInts#fastestFormatAndBits(int, int, float)} to find the
     * format that best suits your needs.
     */
    PACKED_SINGLE_BLOCK(1) {

      /**
       * Each long holds {@code 64 / bitsPerValue} values; values never
       * straddle two longs.
       */
      @Override
      public int longCount(int packedIntsVersion, int valueCount, int bitsPerValue) {
        final int valuesPerBlock = 64 / bitsPerValue;
        return (int) Math.ceil((double) valueCount / valuesPerBlock);
      }

      @Override
      public boolean isSupported(int bitsPerValue) {
        return Packed64SingleBlock.isSupported(bitsPerValue);
      }

      /**
       * The {@code 64 % bitsPerValue} unused bits of every long are spread
       * over the values stored in that long.
       */
      @Override
      public float overheadPerValue(int bitsPerValue) {
        assert isSupported(bitsPerValue);
        final int valuesPerBlock = 64 / bitsPerValue;
        final int overhead = 64 % bitsPerValue;
        return (float) overhead / valuesPerBlock;
      }

    };

    /**
     * Get a format according to its ID.
     *
     * @param id the identifier, as returned by {@link #getId()}
     * @return the format whose ID equals {@code id}
     * @throws IllegalArgumentException if no format has this ID
     */
    public static Format byId(int id) {
      for (Format format : Format.values()) {
        if (format.getId() == id) {
          return format;
        }
      }
      throw new IllegalArgumentException("Unknown format id: " + id);
    }

    /**
     * The ID of the format. Final because enum constants are shared
     * singletons: a writable ID could be changed by any caller and would
     * then no longer match what {@link #byId(int)} expects.
     */
    public final int id;

    private Format(int id) {
      this.id = id;
    }

    /**
     * Returns the ID of the format.
     */
    public int getId() {
      return id;
    }

    /**
     * Computes how many byte blocks are needed to store {@code valueCount}
     * values of size {@code bitsPerValue}.
     *
     * @param packedIntsVersion the version of the packed ints stream
     * @param valueCount        the number of values
     * @param bitsPerValue      the number of bits per value
     * @return the number of bytes
     */
    public long byteCount(int packedIntsVersion, int valueCount, int bitsPerValue) {
      assert bitsPerValue >= 0 && bitsPerValue <= 64 : bitsPerValue;
      // assume long-aligned
      return 8L * longCount(packedIntsVersion, valueCount, bitsPerValue);
    }

    /**
     * Computes how many long blocks are needed to store {@code valueCount}
     * values of size {@code bitsPerValue}.
     *
     * @param packedIntsVersion the version of the packed ints stream
     * @param valueCount        the number of values
     * @param bitsPerValue      the number of bits per value
     * @return the number of longs, i.e. the byte count divided by 8 and rounded up
     */
    public int longCount(int packedIntsVersion, int valueCount, int bitsPerValue) {
      assert bitsPerValue >= 0 && bitsPerValue <= 64 : bitsPerValue;
      final long byteCount = byteCount(packedIntsVersion, valueCount, bitsPerValue);
      assert byteCount < 8L * Integer.MAX_VALUE;
      if ((byteCount % 8) == 0) {
        return (int) (byteCount / 8);
      } else {
        return (int) (byteCount / 8 + 1);
      }
    }

    /**
     * Tests whether the provided number of bits per value is supported by the
     * format.
     */
    public boolean isSupported(int bitsPerValue) {
      return bitsPerValue >= 1 && bitsPerValue <= 64;
    }

    /**
     * Returns the overhead per value, in bits.
     */
    public float overheadPerValue(int bitsPerValue) {
      assert isSupported(bitsPerValue);
      return 0f;
    }

    /**
     * Returns the overhead ratio ({@code overhead per value / bits per value}).
     */
    public final float overheadRatio(int bitsPerValue) {
      assert isSupported(bitsPerValue);
      return overheadPerValue(bitsPerValue) / bitsPerValue;
    }
  }

  /**
   * Immutable pairing of a {@link Format} with a number of bits per value.
   */
  public static class FormatAndBits {
    public final Format format;
    public final int bitsPerValue;

    public FormatAndBits(Format format, int bitsPerValue) {
      this.bitsPerValue = bitsPerValue;
      this.format = format;
    }

    /** Renders both fields for debugging. */
    @Override
    public String toString() {
      final StringBuilder text = new StringBuilder("FormatAndBits(format=");
      text.append(format);
      text.append(" bitsPerValue=").append(bitsPerValue);
      text.append(")");
      return text.toString();
    }
  }

  /**
   * Try to find the {@link Format} and number of bits per value that would
   * restore from disk the fastest reader whose overhead is less than
   * acceptableOverheadRatio.
   * 

* The acceptableOverheadRatio parameter makes sense for * random-access {@link Reader}s. In case you only plan to perform * sequential access on this stream later on, you should probably use * {@link PackedInts#COMPACT}. *

* If you don't know how many values you are going to write, use * valueCount = -1. */ public static FormatAndBits fastestFormatAndBits(int valueCount, int bitsPerValue, float acceptableOverheadRatio) { if (valueCount == -1) { valueCount = Integer.MAX_VALUE; } acceptableOverheadRatio = Math.max(COMPACT, acceptableOverheadRatio); acceptableOverheadRatio = Math.min(FASTEST, acceptableOverheadRatio); float acceptableOverheadPerValue = acceptableOverheadRatio * bitsPerValue; // in bits int maxBitsPerValue = bitsPerValue + (int) acceptableOverheadPerValue; int actualBitsPerValue = -1; Format format = Format.PACKED; if (bitsPerValue <= 8 && maxBitsPerValue >= 8) { actualBitsPerValue = 8; } else if (bitsPerValue <= 16 && maxBitsPerValue >= 16) { actualBitsPerValue = 16; } else if (bitsPerValue <= 32 && maxBitsPerValue >= 32) { actualBitsPerValue = 32; } else if (bitsPerValue <= 64 && maxBitsPerValue >= 64) { actualBitsPerValue = 64; } else if (valueCount <= Packed8ThreeBlocks.MAX_SIZE && bitsPerValue <= 24 && maxBitsPerValue >= 24) { actualBitsPerValue = 24; } else if (valueCount <= Packed16ThreeBlocks.MAX_SIZE && bitsPerValue <= 48 && maxBitsPerValue >= 48) { actualBitsPerValue = 48; } else { for (int bpv = bitsPerValue; bpv <= maxBitsPerValue; ++bpv) { if (Format.PACKED_SINGLE_BLOCK.isSupported(bpv)) { float overhead = Format.PACKED_SINGLE_BLOCK.overheadPerValue(bpv); float acceptableOverhead = acceptableOverheadPerValue + bitsPerValue - bpv; if (overhead <= acceptableOverhead) { actualBitsPerValue = bpv; format = Format.PACKED_SINGLE_BLOCK; break; } } } if (actualBitsPerValue < 0) { actualBitsPerValue = bitsPerValue; } } return new FormatAndBits(format, actualBitsPerValue); } /** * A decoder for packed integers. */ public static interface Decoder { /** * The minimum number of long blocks to encode in a single iteration, when * using long encoding. */ int longBlockCount(); /** * The number of values that can be stored in {@link #longBlockCount()} long * blocks. 
*/ int longValueCount(); /** * The minimum number of byte blocks to encode in a single iteration, when * using byte encoding. */ int byteBlockCount(); /** * The number of values that can be stored in {@link #byteBlockCount()} byte * blocks. */ int byteValueCount(); /** * Read iterations * blockCount() blocks from blocks, * decode them and write iterations * valueCount() values into * values. * * @param blocks the long blocks that hold packed integer values * @param blocksOffset the offset where to start reading blocks * @param values the values buffer * @param valuesOffset the offset where to start writing values * @param iterations controls how much data to decode */ void decode(long[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations); /** * Read 8 * iterations * blockCount() blocks from blocks, * decode them and write iterations * valueCount() values into * values. * * @param blocks the long blocks that hold packed integer values * @param blocksOffset the offset where to start reading blocks * @param values the values buffer * @param valuesOffset the offset where to start writing values * @param iterations controls how much data to decode */ void decode(byte[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations); /** * Read iterations * blockCount() blocks from blocks, * decode them and write iterations * valueCount() values into * values. * * @param blocks the long blocks that hold packed integer values * @param blocksOffset the offset where to start reading blocks * @param values the values buffer * @param valuesOffset the offset where to start writing values * @param iterations controls how much data to decode */ void decode(long[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations); /** * Read 8 * iterations * blockCount() blocks from blocks, * decode them and write iterations * valueCount() values into * values. 
* * @param blocks the long blocks that hold packed integer values * @param blocksOffset the offset where to start reading blocks * @param values the values buffer * @param valuesOffset the offset where to start writing values * @param iterations controls how much data to decode */ void decode(byte[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations); } /** * An encoder for packed integers. */ public static interface Encoder { /** * The minimum number of long blocks to encode in a single iteration, when * using long encoding. */ int longBlockCount(); /** * The number of values that can be stored in {@link #longBlockCount()} long * blocks. */ int longValueCount(); /** * The minimum number of byte blocks to encode in a single iteration, when * using byte encoding. */ int byteBlockCount(); /** * The number of values that can be stored in {@link #byteBlockCount()} byte * blocks. */ int byteValueCount(); /** * Read iterations * valueCount() values from values, * encode them and write iterations * blockCount() blocks into * blocks. * * @param blocks the long blocks that hold packed integer values * @param blocksOffset the offset where to start writing blocks * @param values the values buffer * @param valuesOffset the offset where to start reading values * @param iterations controls how much data to encode */ void encode(long[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations); /** * Read iterations * valueCount() values from values, * encode them and write 8 * iterations * blockCount() blocks into * blocks. 
* * @param blocks the long blocks that hold packed integer values * @param blocksOffset the offset where to start writing blocks * @param values the values buffer * @param valuesOffset the offset where to start reading values * @param iterations controls how much data to encode */ void encode(long[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations); /** * Read iterations * valueCount() values from values, * encode them and write iterations * blockCount() blocks into * blocks. * * @param blocks the long blocks that hold packed integer values * @param blocksOffset the offset where to start writing blocks * @param values the values buffer * @param valuesOffset the offset where to start reading values * @param iterations controls how much data to encode */ void encode(int[] values, int valuesOffset, long[] blocks, int blocksOffset, int iterations); /** * Read iterations * valueCount() values from values, * encode them and write 8 * iterations * blockCount() blocks into * blocks. * * @param blocks the long blocks that hold packed integer values * @param blocksOffset the offset where to start writing blocks * @param values the values buffer * @param valuesOffset the offset where to start reading values * @param iterations controls how much data to encode */ void encode(int[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations); } /** * A read-only random access array of positive integers. * @lucene.internal */ public static abstract class Reader extends NumericDocValues { /** * Bulk get: read at least one and at most len longs starting * from index into arr[off:off+len] and return * the actual number of values that have been read. 
*/ public int get(int index, long[] arr, int off, int len) { assert len > 0 : "len must be > 0 (got " + len + ")"; assert index >= 0 && index < size(); assert off + len <= arr.length; final int gets = Math.min(size() - index, len); for (int i = index, o = off, end = index + gets; i < end; ++i, ++o) { arr[o] = get(i); } return gets; } /** * @return the number of bits used to store any given value. * Note: This does not imply that memory usage is * {@code bitsPerValue * #values} as implementations are free to * use non-space-optimal packing of bits. */ public abstract int getBitsPerValue(); /** * @return the number of values. */ public abstract int size(); /** * Return the in-memory size in bytes. */ public abstract long ramBytesUsed(); /** * Expert: if the bit-width of this reader matches one of * java's native types, returns the underlying array * (ie, byte[], short[], int[], long[]); else, returns * null. Note that when accessing the array you must * upgrade the type (bitwise AND with all ones), to * interpret the full value as unsigned. Ie, * bytes[idx]&0xFF, shorts[idx]&0xFFFF, etc. */ public Object getArray() { assert !hasArray(); return null; } /** * Returns true if this implementation is backed by a * native java array. * * @see #getArray */ public boolean hasArray() { return false; } } /** * Run-once iterator interface, to decode previously saved PackedInts. 
*/ public static interface ReaderIterator { /** Returns next value */ long next() throws IOException; /** Returns at least 1 and at most count next values, * the returned ref MUST NOT be modified */ LongsRef next(int count) throws IOException; /** Returns number of bits per value */ int getBitsPerValue(); /** Returns number of values */ int size(); /** Returns the current position */ int ord(); } static abstract class ReaderIteratorImpl implements ReaderIterator { protected final DataInput in; protected final int bitsPerValue; protected final int valueCount; protected ReaderIteratorImpl(int valueCount, int bitsPerValue, DataInput in) { this.in = in; this.bitsPerValue = bitsPerValue; this.valueCount = valueCount; } @Override public long next() throws IOException { LongsRef nextValues = next(1); assert nextValues.length > 0; final long result = nextValues.longs[nextValues.offset]; ++nextValues.offset; --nextValues.length; return result; } @Override public int getBitsPerValue() { return bitsPerValue; } @Override public int size() { return valueCount; } } /** * A packed integer array that can be modified. * @lucene.internal */ public static abstract class Mutable extends Reader { /** * Set the value at the given index in the array. * @param index where the value should be positioned. * @param value a value conforming to the constraints set by the array. */ public abstract void set(int index, long value); /** * Bulk set: set at least one and at most len longs starting * at off in arr into this mutable, starting at * index. Returns the actual number of values that have been * set. 
*/ public int set(int index, long[] arr, int off, int len) { assert len > 0 : "len must be > 0 (got " + len + ")"; assert index >= 0 && index < size(); len = Math.min(len, size() - index); assert off + len <= arr.length; for (int i = index, o = off, end = index + len; i < end; ++i, ++o) { set(i, arr[o]); } return len; } /** * Fill the mutable from fromIndex (inclusive) to * toIndex (exclusive) with val. */ public void fill(int fromIndex, int toIndex, long val) { assert val <= maxValue(getBitsPerValue()); assert fromIndex <= toIndex; for (int i = fromIndex; i < toIndex; ++i) { set(i, val); } } /** * Sets all values to 0. */ public void clear() { fill(0, size(), 0); } /** * Save this mutable into out. Instantiating a reader from * the generated data will return a reader with the same number of bits * per value. */ public void save(DataOutput out) throws IOException { Writer writer = getWriterNoHeader(out, getFormat(), size(), getBitsPerValue(), DEFAULT_BUFFER_SIZE); writer.writeHeader(); for (int i = 0; i < size(); ++i) { writer.add(get(i)); } writer.finish(); } /** The underlying format. */ Format getFormat() { return Format.PACKED; } } /** * A simple base for Readers that keeps track of valueCount and bitsPerValue. 
* @lucene.internal */ static abstract class ReaderImpl extends Reader { protected final int bitsPerValue; protected final int valueCount; protected ReaderImpl(int valueCount, int bitsPerValue) { this.bitsPerValue = bitsPerValue; assert bitsPerValue > 0 && bitsPerValue <= 64 : "bitsPerValue=" + bitsPerValue; this.valueCount = valueCount; } @Override public abstract long get(int index); @Override public final int getBitsPerValue() { return bitsPerValue; } @Override public final int size() { return valueCount; } } static abstract class MutableImpl extends Mutable { protected final int valueCount; protected final int bitsPerValue; protected MutableImpl(int valueCount, int bitsPerValue) { this.valueCount = valueCount; assert bitsPerValue > 0 && bitsPerValue <= 64 : "bitsPerValue=" + bitsPerValue; this.bitsPerValue = bitsPerValue; } @Override public final int getBitsPerValue() { return bitsPerValue; } @Override public final int size() { return valueCount; } } /** A {@link Reader} which has all its values equal to 0 (bitsPerValue = 0). */ public static final class NullReader extends Reader { private final int valueCount; /** Sole constructor. */ public NullReader(int valueCount) { this.valueCount = valueCount; } @Override public long get(int index) { return 0; } @Override public int get(int index, long[] arr, int off, int len) { assert len > 0 : "len must be > 0 (got " + len + ")"; assert index >= 0 && index < valueCount; len = Math.min(len, valueCount - index); Arrays.fill(arr, off, off + len, 0); return len; } @Override public int getBitsPerValue() { return 0; } @Override public int size() { return valueCount; } @Override public long ramBytesUsed() { return RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT); } } /** A write-once Writer. 
* @lucene.internal */ public static abstract class Writer { protected final DataOutput out; protected final int valueCount; protected final int bitsPerValue; protected Writer(DataOutput out, int valueCount, int bitsPerValue) { assert bitsPerValue <= 64; assert valueCount >= 0 || valueCount == -1; this.out = out; this.valueCount = valueCount; this.bitsPerValue = bitsPerValue; } void writeHeader() throws IOException { assert valueCount != -1; CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); out.writeVInt(bitsPerValue); out.writeVInt(valueCount); out.writeVInt(getFormat().getId()); } /** The format used to serialize values. */ protected abstract PackedInts.Format getFormat(); /** Add a value to the stream. */ public abstract void add(long v) throws IOException; /** The number of bits per value. */ public final int bitsPerValue() { return bitsPerValue; } /** Perform end-of-stream operations. */ public abstract void finish() throws IOException; /** * Returns the current ord in the stream (number of values that have been * written so far minus one). */ public abstract int ord(); } /** * Get a {@link Decoder}. * * @param format the format used to store packed ints * @param version the compatibility version * @param bitsPerValue the number of bits per value * @return a decoder */ public static Decoder getDecoder(Format format, int version, int bitsPerValue) { checkVersion(version); return BulkOperation.of(format, bitsPerValue); } /** * Get an {@link Encoder}. * * @param format the format used to store packed ints * @param version the compatibility version * @param bitsPerValue the number of bits per value * @return an encoder */ public static Encoder getEncoder(Format format, int version, int bitsPerValue) { checkVersion(version); return BulkOperation.of(format, bitsPerValue); } /** * Expert: Restore a {@link Reader} from a stream without reading metadata at * the beginning of the stream. 
This method is useful to restore data from * streams which have been created using * {@link PackedInts#getWriterNoHeader(DataOutput, Format, int, int, int)}. * * @param in the stream to read data from, positioned at the beginning of the packed values * @param format the format used to serialize * @param version the version used to serialize the data * @param valueCount how many values the stream holds * @param bitsPerValue the number of bits per value * @return a Reader * @throws IOException If there is a low-level I/O error * @see PackedInts#getWriterNoHeader(DataOutput, Format, int, int, int) * @lucene.internal */ public static Reader getReaderNoHeader(DataInput in, Format format, int version, int valueCount, int bitsPerValue) throws IOException { checkVersion(version); switch (format) { case PACKED_SINGLE_BLOCK: return Packed64SingleBlock.create(in, valueCount, bitsPerValue); case PACKED: switch (bitsPerValue) { case 8: return new Direct8(version, in, valueCount); case 16: return new Direct16(version, in, valueCount); case 32: return new Direct32(version, in, valueCount); case 64: return new Direct64(version, in, valueCount); case 24: if (valueCount <= Packed8ThreeBlocks.MAX_SIZE) { return new Packed8ThreeBlocks(version, in, valueCount); } break; case 48: if (valueCount <= Packed16ThreeBlocks.MAX_SIZE) { return new Packed16ThreeBlocks(version, in, valueCount); } break; } return new Packed64(version, in, valueCount, bitsPerValue); default: throw new AssertionError("Unknown Writer format: " + format); } } /** * Expert: Restore a {@link Reader} from a stream without reading metadata at * the beginning of the stream. This method is useful to restore data when * metadata has been previously read using {@link #readHeader(DataInput)}. 
* * @param in the stream to read data from, positioned at the beginning of the packed values * @param header metadata result from readHeader() * @return a Reader * @throws IOException If there is a low-level I/O error * @see #readHeader(DataInput) * @lucene.internal */ public static Reader getReaderNoHeader(DataInput in, Header header) throws IOException { return getReaderNoHeader(in, header.format, header.version, header.valueCount, header.bitsPerValue); } /** * Restore a {@link Reader} from a stream. * * @param in the stream to read data from * @return a Reader * @throws IOException If there is a low-level I/O error * @lucene.internal */ public static Reader getReader(DataInput in) throws IOException { final int version = CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START, VERSION_CURRENT); final int bitsPerValue = in.readVInt(); assert bitsPerValue > 0 && bitsPerValue <= 64: "bitsPerValue=" + bitsPerValue; final int valueCount = in.readVInt(); final Format format = Format.byId(in.readVInt()); return getReaderNoHeader(in, format, version, valueCount, bitsPerValue); } /** * Expert: Restore a {@link ReaderIterator} from a stream without reading * metadata at the beginning of the stream. This method is useful to restore * data from streams which have been created using * {@link PackedInts#getWriterNoHeader(DataOutput, Format, int, int, int)}. 
* * @param in the stream to read data from, positioned at the beginning of the packed values * @param format the format used to serialize * @param version the version used to serialize the data * @param valueCount how many values the stream holds * @param bitsPerValue the number of bits per value * @param mem how much memory the iterator is allowed to use to read-ahead (likely to speed up iteration) * @return a ReaderIterator * @see PackedInts#getWriterNoHeader(DataOutput, Format, int, int, int) * @lucene.internal */ public static ReaderIterator getReaderIteratorNoHeader(DataInput in, Format format, int version, int valueCount, int bitsPerValue, int mem) { checkVersion(version); return new PackedReaderIterator(format, version, valueCount, bitsPerValue, in, mem); } /** * Retrieve PackedInts as a {@link ReaderIterator} * @param in positioned at the beginning of a stored packed int structure. * @param mem how much memory the iterator is allowed to use to read-ahead (likely to speed up iteration) * @return an iterator to access the values * @throws IOException if the structure could not be retrieved. * @lucene.internal */ public static ReaderIterator getReaderIterator(DataInput in, int mem) throws IOException { final int version = CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START, VERSION_CURRENT); final int bitsPerValue = in.readVInt(); assert bitsPerValue > 0 && bitsPerValue <= 64: "bitsPerValue=" + bitsPerValue; final int valueCount = in.readVInt(); final Format format = Format.byId(in.readVInt()); return getReaderIteratorNoHeader(in, format, version, valueCount, bitsPerValue, mem); } /** * Expert: Construct a direct {@link Reader} from a stream without reading * metadata at the beginning of the stream. This method is useful to restore * data from streams which have been created using * {@link PackedInts#getWriterNoHeader(DataOutput, Format, int, int, int)}. *

* The returned reader will have very little memory overhead, but every call * to {@link Reader#get(int)} is likely to perform a disk seek. * * @param in the stream to read data from * @param format the format used to serialize * @param version the version used to serialize the data * @param valueCount how many values the stream holds * @param bitsPerValue the number of bits per value * @return a direct Reader * @lucene.internal */ public static Reader getDirectReaderNoHeader(final IndexInput in, Format format, int version, int valueCount, int bitsPerValue) { checkVersion(version); switch (format) { case PACKED: final long byteCount = format.byteCount(version, valueCount, bitsPerValue); if (byteCount != format.byteCount(VERSION_CURRENT, valueCount, bitsPerValue)) { assert version == VERSION_START; final long endPointer = in.getFilePointer() + byteCount; // Some consumers of direct readers assume that reading the last value // will make the underlying IndexInput go to the end of the packed // stream, but this is not true because packed ints storage used to be // long-aligned and is now byte-aligned, hence this additional // condition when reading the last value return new DirectPackedReader(bitsPerValue, valueCount, in) { @Override public long get(int index) { final long result = super.get(index); if (index == valueCount - 1) { try { in.seek(endPointer); } catch (IOException e) { throw new IllegalStateException("failed", e); } } return result; } }; } else { return new DirectPackedReader(bitsPerValue, valueCount, in); } case PACKED_SINGLE_BLOCK: return new DirectPacked64SingleBlockReader(bitsPerValue, valueCount, in); default: throw new AssertionError("Unknwown format: " + format); } } /** * Expert: Construct a direct {@link Reader} from an {@link IndexInput} * without reading metadata at the beginning of the stream. This method is * useful to restore data when metadata has been previously read using * {@link #readHeader(DataInput)}. 
* * @param in the stream to read data from, positioned at the beginning of the packed values * @param header metadata result from readHeader() * @return a Reader * @throws IOException If there is a low-level I/O error * @see #readHeader(DataInput) * @lucene.internal */ public static Reader getDirectReaderNoHeader(IndexInput in, Header header) throws IOException { return getDirectReaderNoHeader(in, header.format, header.version, header.valueCount, header.bitsPerValue); } /** * Construct a direct {@link Reader} from an {@link IndexInput}. This method * is useful to restore data from streams which have been created using * {@link PackedInts#getWriter(DataOutput, int, int, float)}. *

* The returned reader will have very little memory overhead, but every call * to {@link Reader#get(int)} is likely to perform a disk seek. * * @param in the stream to read data from * @return a direct Reader * @throws IOException If there is a low-level I/O error * @lucene.internal */ public static Reader getDirectReader(IndexInput in) throws IOException { final int version = CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START, VERSION_CURRENT); final int bitsPerValue = in.readVInt(); assert bitsPerValue > 0 && bitsPerValue <= 64: "bitsPerValue=" + bitsPerValue; final int valueCount = in.readVInt(); final Format format = Format.byId(in.readVInt()); return getDirectReaderNoHeader(in, format, version, valueCount, bitsPerValue); } /** * Create a packed integer array with the given amount of values initialized * to 0. the valueCount and the bitsPerValue cannot be changed after creation. * All Mutables known by this factory are kept fully in RAM. *

* Positive values of acceptableOverheadRatio will trade space * for speed by selecting a faster but potentially less memory-efficient * implementation. An acceptableOverheadRatio of * {@link PackedInts#COMPACT} will make sure that the most memory-efficient * implementation is selected whereas {@link PackedInts#FASTEST} will make sure * that the fastest implementation is selected. * * @param valueCount the number of elements * @param bitsPerValue the number of bits available for any given value * @param acceptableOverheadRatio an acceptable overhead * ratio per value * @return a mutable packed integer array * @lucene.internal */ public static Mutable getMutable(int valueCount, int bitsPerValue, float acceptableOverheadRatio) { final FormatAndBits formatAndBits = fastestFormatAndBits(valueCount, bitsPerValue, acceptableOverheadRatio); return getMutable(valueCount, formatAndBits.bitsPerValue, formatAndBits.format); } /** Same as {@link #getMutable(int, int, float)} with a pre-computed number * of bits per value and format. * @lucene.internal */ public static Mutable getMutable(int valueCount, int bitsPerValue, PackedInts.Format format) { assert valueCount >= 0; switch (format) { case PACKED_SINGLE_BLOCK: return Packed64SingleBlock.create(valueCount, bitsPerValue); case PACKED: switch (bitsPerValue) { case 8: return new Direct8(valueCount); case 16: return new Direct16(valueCount); case 32: return new Direct32(valueCount); case 64: return new Direct64(valueCount); case 24: if (valueCount <= Packed8ThreeBlocks.MAX_SIZE) { return new Packed8ThreeBlocks(valueCount); } break; case 48: if (valueCount <= Packed16ThreeBlocks.MAX_SIZE) { return new Packed16ThreeBlocks(valueCount); } break; } return new Packed64(valueCount, bitsPerValue); default: throw new AssertionError(); } } /** * Expert: Create a packed integer array writer for the given output, format, * value count, and number of bits per value. *

* The resulting stream will be long-aligned. This means that depending on * the format which is used, up to 63 bits will be wasted. An easy way to * make sure that no space is lost is to always use a valueCount * that is a multiple of 64. *

* This method does not write any metadata to the stream, meaning that it is * your responsibility to store it somewhere else in order to be able to * recover data from the stream later on: *

    *
  • format (using {@link Format#getId()}),
  • *
  • valueCount,
  • *
  • bitsPerValue,
  • *
  • {@link #VERSION_CURRENT}.
  • *
*

* It is possible to start writing values without knowing how many of them you * are actually going to write. To do this, just pass -1 as * valueCount. On the other hand, for any positive value of * valueCount, the returned writer will make sure that you don't * write more values than expected and pad the end of stream with zeros in * case you have written less than valueCount when calling * {@link Writer#finish()}. *

* The mem parameter lets you control how much memory can be used * to buffer changes in memory before flushing to disk. High values of * mem are likely to improve throughput. On the other hand, if * speed is not that important to you, a value of 0 will use as * little memory as possible and should already offer reasonable throughput. * * @param out the data output * @param format the format to use to serialize the values * @param valueCount the number of values * @param bitsPerValue the number of bits per value * @param mem how much memory (in bytes) can be used to speed up serialization * @return a Writer * @see PackedInts#getReaderIteratorNoHeader(DataInput, Format, int, int, int, int) * @see PackedInts#getReaderNoHeader(DataInput, Format, int, int, int) * @lucene.internal */ public static Writer getWriterNoHeader( DataOutput out, Format format, int valueCount, int bitsPerValue, int mem) { return new PackedWriter(format, out, valueCount, bitsPerValue, mem); } /** * Create a packed integer array writer for the given output, format, value * count, and number of bits per value. *

* The resulting stream will be long-aligned. This means that depending on * the format which is used under the hoods, up to 63 bits will be wasted. * An easy way to make sure that no space is lost is to always use a * valueCount that is a multiple of 64. *

* This method writes metadata to the stream, so that the resulting stream is * sufficient to restore a {@link Reader} from it. You don't need to track * valueCount or bitsPerValue by yourself. In case * this is a problem, you should probably look at * {@link #getWriterNoHeader(DataOutput, Format, int, int, int)}. *

* The acceptableOverheadRatio parameter controls how * readers that will be restored from this stream trade space * for speed by selecting a faster but potentially less memory-efficient * implementation. An acceptableOverheadRatio of * {@link PackedInts#COMPACT} will make sure that the most memory-efficient * implementation is selected whereas {@link PackedInts#FASTEST} will make sure * that the fastest implementation is selected. In case you are only interested * in reading this stream sequentially later on, you should probably use * {@link PackedInts#COMPACT}. * * @param out the data output * @param valueCount the number of values * @param bitsPerValue the number of bits per value * @param acceptableOverheadRatio an acceptable overhead ratio per value * @return a Writer * @throws IOException If there is a low-level I/O error * @lucene.internal */ public static Writer getWriter(DataOutput out, int valueCount, int bitsPerValue, float acceptableOverheadRatio) throws IOException { assert valueCount >= 0; final FormatAndBits formatAndBits = fastestFormatAndBits(valueCount, bitsPerValue, acceptableOverheadRatio); final Writer writer = getWriterNoHeader(out, formatAndBits.format, valueCount, formatAndBits.bitsPerValue, DEFAULT_BUFFER_SIZE); writer.writeHeader(); return writer; } /** Returns how many bits are required to hold values up * to and including maxValue * @param maxValue the maximum value that should be representable. * @return the amount of bits needed to represent values from 0 to maxValue. * @lucene.internal */ public static int bitsRequired(long maxValue) { if (maxValue < 0) { throw new IllegalArgumentException("maxValue must be non-negative (got: " + maxValue + ")"); } return Math.max(1, 64 - Long.numberOfLeadingZeros(maxValue)); } /** * Calculates the maximum unsigned long that can be expressed with the given * number of bits. * @param bitsPerValue the number of bits available for any given value. * @return the maximum value for the given bits. 
* @lucene.internal */ public static long maxValue(int bitsPerValue) { return bitsPerValue == 64 ? Long.MAX_VALUE : ~(~0L << bitsPerValue); } /** * Copy src[srcPos:srcPos+len] into * dest[destPos:destPos+len] using at most mem * bytes. */ public static void copy(Reader src, int srcPos, Mutable dest, int destPos, int len, int mem) { assert srcPos + len <= src.size(); assert destPos + len <= dest.size(); final int capacity = mem >>> 3; if (capacity == 0) { for (int i = 0; i < len; ++i) { dest.set(destPos++, src.get(srcPos++)); } } else if (len > 0) { // use bulk operations final long[] buf = new long[Math.min(capacity, len)]; copy(src, srcPos, dest, destPos, len, buf); } } /** Same as {@link #copy(Reader, int, Mutable, int, int, int)} but using a pre-allocated buffer. */ static void copy(Reader src, int srcPos, Mutable dest, int destPos, int len, long[] buf) { assert buf.length > 0; int remaining = 0; while (len > 0) { final int read = src.get(srcPos, buf, remaining, Math.min(len, buf.length - remaining)); assert read > 0; srcPos += read; len -= read; remaining += read; final int written = dest.set(destPos, buf, 0, remaining); assert written > 0; destPos += written; if (written < remaining) { System.arraycopy(buf, written, buf, 0, remaining - written); } remaining -= written; } while (remaining > 0) { final int written = dest.set(destPos, buf, 0, remaining); destPos += written; remaining -= written; System.arraycopy(buf, written, buf, 0, remaining); } } /** * Expert: reads only the metadata from a stream. This is useful to later * restore a stream or open a direct reader via * {@link #getReaderNoHeader(DataInput, Header)} * or {@link #getDirectReaderNoHeader(IndexInput, Header)}. * @param in the stream to read data * @return packed integer metadata. 
* @throws IOException If there is a low-level I/O error * @see #getReaderNoHeader(DataInput, Header) * @see #getDirectReaderNoHeader(IndexInput, Header) */ public static Header readHeader(DataInput in) throws IOException { final int version = CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START, VERSION_CURRENT); final int bitsPerValue = in.readVInt(); assert bitsPerValue > 0 && bitsPerValue <= 64: "bitsPerValue=" + bitsPerValue; final int valueCount = in.readVInt(); final Format format = Format.byId(in.readVInt()); return new Header(format, valueCount, bitsPerValue, version); } /** Header identifying the structure of a packed integer array. */ public static class Header { private final Format format; private final int valueCount; private final int bitsPerValue; private final int version; public Header(Format format, int valueCount, int bitsPerValue, int version) { this.format = format; this.valueCount = valueCount; this.bitsPerValue = bitsPerValue; this.version = version; } } /** Check that the block size is a power of 2, in the right bounds, and return * its log in base 2. */ static int checkBlockSize(int blockSize, int minBlockSize, int maxBlockSize) { if (blockSize < minBlockSize || blockSize > maxBlockSize) { throw new IllegalArgumentException("blockSize must be >= " + minBlockSize + " and <= " + maxBlockSize + ", got " + blockSize); } if ((blockSize & (blockSize - 1)) != 0) { throw new IllegalArgumentException("blockSize must be a power of two, got " + blockSize); } return Integer.numberOfTrailingZeros(blockSize); } /** Return the number of blocks required to store size values on * blockSize. */ static int numBlocks(long size, int blockSize) { final int numBlocks = (int) (size / blockSize) + (size % blockSize == 0 ? 0 : 1); if ((long) numBlocks * blockSize < size) { throw new IllegalArgumentException("size is too large for this block size"); } return numBlocks; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy