org.apache.lucene.backward_codecs.lucene87.DeflateWithPresetDictCompressionMode Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-backward-codecs Show documentation
Apache Lucene (module: backward-codecs)
There is a newer version: 9.11.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.backward_codecs.lucene87;

import java.io.IOException;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.compressing.Compressor;
import org.apache.lucene.codecs.compressing.Decompressor;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;

/**
 * A compression mode that trades speed for compression ratio. Although compression and
 * decompression might be slow, this compression mode should provide a good compression ratio. This
 * mode might be interesting if/when your index size is much bigger than your OS cache.
 *
 * @lucene.internal
 */
public final class DeflateWithPresetDictCompressionMode extends CompressionMode {

  // Shoot for 10 sub blocks
  private static final int NUM_SUB_BLOCKS = 10;
  // And a dictionary whose size is about 6x smaller than sub blocks
  private static final int DICT_SIZE_FACTOR = 6;

  /** Sole constructor. */
  public DeflateWithPresetDictCompressionMode() {}

  @Override
  public Compressor newCompressor() {
    // notes:
    // 3 is the highest level that doesn't have lazy match evaluation
    // 6 is the default, higher than that is just a waste of cpu
    return new DeflateWithPresetDictCompressor(6);
  }

  @Override
  public Decompressor newDecompressor() {
    return new DeflateWithPresetDictDecompressor();
  }

  @Override
  public String toString() {
    return "BEST_COMPRESSION";
  }

  private static final class DeflateWithPresetDictDecompressor extends Decompressor {

    byte[] compressed;

    DeflateWithPresetDictDecompressor() {
      compressed = new byte[0];
    }

    private void doDecompress(DataInput in, Inflater decompressor, BytesRef bytes)
        throws IOException {
      final int compressedLength = in.readVInt();
      if (compressedLength == 0) {
        return;
      }
      // pad with extra "dummy byte": see javadocs for using Inflater(true)
      // we do it for compliance, but it's unnecessary for years in zlib.
      final int paddedLength = compressedLength + 1;
      compressed = ArrayUtil.grow(compressed, paddedLength);
      in.readBytes(compressed, 0, compressedLength);
      compressed[compressedLength] = 0; // explicitly set dummy byte to 0

      // extra "dummy byte"
      decompressor.setInput(compressed, 0, paddedLength);
      try {
        bytes.length +=
            decompressor.inflate(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length);
      } catch (DataFormatException e) {
        throw new IOException(e);
      }
      if (decompressor.finished() == false) {
        throw new CorruptIndexException(
            "Invalid decoder state: needsInput="
                + decompressor.needsInput()
                + ", needsDict="
                + decompressor.needsDictionary(),
            in);
      }
    }

    @Override
    public void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes)
        throws IOException {
      assert offset + length <= originalLength;
      if (length == 0) {
        bytes.length = 0;
        return;
      }
      final int dictLength = in.readVInt();
      final int blockLength = in.readVInt();
      bytes.bytes = ArrayUtil.grow(bytes.bytes, dictLength);
      bytes.offset = bytes.length = 0;

      final Inflater decompressor = new Inflater(true);
      try {
        // Read the dictionary
        doDecompress(in, decompressor, bytes);
        if (dictLength != bytes.length) {
          throw new CorruptIndexException("Unexpected dict length", in);
        }

        int offsetInBlock = dictLength;
        int offsetInBytesRef = offset;

        // Skip unneeded blocks
        while (offsetInBlock + blockLength < offset) {
          final int compressedLength = in.readVInt();
          in.skipBytes(compressedLength);
          offsetInBlock += blockLength;
          offsetInBytesRef -= blockLength;
        }

        // Read blocks that intersect with the interval we need
        while (offsetInBlock < offset + length) {
          bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + blockLength);
          decompressor.reset();
          decompressor.setDictionary(bytes.bytes, 0, dictLength);
          doDecompress(in, decompressor, bytes);
          offsetInBlock += blockLength;
        }

        bytes.offset = offsetInBytesRef;
        bytes.length = length;
        assert bytes.isValid();
      } finally {
        decompressor.end();
      }
    }

    @Override
    public Decompressor clone() {
      return new DeflateWithPresetDictDecompressor();
    }
  }

  private static class DeflateWithPresetDictCompressor extends Compressor {

    final Deflater compressor;
    final BugfixDeflater_JDK8252739 deflaterBugfix;
    byte[] compressed;
    boolean closed;

    DeflateWithPresetDictCompressor(int level) {
      compressor = new Deflater(level, true);
      deflaterBugfix = BugfixDeflater_JDK8252739.createBugfix(compressor);
      compressed = new byte[64];
    }

    private void doCompress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
      if (len == 0) {
        out.writeVInt(0);
        return;
      }
      compressor.setInput(bytes, off, len);
      compressor.finish();
      if (compressor.needsInput()) {
        throw new IllegalStateException();
      }

      int totalCount = 0;
      for (; ; ) {
        final int count =
            compressor.deflate(compressed, totalCount, compressed.length - totalCount);
        totalCount += count;
        assert totalCount <= compressed.length;
        if (compressor.finished()) {
          break;
        } else {
          compressed = ArrayUtil.grow(compressed);
        }
      }

      out.writeVInt(totalCount);
      out.writeBytes(compressed, totalCount);
    }

    @Override
    public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
      final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR);
      final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS;
      out.writeVInt(dictLength);
      out.writeVInt(blockLength);
      final int end = off + len;

      // Compress the dictionary first
      compressor.reset();
      doCompress(bytes, off, dictLength, out);

      // And then sub blocks
      for (int start = off + dictLength; start < end; start += blockLength) {
        compressor.reset();
        deflaterBugfix.setDictionary(bytes, off, dictLength);
        doCompress(bytes, start, Math.min(blockLength, off + len - start), out);
      }
    }

    @Override
    public void close() throws IOException {
      if (closed == false) {
        compressor.end();
        closed = true;
      }
    }
  }
}