All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.druid.segment.data.CompressionFactory Maven / Gradle / Ivy

There is a newer version: 30.0.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.segment.data;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
import com.google.common.base.Supplier;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.segment.serde.MetaSerdeHelper;
import org.apache.druid.segment.writeout.SegmentWriteOutMedium;
import org.apache.druid.segment.writeout.WriteOutBytes;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;

/**
 * Compression of metrics is done by using a combination of {@link CompressionStrategy}
 * and Encoding(such as {@link LongEncodingStrategy} for type Long). CompressionStrategy is unaware of the data type
 * and is based on byte operations. It must compress and decompress in block of bytes. Encoding refers to compression
 * method relies on data format, so a different set of Encodings exist for each data type.
 * 

* Storage Format : * Byte 1 : version (currently 0x02) * Byte 2 - 5 : number of values * Byte 6 - 9 : size per block (even if block format isn't used, this is needed for backward compatibility) * Byte 10 : compression strategy (contains a flag if there's an encoding byte, see below for how the flag is defined) * Byte 11(optional) : encoding type *

* Encoding specific header (described below) *

* Block related header (if block compression is used, described in GenericIndexed) *

* Values */ public class CompressionFactory { private CompressionFactory() { // No instantiation } public static final LongEncodingStrategy DEFAULT_LONG_ENCODING_STRATEGY = LongEncodingStrategy.LONGS; // encoding format for segments created prior to the introduction of encoding formats public static final LongEncodingFormat LEGACY_LONG_ENCODING_FORMAT = LongEncodingFormat.LONGS; /** * Delta Encoding Header v1: * Byte 1 : version * Byte 2 - 9 : base value * Byte 10 - 13 : number of bits per value */ public static final byte DELTA_ENCODING_VERSION = 0x1; /** * Table Encoding Header v1 : * Byte 1 : version * Byte 2 - 5 : table size * Byte 6 - (6 + 8 * table size - 1) : table of encoding, where the ith 8-byte value is encoded as i */ public static final byte TABLE_ENCODING_VERSION = 0x1; public static final int MAX_TABLE_SIZE = 256; /* * There is no header or version for Longs encoding for backward compatibility */ /* * This is the flag mechanism for determine whether an encoding byte exist in the header. This is needed for * backward compatibility, since segment created prior to the introduction of encoding formats does not have the * encoding strategy byte. The flag is encoded in the compression strategy byte using the setEncodingFlag and * clearEncodingFlag function. */ // 0xFE(-2) should be the smallest valid compression strategy id private static byte FLAG_BOUND = (byte) 0xFE; // 126 is the value here since -2 - 126 = -128, which is the lowest byte value private static int FLAG_VALUE = 126; public static boolean hasEncodingFlag(byte strategyId) { return strategyId < FLAG_BOUND; } public static byte setEncodingFlag(byte strategyId) { return hasEncodingFlag(strategyId) ? strategyId : (byte) (strategyId - FLAG_VALUE); } public static byte clearEncodingFlag(byte strategyId) { return hasEncodingFlag(strategyId) ? (byte) (strategyId + FLAG_VALUE) : strategyId; } /** * The compression of decompression of encodings are separated into different enums. 
EncodingStrategy refers to the * strategy used to encode the data, and EncodingFormat refers to the format the data is encoded in. Note there is not * necessarily an one-to-one mapping between to two. For instance, the AUTO LongEncodingStrategy scans the data once * and decide on which LongEncodingFormat to use based on data property, so it's possible for the EncodingStrategy to * write in any of the LongEncodingFormat. On the other hand, there are no LongEncodingStrategy that always write in * TABLE LongEncodingFormat since it only works for data with low cardinality. */ public enum LongEncodingStrategy { /** * AUTO strategy scans all values once before encoding them. It stores the value cardinality and maximum offset * of the values to determine whether to use DELTA, TABLE, or LONGS format. */ AUTO, /** * LONGS strategy always encode the values using LONGS format */ LONGS; @JsonValue @Override public String toString() { return StringUtils.toLowerCase(this.name()); } @JsonCreator public static LongEncodingStrategy fromString(String name) { return valueOf(StringUtils.toUpperCase(name)); } } public enum LongEncodingFormat { /** * DELTA format encodes a series of longs by finding the smallest value first, and stores all values * as offset to the smallest value. The maximum value is also found to calculate how many bits are required * to store each offset using {@link VSizeLongSerde}. */ DELTA((byte) 0x0) { @Override public LongEncodingReader getReader(ByteBuffer buffer, ByteOrder order) { return new DeltaLongEncodingReader(buffer); } }, /** * TABLE format encodes a series of longs by mapping each unique value to an id, and string the id with the * minimum number of bits similar to how DELTA stores offset. TABLE format is only applicable to values with * less unique values than {@link CompressionFactory#MAX_TABLE_SIZE}. 
*/ TABLE((byte) 0x1) { @Override public LongEncodingReader getReader(ByteBuffer buffer, ByteOrder order) { return new TableLongEncodingReader(buffer); } }, /** * LONGS format encodes longs as is, using 8 bytes for each value. */ LONGS((byte) 0xFF) { @Override public LongEncodingReader getReader(ByteBuffer buffer, ByteOrder order) { return new LongsLongEncodingReader(buffer, order); } }; final byte id; LongEncodingFormat(byte id) { this.id = id; } public byte getId() { return id; } static final Map ID_MAP = new HashMap<>(); static { for (LongEncodingFormat format : LongEncodingFormat.values()) { ID_MAP.put(format.getId(), format); } } public abstract LongEncodingReader getReader(ByteBuffer buffer, ByteOrder order); public static LongEncodingFormat forId(byte id) { return ID_MAP.get(id); } } /** * This writer output encoded values to the given ByteBuffer or OutputStream. {@link #setBuffer(ByteBuffer)} or * {@link #setOutputStream(WriteOutBytes)} must be called before any value is written, and {@link #flush()} must * be called before calling setBuffer or setOutputStream again to set another output. */ public interface LongEncodingWriter { /** * Data will be written starting from current position of the buffer, and the position of the buffer will be * updated as content is written. */ void setBuffer(ByteBuffer buffer); void setOutputStream(WriteOutBytes output); void write(long value) throws IOException; /** * Flush the unwritten content to the current output. */ void flush() throws IOException; /** * Output the header values of the associating encoding format to the given outputStream. The header also include * bytes for compression strategy and encoding format(optional) as described above in Compression Storage Format. 
*/ void putMeta(ByteBuffer metaOut, CompressionStrategy strategy); int metaSize(); /** * Get the number of values that can be encoded into each block for the given block size in bytes */ int getBlockSize(int bytesPerBlock); /** * Get the number of bytes required to encoding the given number of values */ int getNumBytes(int values); } static MetaSerdeHelper.FieldWriter longEncodingWriter( Function getWriter, Function getCompressionStrategy ) { return new MetaSerdeHelper.FieldWriter() { @Override public void writeTo(ByteBuffer buffer, T x) { getWriter.apply(x).putMeta(buffer, getCompressionStrategy.apply(x)); } @Override public int size(T x) { return getWriter.apply(x).metaSize(); } }; } public interface LongEncodingReader { void setBuffer(ByteBuffer buffer); long read(int index); void read(long[] out, int outPosition, int startIndex, int length); int read(long[] out, int outPosition, int[] indexes, int length, int indexOffset, int limit); /** * Duplicates this reader, creating a new reader that does not share any state. Important to achieve thread-safety, * because a common pattern is to duplicate a reader multiple times and then call {@link #setBuffer} on the * various duplicates. 
*/ LongEncodingReader duplicate(); } public static Supplier getLongSupplier( int totalSize, int sizePer, ByteBuffer fromBuffer, ByteOrder order, LongEncodingFormat encodingFormat, CompressionStrategy strategy ) { if (strategy == CompressionStrategy.NONE) { return new EntireLayoutColumnarLongsSupplier(totalSize, encodingFormat.getReader(fromBuffer, order)); } else { return new BlockLayoutColumnarLongsSupplier( totalSize, sizePer, fromBuffer, order, encodingFormat.getReader(fromBuffer, order), strategy ); } } public static ColumnarLongsSerializer getLongSerializer( String columnName, SegmentWriteOutMedium segmentWriteOutMedium, String filenameBase, ByteOrder order, LongEncodingStrategy encodingStrategy, CompressionStrategy compressionStrategy ) { if (encodingStrategy == LongEncodingStrategy.AUTO) { return new IntermediateColumnarLongsSerializer( columnName, segmentWriteOutMedium, filenameBase, order, compressionStrategy ); } else if (encodingStrategy == LongEncodingStrategy.LONGS) { if (compressionStrategy == CompressionStrategy.NONE) { return new EntireLayoutColumnarLongsSerializer( columnName, segmentWriteOutMedium, new LongsLongEncodingWriter(order) ); } else { return new BlockLayoutColumnarLongsSerializer( columnName, segmentWriteOutMedium, filenameBase, order, new LongsLongEncodingWriter(order), compressionStrategy ); } } else { throw new IAE("unknown encoding strategy : %s", encodingStrategy.toString()); } } // Float currently does not support any encoding types, and stores values as 4 byte float public static Supplier getFloatSupplier( int totalSize, int sizePer, ByteBuffer fromBuffer, ByteOrder order, CompressionStrategy strategy ) { if (strategy == CompressionStrategy.NONE) { return new EntireLayoutColumnarFloatsSupplier(totalSize, fromBuffer, order); } else { return new BlockLayoutColumnarFloatsSupplier(totalSize, sizePer, fromBuffer, order, strategy); } } public static ColumnarFloatsSerializer getFloatSerializer( String columnName, SegmentWriteOutMedium 
segmentWriteOutMedium, String filenameBase, ByteOrder order, CompressionStrategy compressionStrategy ) { if (compressionStrategy == CompressionStrategy.NONE) { return new EntireLayoutColumnarFloatsSerializer(columnName, segmentWriteOutMedium, order); } else { return new BlockLayoutColumnarFloatsSerializer( columnName, segmentWriteOutMedium, filenameBase, order, compressionStrategy ); } } public static Supplier getDoubleSupplier( int totalSize, int sizePer, ByteBuffer fromBuffer, ByteOrder byteOrder, CompressionStrategy strategy ) { switch (strategy) { case NONE: return new EntireLayoutColumnarDoublesSupplier(totalSize, fromBuffer, byteOrder); default: return new BlockLayoutColumnarDoublesSupplier(totalSize, sizePer, fromBuffer, byteOrder, strategy); } } public static ColumnarDoublesSerializer getDoubleSerializer( String columnName, SegmentWriteOutMedium segmentWriteOutMedium, String filenameBase, ByteOrder byteOrder, CompressionStrategy compression ) { if (compression == CompressionStrategy.NONE) { return new EntireLayoutColumnarDoublesSerializer(columnName, segmentWriteOutMedium, byteOrder); } else { return new BlockLayoutColumnarDoublesSerializer( columnName, segmentWriteOutMedium, filenameBase, byteOrder, compression ); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy