package com.indeed.mph;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;

/**
 * Configuration for TableWriter (serialized and loaded automatically for TableReader).
 *
 * The fluent API lets you set the following fields (see the example below):
 *
 *   keySerializer: A SmartSerializer applied to the keys (required).
 *
 *   valueSerializer: A SmartSerializer applied to the values
 *     (optional, without which values are not stored).
 *
 *   offsetStorage: The hash function maps keys to hash buckets, which
 *     we then need to map to offsets in the data file.  By default,
 *     the most compact storage representation is chosen
 *     automatically, but you can manually override:
 *       - FIXED: if the entries are all a fixed size, we don't need to store offsets
 *       - INDEXED: offsets are just a flat table indexed by hash bucket
 *       - SELECTED: offsets are represented as a bit-vector of all bytes in the data
 *           file, and we use a Rank/Select algorithm to quickly map from hash bucket
 *           to corresponding starting offset
 *     In general, SELECTED is better if you have many small entries,
 *     but INDEXED is better if individual entries are large.
 *
 *   keyStorage: EXPLICIT by default, but can be set to IMPLICIT
 *     (along with specifying a signatureWidth) to omit the keys from
 *     the table at the expense of allowing false positives.  In many
 *     cases you know you will only be querying existing keys, so
 *     there's no reason to store them.
 *
 *   signatureWidth: The number of bits per key to use in a Bloom
 *     filter (required for IMPLICIT keyStorage).
 *
 *   maxHeapUsage: If positive, the limit beyond which offsets are
 *     mmapped instead of being stored directly in the heap.  By
 *     default we always store offsets in the heap.
 *
 *   maxDataHeapUsage: If positive, the limit beyond which data is
 *     mmapped instead of being stored directly in the heap.  By
 *     default we never store data in the heap.
 *
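 *   tempShardSize: The size in bytes of the temporary shards written
 *     while building the table (64MB by default; inferred from
 *     DEFAULT_SHARD_SIZE and withTempShardSize below).
 *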
 *   debugDuplicateKeys: If true, asks TableWriter to attempt to
 *     determine which keys were duplicated when building the hash
 *     function fails.
 *
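 * Example (a minimal sketch; SmartLongSerializer from
 * com.indeed.mph.serializers is assumed to be available):
 *
 * <pre>{@code
 * TableConfig<Long, Long> config = new TableConfig<Long, Long>()
 *     .withKeySerializer(new SmartLongSerializer())
 *     .withValueSerializer(new SmartLongSerializer());
 *
 * // Omit the keys, keeping a 32-bit signature to filter out
 * // false positives:
 * TableConfig<Long, Long> implicit = config
 *     .withKeyStorage(KeyStorage.IMPLICIT)
 *     .withSignatureWidth(32);
 * }</pre>
 *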
 * @param <K> key type
 * @param <V> value type
 *
 * @author alexs
 */
public class TableConfig<K, V> implements Serializable {
    public static final long DEFAULT_SHARD_SIZE = 64 * 1024 * 1024;
    private static final long serialVersionUID = 927763169;
    private final SmartSerializer<? super K> keySerializer;
    private final SmartSerializer<? super V> valueSerializer;
    private final LinearDiophantineEquation entrySizeEq;
    private final KeyValidator<K, V> keyValidator;
    private final KeyStorage keyStorage;
    private final OffsetStorage offsetStorage;
    private final int signatureWidth;
    private final long maxHeapUsage;
    private final long maxDataHeapUsage;
    private final long tempShardSize;
    private final boolean debugDuplicateKeys;

    TableConfig(@Nullable final SmartSerializer<? super K> keySerializer,
                @Nullable final SmartSerializer<? super V> valueSerializer,
                @Nullable final KeyValidator<K, V> keyValidator,
                final KeyStorage keyStorage,
                final OffsetStorage offsetStorage,
                final int signatureWidth,
                final long maxHeapUsage,
                final long maxDataHeapUsage,
                final long tempShardSize,
                final boolean debugDuplicateKeys) {
        this.keySerializer = keySerializer;
        this.valueSerializer = valueSerializer;
        this.keyValidator = keyValidator;
        this.keyStorage = keyStorage;
        this.offsetStorage = offsetStorage;
        this.signatureWidth = signatureWidth;
        this.maxHeapUsage = maxHeapUsage;
        this.maxDataHeapUsage = maxDataHeapUsage;
        this.tempShardSize = tempShardSize;
        this.debugDuplicateKeys = debugDuplicateKeys;
        final LinearDiophantineEquation valueSizeEq = valueSerializer == null ?
            LinearDiophantineEquation.constantValue(0L) :
            valueSerializer.size() == null ? LinearDiophantineEquation.multipleOf(1L) : valueSerializer.size();
        final LinearDiophantineEquation keySizeEq =
            (KeyStorage.IMPLICIT.equals(keyStorage) || keySerializer == null) ?
            LinearDiophantineEquation.constantValue(0L) : keySerializer.size();
        this.entrySizeEq =
            (keySizeEq == null ? LinearDiophantineEquation.multipleOf(1L) : keySizeEq).add(valueSizeEq);
    }

    public TableConfig() {
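        // Defaults: no serializers, exact key equality validation,
        // EXPLICIT key storage, AUTOMATIC offset storage, offsets kept
        // on the heap, data never kept on the heap, 64MB temp shards,
        // and no duplicate-key debugging.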
        this(null, null, new EqualKeyValidator<>(), KeyStorage.EXPLICIT, OffsetStorage.AUTOMATIC, 0, 0, 0, DEFAULT_SHARD_SIZE, false);
    }

    public SmartSerializer<? super K> getKeySerializer() {
        return keySerializer;
    }

    public SmartSerializer<? super V> getValueSerializer() {
        return valueSerializer;
    }

    public KeyValidator<K, V> getKeyValidator() {
        return keyValidator;
    }

    public KeyStorage getKeyStorage() {
        return keyStorage;
    }

    public OffsetStorage getOffsetStorage() {
        return offsetStorage;
    }

    public int getSignatureWidth() {
        return signatureWidth;
    }

    public long getMaxHeapUsage() {
        return maxHeapUsage;
    }

    public long getMaxDataHeapUsage() {
        return maxDataHeapUsage;
    }

    public long getTempShardSize() {
        return tempShardSize;
    }

    public boolean getDebugDuplicateKeys() {
        return debugDuplicateKeys;
    }

    public LinearDiophantineEquation getEntrySize() {
        return entrySizeEq;
    }

    public long sizeOf(final K key, final V value) throws IOException {
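        // Size in bytes of one serialized entry; with IMPLICIT key
        // storage the key contributes nothing.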
        return (TableConfig.KeyStorage.IMPLICIT.equals(getKeyStorage()) ? 0 : getKeySerializer().sizeOf(key)) +
            (getValueSerializer() == null ? 0 : getValueSerializer().sizeOf(value));
    }

    // We add an extra 1*n to ensure that every compressed offset
    // is unique.
    // TODO: consider checking case-by-case if this is needed
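    // Assuming applyNth and solveForNth are inverses for valid entry
    // offsets, the two methods below round-trip:
    //   decompressOffset(compressOffset(offset, n), n) == offset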

    public long compressOffset(final long offset, final long n) {
        return entrySizeEq.solveForNth(offset, n) + n;
    }

    public long decompressOffset(final long value, final long n) {
        return entrySizeEq.applyNth(value - n, n);
    }

    public OffsetStorage chooseBestOffsetStorage(final long numEntries, final long dataSize) {
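        // Constant-size entries need no stored offsets at all; otherwise
        // pick whichever representation is estimated to be smaller.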
        if (entrySizeEq.isConstant()) {
            return OffsetStorage.FIXED;
        }
        final long indexedSize = getIndexedOffsetSize(numEntries, dataSize);
        final long selectedSize = getSelectedOffsetSize(numEntries, dataSize);
        return indexedSize <= selectedSize ? OffsetStorage.INDEXED : OffsetStorage.SELECTED;
    }

    public long getIndexedOffsetSize(final long numEntries, final long dataSize) {
        return numEntries * bytesPerOffset(numEntries, dataSize);
    }

    public long getSelectedOffsetSize(final long numEntries, final long dataSize) {
        final long maxValue = compressOffset(dataSize, numEntries);
        return ((maxValue * 3L) / 64L);
    }

    public int bytesPerOffset(final long numEntries, final long dataSize) {
        return bytesPerLong(dataSize);
    }

    public static int bytesPerLong(final long maxValue) { // currently handles only power-of-2 byte widths
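        // e.g. bytesPerLong(1000) == 2, bytesPerLong(100_000) == 4,
        // bytesPerLong(10_000_000_000L) == 8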
        if (maxValue < Short.MAX_VALUE) {
            return 2;
        }
        if (maxValue < Integer.MAX_VALUE) {
            return 4;
        }
        return 8;
    }

    public boolean isValid() {
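        // A key serializer is mandatory, and FIXED offset storage is
        // only coherent when every entry has the same constant size.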
        return keySerializer != null &&
            (!OffsetStorage.FIXED.equals(offsetStorage) || entrySizeEq.isConstant());
    }

    public K readKey(final DataInput in) throws IOException {
        return (K) ((TableConfig.KeyStorage.IMPLICIT.equals(keyStorage) || keySerializer == null) ? null :
                    keySerializer.read(in));
    }

    public V readValue(final DataInput in) throws IOException {
        return (V) ((valueSerializer == null) ? null : valueSerializer.read(in));
    }

    public void write(final K k, final V v, final DataOutput out) throws IOException {
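        // With IMPLICIT key storage the key is never written; reads
        // must rely on the hash (plus any Bloom-filter signature) alone.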
        if (!KeyStorage.IMPLICIT.equals(keyStorage)) {
            keySerializer.write(k, out);
        }
        if (valueSerializer != null) {
            valueSerializer.write(v, out);
        }
    }

    public TableConfig<K, V> withKeySerializer(final SmartSerializer<? super K> serializer) {
        return new TableConfig<>(serializer, valueSerializer, keyValidator, keyStorage, offsetStorage, signatureWidth, maxHeapUsage, maxDataHeapUsage, tempShardSize, debugDuplicateKeys);
    }

    public TableConfig<K, V> withValueSerializer(final SmartSerializer<? super V> serializer) {
        return new TableConfig<>(keySerializer, serializer, keyValidator, keyStorage, offsetStorage, signatureWidth, maxHeapUsage, maxDataHeapUsage, tempShardSize, debugDuplicateKeys);
    }

    public TableConfig<K, V> withKeyValidator(final KeyValidator<K, V> validator) {
        return new TableConfig<>(keySerializer, valueSerializer, validator, keyStorage, offsetStorage, signatureWidth, maxHeapUsage, maxDataHeapUsage, tempShardSize, debugDuplicateKeys);
    }

    public TableConfig<K, V> withKeyStorage(final KeyStorage storage) {
        return new TableConfig<>(keySerializer, valueSerializer, keyValidator, storage, offsetStorage, signatureWidth, maxHeapUsage, maxDataHeapUsage, tempShardSize, debugDuplicateKeys);
    }

    public TableConfig<K, V> withOffsetStorage(final OffsetStorage storage) {
        return new TableConfig<>(keySerializer, valueSerializer, keyValidator, keyStorage, storage, signatureWidth, maxHeapUsage, maxDataHeapUsage, tempShardSize, debugDuplicateKeys);
    }

    public TableConfig<K, V> withSignatureWidth(final int width) {
        return new TableConfig<>(keySerializer, valueSerializer, keyValidator, keyStorage, offsetStorage, width, maxHeapUsage, maxDataHeapUsage, tempShardSize, debugDuplicateKeys);
    }

    public TableConfig<K, V> withMaxHeapUsage(final long maxHeap) {
        return new TableConfig<>(keySerializer, valueSerializer, keyValidator, keyStorage, offsetStorage, signatureWidth, maxHeap, maxDataHeapUsage, tempShardSize, debugDuplicateKeys);
    }

    public TableConfig<K, V> withMaxDataHeapUsage(final long maxDataHeap) {
        return new TableConfig<>(keySerializer, valueSerializer, keyValidator, keyStorage, offsetStorage, signatureWidth, maxHeapUsage, maxDataHeap, tempShardSize, debugDuplicateKeys);
    }

    public TableConfig<K, V> withTempShardSize(final long shardSize) {
        return new TableConfig<>(keySerializer, valueSerializer, keyValidator, keyStorage, offsetStorage, signatureWidth, maxHeapUsage, maxDataHeapUsage, shardSize, debugDuplicateKeys);
    }

    public TableConfig<K, V> withDebugDuplicateKeys(final boolean debugDupKeys) {
        return new TableConfig<>(keySerializer, valueSerializer, keyValidator, keyStorage, offsetStorage, signatureWidth, maxHeapUsage, maxDataHeapUsage, tempShardSize, debugDupKeys);
    }

    @Override
    public String toString() {
        return "[TableConfig keys: " + keySerializer + " values: " + valueSerializer +
            " keyStorage: " + keyStorage + " offsetStorage: " + offsetStorage +
            " validator: " + keyValidator + " signatureWidth: " + signatureWidth +
            " maxHeapUsage: " + maxHeapUsage + " maxDataHeapUsage: " + maxDataHeapUsage +
            " entrySize: " + entrySizeEq + " debugDupKeys: " + debugDuplicateKeys + "]";
    }

    public enum KeyStorage {
        EXPLICIT,               // default
        IMPLICIT                // don't actually store the keys
    }

    public enum OffsetStorage {
        AUTOMATIC,              // choose optimal storage
        INDEXED,                // an indexed array of offsets per hash
        SELECTED,               // a rank-select lookup per hash
        FIXED                   // fixed size entries
    }
}