All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.druid.segment.data.GenericIndexed Maven / Gradle / Ivy

/*
 * Licensed to Metamarkets Group Inc. (Metamarkets) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Metamarkets licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package io.druid.segment.data;

import com.google.common.primitives.Ints;
import io.druid.common.utils.SerializerUtils;
import io.druid.io.ZeroCopyByteArrayOutputStream;
import io.druid.java.util.common.IAE;
import io.druid.java.util.common.StringUtils;
import io.druid.java.util.common.guava.CloseQuietly;
import io.druid.java.util.common.guava.Comparators;
import io.druid.java.util.common.io.smoosh.SmooshedFileMapper;
import io.druid.query.monomorphicprocessing.RuntimeShapeInspector;
import it.unimi.dsi.fastutil.bytes.ByteArrays;

import java.io.Closeable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.WritableByteChannel;
import java.util.Arrays;
import java.util.Iterator;

/**
 * A generic, flat storage mechanism.  Use static methods fromArray() or fromIterable() to construct.  If input
 * is sorted, supports binary search index lookups.  If input is not sorted, only supports array-like index lookups.
 * 

* V1 Storage Format: *

* byte 1: version (0x1) * byte 2 == 0x1 =>; allowReverseLookup * bytes 3-6 =>; numBytesUsed * bytes 7-10 =>; numElements * bytes 10-((numElements * 4) + 10): integers representing *end* offsets of byte serialized values * bytes ((numElements * 4) + 10)-(numBytesUsed + 2): 4-byte integer representing length of value, followed by bytes * for value *

* V2 Storage Format * Meta, header and value files are separate and header file stored in native endian byte order. * Meta File: * byte 1: version (0x2) * byte 2 == 0x1 =>; allowReverseLookup * bytes 3-6: numberOfElementsPerValueFile expressed as power of 2. That means all the value files contains same * number of items except last value file and may have fewer elements. * bytes 7-10 =>; numElements * bytes 11-14 =>; columnNameLength * bytes 15-columnNameLength =>; columnName *

* Header file name is identified as: StringUtils.format("%s_header", columnName) * value files are identified as: StringUtils.format("%s_value_%d", columnName, fileNumber) * number of value files == numElements/numberOfElementsPerValueFile */ public class GenericIndexed implements Indexed { static final byte VERSION_ONE = 0x1; static final byte VERSION_TWO = 0x2; static final byte REVERSE_LOOKUP_ALLOWED = 0x1; static final byte REVERSE_LOOKUP_DISALLOWED = 0x0; private static final SerializerUtils SERIALIZER_UTILS = new SerializerUtils(); public static final ObjectStrategy STRING_STRATEGY = new CacheableObjectStrategy() { @Override public Class getClazz() { return String.class; } @Override public String fromByteBuffer(final ByteBuffer buffer, final int numBytes) { return StringUtils.fromUtf8(buffer, numBytes); } @Override public byte[] toBytes(String val) { if (val == null) { return ByteArrays.EMPTY_ARRAY; } return StringUtils.toUtf8(val); } @Override public int compare(String o1, String o2) { return Comparators.naturalNullsFirst().compare(o1, o2); } }; public static GenericIndexed read(ByteBuffer buffer, ObjectStrategy strategy) { byte versionFromBuffer = buffer.get(); if (VERSION_ONE == versionFromBuffer) { return createGenericIndexedVersionOne(buffer, strategy); } else if (VERSION_TWO == versionFromBuffer) { throw new IAE( "use read(ByteBuffer buffer, ObjectStrategy strategy, SmooshedFileMapper fileMapper)" + " to read version 2 indexed." ); } throw new IAE("Unknown version[%d]", (int) versionFromBuffer); } public static GenericIndexed read(ByteBuffer buffer, ObjectStrategy strategy, SmooshedFileMapper fileMapper) { byte versionFromBuffer = buffer.get(); if (VERSION_ONE == versionFromBuffer) { return createGenericIndexedVersionOne(buffer, strategy); } else if (VERSION_TWO == versionFromBuffer) { return createGenericIndexedVersionTwo(buffer, strategy, fileMapper); } throw new IAE("Unknown version [%s]", versionFromBuffer); } public static GenericIndexed fromArray(T[] objects, ObjectStrategy strategy) { return fromIterable(Arrays.asList(objects), strategy); } public static GenericIndexed fromIterable(Iterable objectsIterable, ObjectStrategy strategy) { return fromIterableVersionOne(objectsIterable, strategy); } static int getNumberOfFilesRequired(int bagSize, long numWritten) { int numberOfFilesRequired = (int) (numWritten / bagSize); if ((numWritten % bagSize) != 0) { numberOfFilesRequired += 1; } return numberOfFilesRequired; } private final boolean versionOne; private final ObjectStrategy strategy; private final boolean allowReverseLookup; private final int size; private final ByteBuffer headerBuffer; private final ByteBuffer firstValueBuffer; private final ByteBuffer[] valueBuffers; private int logBaseTwoOfElementsPerValueFile; private int relativeIndexMask; private final ByteBuffer theBuffer; /** * Constructor for version one. */ GenericIndexed( ByteBuffer buffer, ObjectStrategy strategy, boolean allowReverseLookup ) { this.versionOne = true; this.theBuffer = buffer; this.strategy = strategy; this.allowReverseLookup = allowReverseLookup; size = theBuffer.getInt(); int indexOffset = theBuffer.position(); int valuesOffset = theBuffer.position() + size * Ints.BYTES; buffer.position(valuesOffset); // Ensure the value buffer's limit equals to capacity. firstValueBuffer = buffer.slice(); valueBuffers = new ByteBuffer[]{firstValueBuffer}; buffer.position(indexOffset); headerBuffer = buffer.slice(); } /** * Constructor for version two. */ GenericIndexed( ByteBuffer[] valueBuffs, ByteBuffer headerBuff, ObjectStrategy strategy, boolean allowReverseLookup, int logBaseTwoOfElementsPerValueFile, int numWritten ) { this.versionOne = false; this.theBuffer = null; this.strategy = strategy; this.allowReverseLookup = allowReverseLookup; this.valueBuffers = valueBuffs; this.firstValueBuffer = valueBuffers[0]; this.headerBuffer = headerBuff; this.size = numWritten; this.logBaseTwoOfElementsPerValueFile = logBaseTwoOfElementsPerValueFile; this.relativeIndexMask = (1 << logBaseTwoOfElementsPerValueFile) - 1; headerBuffer.order(ByteOrder.nativeOrder()); } /** * Checks if {@code index} a valid `element index` in GenericIndexed. * Similar to Preconditions.checkElementIndex() except this method throws {@link IAE} with custom error message. *

* Used here to get existing behavior(same error message and exception) of V1 GenericIndexed. * * @param index index identifying an element of an GenericIndexed. */ private void checkIndex(int index) { if (index < 0) { throw new IAE("Index[%s] < 0", index); } if (index >= size) { throw new IAE("Index[%d] >= size[%d]", index, size); } } @Override public Class getClazz() { return strategy.getClazz(); } @Override public int size() { return size; } @Override public T get(int index) { return versionOne ? getVersionOne(index) : getVersionTwo(index); } /** * Returns the index of "value" in this GenericIndexed object, or (-(insertion point) - 1) if the value is not * present, in the manner of Arrays.binarySearch. This strengthens the contract of Indexed, which only guarantees * that values-not-found will return some negative number. * * @param value value to search for * * @return index of value, or negative number equal to (-(insertion point) - 1). */ @Override public int indexOf(T value) { return indexOf(this, value); } private int indexOf(Indexed indexed, T value) { if (!allowReverseLookup) { throw new UnsupportedOperationException("Reverse lookup not allowed."); } value = (value != null && value.equals("")) ? null : value; int minIndex = 0; int maxIndex = size - 1; while (minIndex <= maxIndex) { int currIndex = (minIndex + maxIndex) >>> 1; T currValue = indexed.get(currIndex); int comparison = strategy.compare(currValue, value); if (comparison == 0) { return currIndex; } if (comparison < 0) { minIndex = currIndex + 1; } else { maxIndex = currIndex - 1; } } return -(minIndex + 1); } @Override public Iterator iterator() { return IndexedIterable.create(this).iterator(); } public long getSerializedSize() { if (!versionOne) { throw new UnsupportedOperationException("Method not supported for version 2 GenericIndexed."); } return getSerializedSizeVersionOne(); } public void writeToChannel(WritableByteChannel channel) throws IOException { if (versionOne) { writeToChannelVersionOne(channel); } else { throw new UnsupportedOperationException( "GenericIndexed serialization for V2 is unsupported. Use GenericIndexedWriter instead."); } } /** * Create a non-thread-safe Indexed, which may perform better than the underlying Indexed. * * @return a non-thread-safe Indexed */ public GenericIndexed.BufferIndexed singleThreaded() { return versionOne ? singleThreadedVersionOne() : singleThreadedVersionTwo(); } private T copyBufferAndGet(ByteBuffer valueBuffer, int startOffset, int endOffset) { final int size = endOffset - startOffset; if (size == 0) { return null; } ByteBuffer copyValueBuffer = valueBuffer.asReadOnlyBuffer(); copyValueBuffer.position(startOffset); // fromByteBuffer must not modify the buffer limit return strategy.fromByteBuffer(copyValueBuffer, size); } @Override public void inspectRuntimeShape(RuntimeShapeInspector inspector) { inspector.visit("versionOne", versionOne); inspector.visit("headerBuffer", headerBuffer); if (versionOne) { inspector.visit("firstValueBuffer", firstValueBuffer); } else { // Inspecting just one example of valueBuffer, not needed to inspect the whole array, because all buffers in it // are the same. inspector.visit("valueBuffer", valueBuffers.length > 0 ? valueBuffers[0] : null); } inspector.visit("strategy", strategy); } abstract class BufferIndexed implements Indexed { int lastReadSize; @Override public Class getClazz() { return strategy.getClazz(); } @Override public int size() { return size; } T bufferedIndexedGet(ByteBuffer copyValueBuffer, int startOffset, int endOffset) { final int size = endOffset - startOffset; lastReadSize = size; if (size == 0) { return null; } // ObjectStrategy.fromByteBuffer() is allowed to reset the limit of the buffer. So if the limit is changed, // position() call in the next line could throw an exception, if the position is set beyond the new limit. clear() // sets the limit to the maximum possible, the capacity. It is safe to reset the limit to capacity, because the // value buffer(s) initial limit equals to capacity. copyValueBuffer.clear(); copyValueBuffer.position(startOffset); return strategy.fromByteBuffer(copyValueBuffer, size); } /** * This method makes no guarantees with respect to thread safety * * @return the size in bytes of the last value read */ int getLastValueSize() { return lastReadSize; } @Override public int indexOf(T value) { return GenericIndexed.this.indexOf(this, value); } @Override public Iterator iterator() { return GenericIndexed.this.iterator(); } } /////////////// // VERSION ONE /////////////// private static GenericIndexed createGenericIndexedVersionOne(ByteBuffer byteBuffer, ObjectStrategy strategy) { boolean allowReverseLookup = byteBuffer.get() == REVERSE_LOOKUP_ALLOWED; int size = byteBuffer.getInt(); ByteBuffer bufferToUse = byteBuffer.asReadOnlyBuffer(); bufferToUse.limit(bufferToUse.position() + size); byteBuffer.position(bufferToUse.limit()); return new GenericIndexed<>( bufferToUse, strategy, allowReverseLookup ); } private static GenericIndexed fromIterableVersionOne(Iterable objectsIterable, ObjectStrategy strategy) { Iterator objects = objectsIterable.iterator(); if (!objects.hasNext()) { final ByteBuffer buffer = ByteBuffer.allocate(Ints.BYTES).putInt(0); buffer.flip(); return new GenericIndexed<>(buffer, strategy, true); } boolean allowReverseLookup = true; int count = 0; ZeroCopyByteArrayOutputStream headerBytes = new ZeroCopyByteArrayOutputStream(); ZeroCopyByteArrayOutputStream valueBytes = new ZeroCopyByteArrayOutputStream(); ByteBuffer helperBuffer = ByteBuffer.allocate(Ints.BYTES); try { int offset = 0; T prevVal = null; do { count++; T next = objects.next(); if (allowReverseLookup && prevVal != null && !(strategy.compare(prevVal, next) < 0)) { allowReverseLookup = false; } final byte[] bytes = strategy.toBytes(next); offset += Ints.BYTES + bytes.length; SerializerUtils.writeBigEndianIntToOutputStream(headerBytes, offset, helperBuffer); SerializerUtils.writeBigEndianIntToOutputStream(valueBytes, bytes.length, helperBuffer); valueBytes.write(bytes); if (prevVal instanceof Closeable) { CloseQuietly.close((Closeable) prevVal); } prevVal = next; } while (objects.hasNext()); if (prevVal instanceof Closeable) { CloseQuietly.close((Closeable) prevVal); } } catch (IOException e) { throw new RuntimeException(e); } ByteBuffer theBuffer = ByteBuffer.allocate(Ints.BYTES + headerBytes.size() + valueBytes.size()); theBuffer.putInt(count); headerBytes.writeTo(theBuffer); valueBytes.writeTo(theBuffer); theBuffer.flip(); return new GenericIndexed<>(theBuffer.asReadOnlyBuffer(), strategy, allowReverseLookup); } private long getSerializedSizeVersionOne() { return theBuffer.remaining() + 1 // version byte + 1 // allowReverseLookup flag + Ints.BYTES // numBytesUsed + Ints.BYTES; // numElements } private T getVersionOne(int index) { checkIndex(index); final int startOffset; final int endOffset; if (index == 0) { startOffset = Ints.BYTES; endOffset = headerBuffer.getInt(0); } else { int headerPosition = (index - 1) * Ints.BYTES; startOffset = headerBuffer.getInt(headerPosition) + Ints.BYTES; endOffset = headerBuffer.getInt(headerPosition + Ints.BYTES); } return copyBufferAndGet(firstValueBuffer, startOffset, endOffset); } private BufferIndexed singleThreadedVersionOne() { final ByteBuffer copyBuffer = firstValueBuffer.asReadOnlyBuffer(); return new BufferIndexed() { @Override public T get(final int index) { checkIndex(index); final int startOffset; final int endOffset; if (index == 0) { startOffset = 4; endOffset = headerBuffer.getInt(0); } else { int headerPosition = (index - 1) * Ints.BYTES; startOffset = headerBuffer.getInt(headerPosition) + Ints.BYTES; endOffset = headerBuffer.getInt(headerPosition + Ints.BYTES); } return bufferedIndexedGet(copyBuffer, startOffset, endOffset); } @Override public void inspectRuntimeShape(RuntimeShapeInspector inspector) { inspector.visit("headerBuffer", headerBuffer); inspector.visit("copyBuffer", copyBuffer); inspector.visit("strategy", strategy); } }; } private void writeToChannelVersionOne(WritableByteChannel channel) throws IOException { channel.write(ByteBuffer.wrap(new byte[]{ VERSION_ONE, allowReverseLookup ? REVERSE_LOOKUP_ALLOWED : REVERSE_LOOKUP_DISALLOWED })); channel.write(ByteBuffer.wrap(Ints.toByteArray(theBuffer.remaining() + Ints.BYTES))); channel.write(ByteBuffer.wrap(Ints.toByteArray(size))); channel.write(theBuffer.asReadOnlyBuffer()); } /////////////// // VERSION TWO /////////////// private static GenericIndexed createGenericIndexedVersionTwo( ByteBuffer byteBuffer, ObjectStrategy strategy, SmooshedFileMapper fileMapper ) { if (fileMapper == null) { throw new IAE("SmooshedFileMapper can not be null for version 2."); } boolean allowReverseLookup = byteBuffer.get() == REVERSE_LOOKUP_ALLOWED; int logBaseTwoOfElementsPerValueFile = byteBuffer.getInt(); int numElements = byteBuffer.getInt(); try { String columnName = SERIALIZER_UTILS.readString(byteBuffer); int elementsPerValueFile = 1 << logBaseTwoOfElementsPerValueFile; int numberOfFilesRequired = getNumberOfFilesRequired(elementsPerValueFile, numElements); ByteBuffer[] valueBuffersToUse = new ByteBuffer[numberOfFilesRequired]; for (int i = 0; i < numberOfFilesRequired; i++) { // SmooshedFileMapper.mapFile() contract guarantees that the valueBuffer's limit equals to capacity. ByteBuffer valueBuffer = fileMapper.mapFile(GenericIndexedWriter.generateValueFileName(columnName, i)); valueBuffersToUse[i] = valueBuffer.asReadOnlyBuffer(); } ByteBuffer headerBuffer = fileMapper.mapFile(GenericIndexedWriter.generateHeaderFileName(columnName)); return new GenericIndexed<>( valueBuffersToUse, headerBuffer, strategy, allowReverseLookup, logBaseTwoOfElementsPerValueFile, numElements ); } catch (IOException e) { throw new RuntimeException("File mapping failed.", e); } } private T getVersionTwo(int index) { checkIndex(index); final int startOffset; final int endOffset; int relativePositionOfIndex = index & relativeIndexMask; if (relativePositionOfIndex == 0) { int headerPosition = index * Ints.BYTES; startOffset = Ints.BYTES; endOffset = headerBuffer.getInt(headerPosition); } else { int headerPosition = (index - 1) * Ints.BYTES; startOffset = headerBuffer.getInt(headerPosition) + Ints.BYTES; endOffset = headerBuffer.getInt(headerPosition + Ints.BYTES); } int fileNum = index >> logBaseTwoOfElementsPerValueFile; return copyBufferAndGet(valueBuffers[fileNum], startOffset, endOffset); } private BufferIndexed singleThreadedVersionTwo() { final ByteBuffer[] copyValueBuffers = new ByteBuffer[valueBuffers.length]; for (int i = 0; i < valueBuffers.length; i++) { copyValueBuffers[i] = valueBuffers[i].asReadOnlyBuffer(); } return new BufferIndexed() { @Override public T get(final int index) { checkIndex(index); final int startOffset; final int endOffset; int relativePositionOfIndex = index & relativeIndexMask; if (relativePositionOfIndex == 0) { int headerPosition = index * Ints.BYTES; startOffset = 4; endOffset = headerBuffer.getInt(headerPosition); } else { int headerPosition = (index - 1) * Ints.BYTES; startOffset = headerBuffer.getInt(headerPosition) + Ints.BYTES; endOffset = headerBuffer.getInt(headerPosition + Ints.BYTES); } int fileNum = index >> logBaseTwoOfElementsPerValueFile; return bufferedIndexedGet(copyValueBuffers[fileNum], startOffset, endOffset); } @Override public void inspectRuntimeShape(RuntimeShapeInspector inspector) { inspector.visit("headerBuffer", headerBuffer); // Inspecting just one example of copyValueBuffer, not needed to inspect the whole array, because all buffers // in it are the same. inspector.visit("copyValueBuffer", copyValueBuffers.length > 0 ? copyValueBuffers[0] : null); inspector.visit("strategy", strategy); } }; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy