All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.druid.segment.nested.GlobalDictionaryEncodedFieldColumnWriter Maven / Gradle / Ivy

There is a newer version: 30.0.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.segment.nested;

import com.google.common.base.Preconditions;
import com.google.common.primitives.Ints;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectRBTreeMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntIterator;
import org.apache.druid.collections.bitmap.ImmutableBitmap;
import org.apache.druid.collections.bitmap.MutableBitmap;
import org.apache.druid.io.Channels;
import org.apache.druid.java.util.common.io.smoosh.FileSmoosher;
import org.apache.druid.java.util.common.io.smoosh.SmooshedWriter;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.segment.IndexSpec;
import org.apache.druid.segment.data.CompressedVSizeColumnarIntsSerializer;
import org.apache.druid.segment.data.CompressionStrategy;
import org.apache.druid.segment.data.FixedIndexedIntWriter;
import org.apache.druid.segment.data.GenericIndexedWriter;
import org.apache.druid.segment.data.SingleValueColumnarIntsSerializer;
import org.apache.druid.segment.data.VSizeColumnarIntsSerializer;
import org.apache.druid.segment.serde.DictionaryEncodedColumnPartSerde;
import org.apache.druid.segment.serde.Serializer;
import org.apache.druid.segment.writeout.SegmentWriteOutMedium;

import javax.annotation.Nullable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.WritableByteChannel;

/**
 * Base class for writer of global dictionary encoded nested field columns for {@link NestedDataColumnSerializer}.
 * Nested columns are written in multiple passes. The first pass processes the 'raw' nested data with a
 * {@link StructuredDataProcessor} which will call {@link #addValue(int, Object)} for writers of each field which is
 * present. For this type of writer, this entails building a local dictionary ({@link #localDictionary}) to map into
 * the global dictionary ({@link #globalDictionaryIdLookup}) and writes this unsorted localId to an intermediate
 * integer column, {@link #intermediateValueWriter}.
 * 

* When processing the 'raw' value column is complete, the {@link #writeTo(int, FileSmoosher)} method will sort the * local ids and write them out to a local sorted dictionary, iterate over {@link #intermediateValueWriter} swapping * the unsorted local ids with the sorted ids and writing to the compressed id column writer * {@link #encodedValueSerializer}, building the bitmap indexes along the way. * * @see ScalarDoubleFieldColumnWriter - single type double columns * @see ScalarLongFieldColumnWriter - single type long columns * @see ScalarStringFieldColumnWriter - single type string columns * @see VariantArrayFieldColumnWriter - single type array columns of double, long, or string * @see VariantFieldColumnWriter - mixed type columns of any combination */ public abstract class GlobalDictionaryEncodedFieldColumnWriter { private static final Logger log = new Logger(GlobalDictionaryEncodedFieldColumnWriter.class); protected final SegmentWriteOutMedium segmentWriteOutMedium; protected final String columnName; protected final String fieldName; protected final IndexSpec indexSpec; protected final DictionaryIdLookup globalDictionaryIdLookup; protected final LocalDimensionDictionary localDictionary = new LocalDimensionDictionary(); protected final Int2ObjectRBTreeMap arrayElements = new Int2ObjectRBTreeMap<>(); protected FixedIndexedIntWriter intermediateValueWriter; // maybe someday we allow no bitmap indexes or multi-value columns protected int flags = DictionaryEncodedColumnPartSerde.NO_FLAGS; protected DictionaryEncodedColumnPartSerde.VERSION version = null; protected SingleValueColumnarIntsSerializer encodedValueSerializer; protected int cursorPosition; protected GlobalDictionaryEncodedFieldColumnWriter( String columnName, String fieldName, SegmentWriteOutMedium segmentWriteOutMedium, IndexSpec indexSpec, DictionaryIdLookup globalDictionaryIdLookup ) { this.columnName = columnName; this.fieldName = fieldName; this.segmentWriteOutMedium = segmentWriteOutMedium; this.indexSpec = indexSpec; this.globalDictionaryIdLookup = globalDictionaryIdLookup; } /** * Perform any value conversion needed before looking up the global id in the value dictionary (such as null handling * stuff or array processing to add the elements to the dictionary before adding the int[] to the dictionary) */ T processValue(int row, Object value) { return (T) value; } /** * Hook to allow implementors the chance to do additional operations during {@link #addValue(int, Object)}, such as * writing an additional value column */ void writeValue(@Nullable T value) throws IOException { // do nothing, if a value column is present this method should be overridden to write the value to the serializer } /** * Find a value in {@link #globalDictionaryIdLookup} as is most appropriate to the writer type */ abstract int lookupGlobalId(T value); /** * Open the writer so that {@link #addValue(int, Object)} can be called */ public void open() throws IOException { intermediateValueWriter = new FixedIndexedIntWriter(segmentWriteOutMedium, false); intermediateValueWriter.open(); cursorPosition = 0; } /** * Add a value to the unsorted local dictionary and write to an intermediate column */ public void addValue(int row, Object val) throws IOException { if (row > cursorPosition) { fillNull(row); } final T value = processValue(row, val); final int localId; // null is always 0 if (value == null) { localId = localDictionary.add(0); } else { final int globalId = lookupGlobalId(value); Preconditions.checkArgument(globalId >= 0, "Value [%s] is not present in global dictionary", value); localId = localDictionary.add(globalId); } intermediateValueWriter.write(localId); writeValue(value); cursorPosition++; } /** * Backfill intermediate column with null values */ private void fillNull(int row) throws IOException { final T value = processValue(row, null); final int localId = localDictionary.add(0); while (cursorPosition < row) { intermediateValueWriter.write(localId); writeValue(value); cursorPosition++; } } /** * How many bytes {@link #writeColumnTo(WritableByteChannel, FileSmoosher)} is expected to write to the segment. */ long getSerializedColumnSize() throws IOException { return Integer.BYTES + Integer.BYTES + encodedValueSerializer.getSerializedSize(); } /** * Defines how to write the column, including the dictionary id column, along with any additional columns * such as the long or double value column as type appropriate. */ abstract void writeColumnTo(WritableByteChannel channel, FileSmoosher smoosher) throws IOException; public void writeTo(int finalRowCount, FileSmoosher smoosher) throws IOException { if (finalRowCount > cursorPosition) { fillNull(finalRowCount); } // use a child writeout medium so that we can close them when we are finished and don't leave temporary files // hanging out until the entire segment is done final SegmentWriteOutMedium tmpWriteoutMedium = segmentWriteOutMedium.makeChildWriteOutMedium(); final FixedIndexedIntWriter sortedDictionaryWriter = new FixedIndexedIntWriter(tmpWriteoutMedium, true); sortedDictionaryWriter.open(); final FixedIndexedIntWriter arrayElementDictionaryWriter = new FixedIndexedIntWriter(tmpWriteoutMedium, true); arrayElementDictionaryWriter.open(); GenericIndexedWriter bitmapIndexWriter = new GenericIndexedWriter<>( tmpWriteoutMedium, columnName, indexSpec.getBitmapSerdeFactory().getObjectStrategy() ); bitmapIndexWriter.open(); bitmapIndexWriter.setObjectsNotSorted(); GenericIndexedWriter arrayElementIndexWriter = new GenericIndexedWriter<>( tmpWriteoutMedium, columnName, indexSpec.getBitmapSerdeFactory().getObjectStrategy() ); arrayElementIndexWriter.open(); arrayElementIndexWriter.setObjectsNotSorted(); final Int2IntOpenHashMap globalToUnsorted = localDictionary.getGlobalIdToLocalId(); final int[] unsortedToGlobal = new int[localDictionary.size()]; for (int key : globalToUnsorted.keySet()) { unsortedToGlobal[globalToUnsorted.get(key)] = key; } final int[] sortedGlobal = new int[unsortedToGlobal.length]; System.arraycopy(unsortedToGlobal, 0, sortedGlobal, 0, unsortedToGlobal.length); IntArrays.unstableSort(sortedGlobal); final int[] unsortedToSorted = new int[unsortedToGlobal.length]; final MutableBitmap[] bitmaps = new MutableBitmap[sortedGlobal.length]; for (int index = 0; index < sortedGlobal.length; index++) { final int globalId = sortedGlobal[index]; sortedDictionaryWriter.write(globalId); final int unsortedId = globalToUnsorted.get(globalId); unsortedToSorted[unsortedId] = index; bitmaps[index] = indexSpec.getBitmapSerdeFactory().getBitmapFactory().makeEmptyMutableBitmap(); } for (Int2ObjectMap.Entry arrayElement : arrayElements.int2ObjectEntrySet()) { arrayElementDictionaryWriter.write(arrayElement.getIntKey()); arrayElementIndexWriter.write( indexSpec.getBitmapSerdeFactory().getBitmapFactory().makeImmutableBitmap(arrayElement.getValue()) ); } openColumnSerializer(tmpWriteoutMedium, sortedGlobal[sortedGlobal.length - 1]); final IntIterator rows = intermediateValueWriter.getIterator(); int rowCount = 0; while (rows.hasNext()) { final int unsortedLocalId = rows.nextInt(); final int sortedLocalId = unsortedToSorted[unsortedLocalId]; encodedValueSerializer.addValue(sortedLocalId); bitmaps[sortedLocalId].add(rowCount++); } for (int i = 0; i < bitmaps.length; i++) { final MutableBitmap bitmap = bitmaps[i]; bitmapIndexWriter.write( indexSpec.getBitmapSerdeFactory().getBitmapFactory().makeImmutableBitmap(bitmap) ); bitmaps[i] = null; // Reclaim memory } final Serializer fieldSerializer = new Serializer() { @Override public long getSerializedSize() throws IOException { final long arraySize; if (arrayElements.size() > 0) { arraySize = arrayElementDictionaryWriter.getSerializedSize() + arrayElementIndexWriter.getSerializedSize(); } else { arraySize = 0; } return 1 + Integer.BYTES + sortedDictionaryWriter.getSerializedSize() + bitmapIndexWriter.getSerializedSize() + arraySize + getSerializedColumnSize(); } @Override public void writeTo(WritableByteChannel channel, FileSmoosher smoosher) throws IOException { Channels.writeFully(channel, ByteBuffer.wrap(new byte[]{version.asByte()})); channel.write(ByteBuffer.wrap(Ints.toByteArray(flags))); sortedDictionaryWriter.writeTo(channel, smoosher); writeColumnTo(channel, smoosher); bitmapIndexWriter.writeTo(channel, smoosher); if (arrayElements.size() > 0) { arrayElementDictionaryWriter.writeTo(channel, smoosher); arrayElementIndexWriter.writeTo(channel, smoosher); } } }; final String fieldFileName = NestedCommonFormatColumnSerializer.getInternalFileName(columnName, fieldName); final long size = fieldSerializer.getSerializedSize(); log.debug("Column [%s] serializing [%s] field of size [%d].", columnName, fieldName, size); try (SmooshedWriter smooshChannel = smoosher.addWithSmooshedWriter(fieldFileName, size)) { fieldSerializer.writeTo(smooshChannel, smoosher); } finally { tmpWriteoutMedium.close(); } } private void openColumnSerializer(SegmentWriteOutMedium medium, int maxId) throws IOException { if (indexSpec.getDimensionCompression() != CompressionStrategy.UNCOMPRESSED) { this.version = DictionaryEncodedColumnPartSerde.VERSION.COMPRESSED; encodedValueSerializer = CompressedVSizeColumnarIntsSerializer.create( fieldName, medium, columnName, maxId, indexSpec.getDimensionCompression() ); } else { encodedValueSerializer = new VSizeColumnarIntsSerializer(medium, maxId); this.version = DictionaryEncodedColumnPartSerde.VERSION.UNCOMPRESSED_SINGLE_VALUE; } encodedValueSerializer.open(); } public void writeLongAndDoubleColumnLength(WritableByteChannel channel, int longLength, int doubleLength) throws IOException { ByteBuffer intBuffer = ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.nativeOrder()); intBuffer.position(0); intBuffer.putInt(longLength); intBuffer.flip(); Channels.writeFully(channel, intBuffer); intBuffer.position(0); intBuffer.limit(intBuffer.capacity()); intBuffer.putInt(doubleLength); intBuffer.flip(); Channels.writeFully(channel, intBuffer); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy