org.apache.druid.segment.nested.NestedDataColumnSerializer
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.segment.nested;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.apache.druid.collections.bitmap.ImmutableBitmap;
import org.apache.druid.collections.bitmap.MutableBitmap;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.FileUtils;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.RE;
import org.apache.druid.java.util.common.io.Closer;
import org.apache.druid.java.util.common.io.smoosh.FileSmoosher;
import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
import org.apache.druid.java.util.common.io.smoosh.SmooshedWriter;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.math.expr.ExprEval;
import org.apache.druid.segment.ColumnValueSelector;
import org.apache.druid.segment.IndexMerger;
import org.apache.druid.segment.IndexSpec;
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.column.StringEncodingStrategies;
import org.apache.druid.segment.column.Types;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.data.ByteBufferWriter;
import org.apache.druid.segment.data.CompressedVariableSizedBlobColumnSerializer;
import org.apache.druid.segment.data.CompressionStrategy;
import org.apache.druid.segment.data.DictionaryWriter;
import org.apache.druid.segment.data.FixedIndexedWriter;
import org.apache.druid.segment.data.FrontCodedIntArrayIndexedWriter;
import org.apache.druid.segment.data.GenericIndexed;
import org.apache.druid.segment.data.GenericIndexedWriter;
import org.apache.druid.segment.writeout.SegmentWriteOutMedium;
import javax.annotation.Nullable;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
/**
* Serializer for {@link NestedCommonFormatColumn} which can store nested data. The serializer stores several components
* including:
* - a field list and associated type info
* - value dictionaries for string, long, double, and array values (where the arrays are stored as int[] that point to
* the string, long, and double values)
* - raw data is stored with a {@link CompressedVariableSizedBlobColumnSerializer} as blobs of SMILE encoded data
* - a null value bitmap to track which 'raw' rows are null
*
* For each nested field, a {@link GlobalDictionaryEncodedFieldColumnWriter} will write a sub-column to specialize
* fast reading and filtering of that path.
*
* @see ScalarDoubleFieldColumnWriter - single type double field
* @see ScalarLongFieldColumnWriter - single type long field
* @see ScalarStringFieldColumnWriter - single type string field
* @see VariantArrayFieldColumnWriter - single type array of string, long, and double field
* @see VariantFieldColumnWriter - mixed type field
*/
public class NestedDataColumnSerializer extends NestedCommonFormatColumnSerializer
{
private static final Logger log = new Logger(NestedDataColumnSerializer.class);
private final String name;
private final SegmentWriteOutMedium segmentWriteOutMedium;
private final IndexSpec indexSpec;
@SuppressWarnings("unused")
private final Closer closer;
private final StructuredDataProcessor fieldProcessor = new StructuredDataProcessor()
{
@Override
public ProcessedValue<?> processField(ArrayList<NestedPathPart> fieldPath, @Nullable Object fieldValue)
{
final GlobalDictionaryEncodedFieldColumnWriter<?> writer = fieldWriters.get(
NestedPathFinder.toNormalizedJsonPath(fieldPath)
);
if (writer != null) {
try {
final ExprEval<?> eval = ExprEval.bestEffortOf(fieldValue);
if (eval.type().isPrimitive() || eval.type().isPrimitiveArray()) {
writer.addValue(rowCount, eval.value());
} else {
// behave consistently with nested column indexer, which defaults to string
writer.addValue(rowCount, eval.asString());
}
// serializer doesn't use size estimate
return ProcessedValue.NULL_LITERAL;
}
catch (IOException e) {
throw new RE(e, "Failed to write field [%s], unhandled value", fieldPath);
}
}
return ProcessedValue.NULL_LITERAL;
}
@Nullable
@Override
public ProcessedValue<?> processArrayField(
ArrayList<NestedPathPart> fieldPath,
@Nullable List<?> array
)
{
final ExprEval<?> eval = ExprEval.bestEffortArray(array);
if (eval.type().isPrimitiveArray()) {
final GlobalDictionaryEncodedFieldColumnWriter<?> writer = fieldWriters.get(
NestedPathFinder.toNormalizedJsonPath(fieldPath)
);
if (writer != null) {
try {
writer.addValue(rowCount, eval.value());
// serializer doesn't use size estimate
return ProcessedValue.NULL_LITERAL;
}
catch (IOException e) {
throw new RE(e, "Failed to write field [%s] value [%s]", fieldPath, array);
}
}
}
return null;
}
};
private DictionaryIdLookup globalDictionaryIdLookup;
private SortedMap<String, FieldTypeInfo.MutableTypeSet> fields;
private GenericIndexedWriter<String> fieldsWriter;
private FieldTypeInfo.Writer fieldsInfoWriter;
private DictionaryWriter<String> dictionaryWriter;
private FixedIndexedWriter<Long> longDictionaryWriter;
private FixedIndexedWriter<Double> doubleDictionaryWriter;
private FrontCodedIntArrayIndexedWriter arrayDictionaryWriter;
private CompressedVariableSizedBlobColumnSerializer rawWriter;
private ByteBufferWriter<ImmutableBitmap> nullBitmapWriter;
private MutableBitmap nullRowsBitmap;
private Map<String, GlobalDictionaryEncodedFieldColumnWriter<?>> fieldWriters;
private int rowCount = 0;
private boolean closedForWrite = false;
private boolean dictionarySerialized = false;
private ByteBuffer columnNameBytes = null;
public NestedDataColumnSerializer(
String name,
IndexSpec indexSpec,
SegmentWriteOutMedium segmentWriteOutMedium,
Closer closer
)
{
this.name = name;
this.segmentWriteOutMedium = segmentWriteOutMedium;
this.indexSpec = indexSpec;
this.closer = closer;
}
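// A minimal usage sketch of the expected call sequence, inferred from the state checks in this
// class rather than from any documented contract; the driver-side variable names below
// (mergedFields, rowSelector, numRows, smooshChannel) are hypothetical:
//
//   NestedDataColumnSerializer serializer =
//       new NestedDataColumnSerializer("nested", indexSpec, writeOutMedium, closer);
//   serializer.openDictionaryWriter();
//   serializer.open();
//   serializer.serializeFields(mergedFields);                           // sorted field paths + type info
//   serializer.serializeDictionaries(strings, longs, doubles, arrays);  // sorted, de-duplicated values
//   for (int i = 0; i < numRows; i++) {
//     serializer.serialize(rowSelector);                                // one call per row
//   }
//   long size = serializer.getSerializedSize();                         // also finalizes column metadata
//   serializer.writeTo(smooshChannel, smoosher);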
@Override
public String getColumnName()
{
return name;
}
@Override
public DictionaryIdLookup getGlobalLookup()
{
return globalDictionaryIdLookup;
}
@Override
public boolean hasNulls()
{
return !nullRowsBitmap.isEmpty();
}
@Override
public void openDictionaryWriter() throws IOException
{
fieldsWriter = new GenericIndexedWriter<>(segmentWriteOutMedium, name, GenericIndexed.STRING_STRATEGY);
fieldsWriter.open();
fieldsInfoWriter = new FieldTypeInfo.Writer(segmentWriteOutMedium);
fieldsInfoWriter.open();
dictionaryWriter = StringEncodingStrategies.getStringDictionaryWriter(
indexSpec.getStringDictionaryEncoding(),
segmentWriteOutMedium,
name
);
dictionaryWriter.open();
longDictionaryWriter = new FixedIndexedWriter<>(
segmentWriteOutMedium,
ColumnType.LONG.getStrategy(),
ByteOrder.nativeOrder(),
Long.BYTES,
true
);
longDictionaryWriter.open();
doubleDictionaryWriter = new FixedIndexedWriter<>(
segmentWriteOutMedium,
ColumnType.DOUBLE.getStrategy(),
ByteOrder.nativeOrder(),
Double.BYTES,
true
);
doubleDictionaryWriter.open();
arrayDictionaryWriter = new FrontCodedIntArrayIndexedWriter(
segmentWriteOutMedium,
ByteOrder.nativeOrder(),
4
);
arrayDictionaryWriter.open();
globalDictionaryIdLookup = closer.register(
new DictionaryIdLookup(
name,
FileUtils.getTempDir(),
dictionaryWriter,
longDictionaryWriter,
doubleDictionaryWriter,
arrayDictionaryWriter
)
);
}
@Override
public void open() throws IOException
{
rawWriter = new CompressedVariableSizedBlobColumnSerializer(
getInternalFileName(name, RAW_FILE_NAME),
segmentWriteOutMedium,
indexSpec.getJsonCompression() != null ? indexSpec.getJsonCompression() : CompressionStrategy.LZ4
);
rawWriter.open();
nullBitmapWriter = new ByteBufferWriter<>(
segmentWriteOutMedium,
indexSpec.getBitmapSerdeFactory().getObjectStrategy()
);
nullBitmapWriter.open();
nullRowsBitmap = indexSpec.getBitmapSerdeFactory().getBitmapFactory().makeEmptyMutableBitmap();
}
@Override
public void serializeFields(SortedMap<String, FieldTypeInfo.MutableTypeSet> fields) throws IOException
{
this.fields = fields;
this.fieldWriters = Maps.newHashMapWithExpectedSize(fields.size());
int ctr = 0;
for (Map.Entry<String, FieldTypeInfo.MutableTypeSet> field : fields.entrySet()) {
final String fieldName = field.getKey();
final String fieldFileName = NESTED_FIELD_PREFIX + ctr++;
fieldsWriter.write(fieldName);
fieldsInfoWriter.write(field.getValue());
final GlobalDictionaryEncodedFieldColumnWriter<?> writer;
final ColumnType type = field.getValue().getSingleType();
if (type != null) {
if (Types.is(type, ValueType.STRING)) {
writer = new ScalarStringFieldColumnWriter(
name,
fieldFileName,
segmentWriteOutMedium,
indexSpec,
globalDictionaryIdLookup
);
} else if (Types.is(type, ValueType.LONG)) {
writer = new ScalarLongFieldColumnWriter(
name,
fieldFileName,
segmentWriteOutMedium,
indexSpec,
globalDictionaryIdLookup
);
} else if (Types.is(type, ValueType.DOUBLE)) {
writer = new ScalarDoubleFieldColumnWriter(
name,
fieldFileName,
segmentWriteOutMedium,
indexSpec,
globalDictionaryIdLookup
);
} else if (Types.is(type, ValueType.ARRAY)) {
writer = new VariantArrayFieldColumnWriter(
name,
fieldFileName,
segmentWriteOutMedium,
indexSpec,
globalDictionaryIdLookup
);
} else {
throw new ISE("Invalid field type [%s], how did this happen?", type);
}
} else {
writer = new VariantFieldColumnWriter(
name,
fieldFileName,
segmentWriteOutMedium,
indexSpec,
globalDictionaryIdLookup
);
}
writer.open();
fieldWriters.put(fieldName, writer);
}
}
@Override
public void serializeDictionaries(
Iterable<String> strings,
Iterable<Long> longs,
Iterable<Double> doubles,
Iterable<int[]> arrays
) throws IOException
{
if (dictionarySerialized) {
throw new ISE("String dictionary already serialized for column [%s], cannot serialize again", name);
}
// null is always 0
dictionaryWriter.write(null);
for (String value : strings) {
value = NullHandling.emptyToNullIfNeeded(value);
if (value == null) {
continue;
}
dictionaryWriter.write(value);
}
dictionarySerialized = true;
for (Long value : longs) {
if (value == null) {
continue;
}
longDictionaryWriter.write(value);
}
for (Double value : doubles) {
if (value == null) {
continue;
}
doubleDictionaryWriter.write(value);
}
for (int[] value : arrays) {
if (value == null) {
continue;
}
arrayDictionaryWriter.write(value);
}
dictionarySerialized = true;
}
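// Sketch of the resulting global dictionary id space, an inference from the write order above and
// from the class javadoc (the ids are illustrative; offsets between the value types are resolved
// by DictionaryIdLookup, not by this method):
//
//   0            -> null
//   1..S         -> sorted, de-duplicated non-null strings
//   S+1..S+L     -> sorted longs
//   S+L+1..S+L+D -> sorted doubles
//   then         -> arrays, each stored as an int[] of global ids for its element values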
@Override
public void serialize(ColumnValueSelector<? extends StructuredData> selector) throws IOException
{
if (!dictionarySerialized) {
throw new ISE("Must serialize value dictionaries before serializing values for column [%s]", name);
}
StructuredData data = StructuredData.wrap(selector.getObject());
if (data == null) {
nullRowsBitmap.add(rowCount);
}
rawWriter.addValue(NestedDataComplexTypeSerde.INSTANCE.toBytes(data));
if (data != null) {
fieldProcessor.processFields(data.getValue());
}
rowCount++;
}
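// Illustration of how a single row flows through serialize() (the input value is hypothetical):
// for a row such as {"x": 1, "y": ["a", "b"]}, the whole object is SMILE-encoded and appended to
// rawWriter, then fieldProcessor dispatches the leaf values to the per-field writers registered in
// serializeFields(), keyed by their normalized paths. A null row additionally sets the current row
// bit in nullRowsBitmap; the (null) raw value is still written so row numbering stays aligned.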
private void closeForWrite() throws IOException
{
if (!closedForWrite) {
closedForWrite = true;
ByteArrayOutputStream baos = new ByteArrayOutputStream();
IndexMerger.SERIALIZER_UTILS.writeString(
baos,
NestedDataComplexTypeSerde.OBJECT_MAPPER.writeValueAsString(
new NestedDataColumnMetadata(
ByteOrder.nativeOrder(),
indexSpec.getBitmapSerdeFactory(),
name,
!nullRowsBitmap.isEmpty()
)
)
);
nullBitmapWriter.write(nullRowsBitmap);
columnNameBytes = computeFilenameBytes();
}
}
@Override
public long getSerializedSize() throws IOException
{
closeForWrite();
long size = 1 + columnNameBytes.capacity();
if (fieldsWriter != null) {
size += fieldsWriter.getSerializedSize();
}
if (fieldsInfoWriter != null) {
size += fieldsInfoWriter.getSerializedSize();
}
// the value dictionaries, raw column, and null index are all stored in separate files
return size;
}
@Override
public void writeTo(
WritableByteChannel channel,
FileSmoosher smoosher
) throws IOException
{
Preconditions.checkState(closedForWrite, "Not closed yet!");
Preconditions.checkArgument(dictionaryWriter.isSorted(), "Dictionary not sorted?!?");
writeV0Header(channel, columnNameBytes);
fieldsWriter.writeTo(channel, smoosher);
fieldsInfoWriter.writeTo(channel, smoosher);
if (globalDictionaryIdLookup.getStringBufferMapper() != null) {
SmooshedFileMapper fileMapper = globalDictionaryIdLookup.getStringBufferMapper();
for (String internalName : fileMapper.getInternalFilenames()) {
smoosher.add(internalName, fileMapper.mapFile(internalName));
}
} else {
writeInternal(smoosher, dictionaryWriter, STRING_DICTIONARY_FILE_NAME);
}
if (globalDictionaryIdLookup.getLongBuffer() != null) {
writeInternal(smoosher, globalDictionaryIdLookup.getLongBuffer(), LONG_DICTIONARY_FILE_NAME);
} else {
writeInternal(smoosher, longDictionaryWriter, LONG_DICTIONARY_FILE_NAME);
}
if (globalDictionaryIdLookup.getDoubleBuffer() != null) {
writeInternal(smoosher, globalDictionaryIdLookup.getDoubleBuffer(), DOUBLE_DICTIONARY_FILE_NAME);
} else {
writeInternal(smoosher, doubleDictionaryWriter, DOUBLE_DICTIONARY_FILE_NAME);
}
if (globalDictionaryIdLookup.getArrayBuffer() != null) {
writeInternal(smoosher, globalDictionaryIdLookup.getArrayBuffer(), ARRAY_DICTIONARY_FILE_NAME);
} else {
writeInternal(smoosher, arrayDictionaryWriter, ARRAY_DICTIONARY_FILE_NAME);
}
writeInternal(smoosher, rawWriter, RAW_FILE_NAME);
if (!nullRowsBitmap.isEmpty()) {
writeInternal(smoosher, nullBitmapWriter, NULL_BITMAP_FILE_NAME);
}
// close the SmooshedWriter since we are done here, so we don't write to a temporary file per sub-column
// In the future, it would be best if the writeTo() itself didn't take a channel but was expected to actually
// open its own channels on the FileSmoosher object itself. Or some other thing that give this Serializer
// total control over when resources are opened up and when they are closed. Until then, we are stuck
// with a very tight coupling of this code with how the external "driver" is working.
if (channel instanceof SmooshedWriter) {
channel.close();
}
for (Map.Entry<String, FieldTypeInfo.MutableTypeSet> field : fields.entrySet()) {
// remove writer so that it can be collected when we are done with it
GlobalDictionaryEncodedFieldColumnWriter<?> writer = fieldWriters.remove(field.getKey());
writer.writeTo(rowCount, smoosher);
}
log.info("Column [%s] serialized successfully with [%d] nested columns.", name, fields.size());
}
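// Summary of the smooshed layout produced by writeTo(), based on the internal file names used
// above: the column header, field list, and field type info are written to the main channel, while
// the string, long, double, and array dictionaries, the raw SMILE blob column, the null bitmap
// (only when null rows exist), and one sub-column per nested field (named via NESTED_FIELD_PREFIX
// in serializeFields()) are added to the FileSmoosher as separate internal files.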
}