org.apache.druid.segment.column.StringEncodingStrategies Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of druid-processing Show documentation
A module that is everything required to understands Druid Segments
There is a newer version: 30.0.1
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.segment.column;

import com.google.common.base.Supplier;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
import org.apache.druid.segment.IndexIO;
import org.apache.druid.segment.data.DictionaryWriter;
import org.apache.druid.segment.data.EncodedStringDictionaryWriter;
import org.apache.druid.segment.data.FrontCodedIndexed;
import org.apache.druid.segment.data.FrontCodedIndexedWriter;
import org.apache.druid.segment.data.GenericIndexed;
import org.apache.druid.segment.data.GenericIndexedWriter;
import org.apache.druid.segment.data.Indexed;
import org.apache.druid.segment.writeout.SegmentWriteOutMedium;

import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Iterator;

public class StringEncodingStrategies
{
  public static DictionaryWriter getStringDictionaryWriter(
      StringEncodingStrategy encodingStrategy,
      SegmentWriteOutMedium writeoutMedium,
      String fileName
  )
  {
    // write plain utf8 in the legacy format, where generic indexed was written directly
    if (StringEncodingStrategy.UTF8.equals(encodingStrategy.getType())) {
      return new GenericIndexedWriter<>(writeoutMedium, fileName, GenericIndexed.STRING_STRATEGY);
    } else {
      // otherwise, we wrap in an EncodedStringDictionaryWriter so that we write a small header that includes
      // a version byte that should hopefully never conflict with a GenericIndexed version, along with a byte
      // from StringEncodingStrategy.getId to indicate which encoding strategy is used for the dictionary before
      // writing the dictionary itself
      DictionaryWriter writer;
      if (StringEncodingStrategy.FRONT_CODED.equals(encodingStrategy.getType())) {
        StringEncodingStrategy.FrontCoded strategy = (StringEncodingStrategy.FrontCoded) encodingStrategy;
        writer = new FrontCodedIndexedWriter(
            writeoutMedium,
            IndexIO.BYTE_ORDER,
            strategy.getBucketSize(),
            strategy.getFormatVersion()
        );
      } else {
        throw new ISE("Unknown encoding strategy: %s", encodingStrategy.getType());
      }
      return new EncodedStringDictionaryWriter(writer, encodingStrategy);
    }
  }

  public static Supplier> getStringDictionarySupplier(
      SmooshedFileMapper mapper,
      ByteBuffer stringDictionaryBuffer,
      ByteOrder byteOrder
  )
  {
    final int dictionaryStartPosition = stringDictionaryBuffer.position();
    final byte dictionaryVersion = stringDictionaryBuffer.get();

    if (dictionaryVersion == EncodedStringDictionaryWriter.VERSION) {
      final byte encodingId = stringDictionaryBuffer.get();
      if (encodingId == StringEncodingStrategy.FRONT_CODED_ID) {
        return FrontCodedIndexed.read(
            stringDictionaryBuffer,
            byteOrder
        );
      } else if (encodingId == StringEncodingStrategy.UTF8_ID) {
        // this cannot happen naturally right now since generic indexed is written in the 'legacy' format, but
        // this provides backwards compatibility should we switch at some point in the future to always
        // writing dictionaryVersion
        return GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.UTF8_STRATEGY, mapper)::singleThreaded;
      } else {
        throw new ISE("impossible, unknown encoding strategy id: %s", encodingId);
      }
    } else {
      // legacy format that only supports plain utf8 enoding stored in GenericIndexed and the byte we are reading
      // as dictionaryVersion is actually also the GenericIndexed version, so we reset start position so the
      // GenericIndexed version can be correctly read
      stringDictionaryBuffer.position(dictionaryStartPosition);
      return GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.UTF8_STRATEGY, mapper)::singleThreaded;
    }
  }

  /**
   * Adapter to convert {@link Indexed} with utf8 encoded bytes into {@link Indexed} to be friendly
   * to consumers.
   */
  public static final class Utf8ToStringIndexed implements Indexed
  {
    private final Indexed delegate;

    public Utf8ToStringIndexed(Indexed delegate)
    {
      this.delegate = delegate;
    }

    @Override
    public int size()
    {
      return delegate.size();
    }

    @Nullable
    @Override
    public String get(int index)
    {
      return StringUtils.fromUtf8Nullable(delegate.get(index));
    }

    @Override
    public int indexOf(@Nullable String value)
    {
      return delegate.indexOf(StringUtils.toUtf8ByteBuffer(value));
    }

    @Override
    public Iterator iterator()
    {
      final Iterator delegateIterator = delegate.iterator();
      return new Iterator()
      {
        @Override
        public boolean hasNext()
        {
          return delegateIterator.hasNext();
        }

        @Override
        public String next()
        {
          return StringUtils.fromUtf8Nullable(delegateIterator.next());
        }
      };
    }

    @Override
    public void inspectRuntimeShape(RuntimeShapeInspector inspector)
    {
      inspector.visit("delegateIndex", delegate);
    }
  }
}