org.apache.druid.segment.incremental.IncrementalIndex Maven / Gradle / Ivy
A module that is everything required to understand Druid Segments
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.segment.incremental;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import com.google.errorprone.annotations.concurrent.GuardedBy;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.ListBasedInputRow;
import org.apache.druid.data.input.MapBasedInputRow;
import org.apache.druid.data.input.Row;
import org.apache.druid.data.input.impl.DimensionSchema;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.SpatialDimensionSchema;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.parsers.ParseException;
import org.apache.druid.java.util.common.parsers.UnparseableColumnsParseException;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.aggregation.PostAggregator;
import org.apache.druid.query.dimension.DimensionSpec;
import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
import org.apache.druid.segment.ColumnInspector;
import org.apache.druid.segment.ColumnSelectorFactory;
import org.apache.druid.segment.ColumnValueSelector;
import org.apache.druid.segment.DimensionHandler;
import org.apache.druid.segment.DimensionHandlerUtils;
import org.apache.druid.segment.DimensionIndexer;
import org.apache.druid.segment.DimensionSelector;
import org.apache.druid.segment.DoubleColumnSelector;
import org.apache.druid.segment.EncodedKeyComponent;
import org.apache.druid.segment.FloatColumnSelector;
import org.apache.druid.segment.LongColumnSelector;
import org.apache.druid.segment.Metadata;
import org.apache.druid.segment.NestedCommonFormatColumnHandler;
import org.apache.druid.segment.NilColumnValueSelector;
import org.apache.druid.segment.ObjectColumnSelector;
import org.apache.druid.segment.RowAdapters;
import org.apache.druid.segment.RowBasedColumnSelectorFactory;
import org.apache.druid.segment.VirtualColumns;
import org.apache.druid.segment.column.CapabilitiesBasedFormat;
import org.apache.druid.segment.column.ColumnCapabilities;
import org.apache.druid.segment.column.ColumnCapabilitiesImpl;
import org.apache.druid.segment.column.ColumnFormat;
import org.apache.druid.segment.column.ColumnHolder;
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.column.RowSignature;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.serde.ComplexMetricExtractor;
import org.apache.druid.segment.serde.ComplexMetricSerde;
import org.apache.druid.segment.serde.ComplexMetrics;
import org.apache.druid.segment.transform.TransformedInputRow;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import javax.annotation.Nullable;
import java.io.Closeable;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
/**
* In-memory, row-based data structure used to hold data during ingestion. Realtime tasks query this index using
* {@link IncrementalIndexStorageAdapter}.
*
* Concurrency model: {@link #add(InputRow)} and {@link #add(InputRow, boolean)} are not thread-safe, and must be
* called from a single thread or externally synchronized. However, the methods that support
* {@link IncrementalIndexStorageAdapter} are thread-safe, and may be called concurrently with each other, and with
* the "add" methods. This concurrency model supports real-time queries of the data in the index.
*/
public abstract class IncrementalIndex implements Iterable<Row>, Closeable, ColumnInspector
{
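// Usage sketch (illustrative addition, not part of the original source). Per the concurrency contract in the
// class javadoc, a single ingestion thread appends rows while query threads read concurrently through
// IncrementalIndexStorageAdapter. "inputRow" is a placeholder; a concrete subclass such as OnheapIncrementalIndex
// supplies the actual instance.
//
//   IncrementalIndex index = ...;                         // obtained from a concrete subclass (assumption)
//   index.add(inputRow);                                  // writer: one thread only, or externally synchronized
//   ColumnCapabilities caps = index.getColumnCapabilities("someDim");   // readers: safe concurrently with add()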
/**
* Column selector used at ingestion time for inputs to aggregators.
*
* @param virtualColumns virtual columns
* @param inputRowHolder ingestion-time input row holder
* @param agg the aggregator, or null to make a generic aggregator. Only required if the agg has
* {@link AggregatorFactory#getIntermediateType()} as {@link ValueType#COMPLEX}, because
* in this case we need to do some magic to ensure the correct values show up.
*
* @return column selector factory
*/
public static ColumnSelectorFactory makeColumnSelectorFactory(
final VirtualColumns virtualColumns,
final InputRowHolder inputRowHolder,
@Nullable final AggregatorFactory agg
)
{
// we use RowSignature.empty() because ColumnInspector here should be the InputRow schema, not the
// IncrementalIndex schema, because we are reading values from the InputRow
final RowBasedColumnSelectorFactory<InputRow> baseSelectorFactory = new RowBasedColumnSelectorFactory<>(
inputRowHolder::getRow,
inputRowHolder::getRowId,
RowAdapters.standardRow(),
RowSignature.empty(),
true,
true
);
class IncrementalIndexInputRowColumnSelectorFactory implements ColumnSelectorFactory
{
@Override
public ColumnValueSelector<?> makeColumnValueSelector(final String column)
{
final ColumnValueSelector selector = baseSelectorFactory.makeColumnValueSelector(column);
if (agg == null || !agg.getIntermediateType().is(ValueType.COMPLEX)) {
return selector;
} else {
// Wrap selector in a special one that uses ComplexMetricSerde to modify incoming objects.
// For complex aggregators that read from multiple columns, we wrap all of them. This is not ideal but it
// has worked so far.
final String complexTypeName = agg.getIntermediateType().getComplexTypeName();
final ComplexMetricSerde serde = ComplexMetrics.getSerdeForType(complexTypeName);
if (serde == null) {
throw new ISE("Don't know how to handle type[%s]", complexTypeName);
}
final ComplexMetricExtractor extractor = serde.getExtractor();
return new ColumnValueSelector()
{
@Override
public boolean isNull()
{
return selector.isNull();
}
@Override
public long getLong()
{
return selector.getLong();
}
@Override
public float getFloat()
{
return selector.getFloat();
}
@Override
public double getDouble()
{
return selector.getDouble();
}
@Override
public Class classOfObject()
{
return extractor.extractedClass();
}
@Nullable
@Override
public Object getObject()
{
// Here is where the magic happens: read from "in" directly, don't go through the normal "selector".
return extractor.extractValue(inputRowHolder.getRow(), column, agg);
}
@Override
public void inspectRuntimeShape(RuntimeShapeInspector inspector)
{
inspector.visit("inputRowHolder", inputRowHolder);
inspector.visit("selector", selector);
inspector.visit("extractor", extractor);
}
};
}
}
@Override
public DimensionSelector makeDimensionSelector(DimensionSpec dimensionSpec)
{
return baseSelectorFactory.makeDimensionSelector(dimensionSpec);
}
@Nullable
@Override
public ColumnCapabilities getColumnCapabilities(String columnName)
{
return baseSelectorFactory.getColumnCapabilities(columnName);
}
}
return virtualColumns.wrap(new IncrementalIndexInputRowColumnSelectorFactory());
}
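// Illustrative sketch (not part of the original source). For an aggregator whose intermediate type is COMPLEX,
// the factory returned above wraps the row-based selector so that getObject() calls
// ComplexMetricExtractor.extractValue() on the current InputRow instead. "thetaSketchAggFactory" and "theta_col"
// are hypothetical names used only for illustration.
//
//   ColumnSelectorFactory csf =
//       makeColumnSelectorFactory(VirtualColumns.EMPTY, inputRowHolder, thetaSketchAggFactory);
//   Object complexValue = csf.makeColumnValueSelector("theta_col").getObject();  // built by the serde's extractor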
private final long minTimestamp;
private final Granularity gran;
private final boolean rollup;
private final List<Function<InputRow, InputRow>> rowTransformers;
private final VirtualColumns virtualColumns;
private final AggregatorFactory[] metrics;
private final Metadata metadata;
protected final boolean preserveExistingMetrics;
private final Map<String, MetricDesc> metricDescs;
private final DimensionsSpec dimensionsSpec;
private final Map<String, DimensionDesc> dimensionDescs;
private final List<DimensionDesc> dimensionDescsList;
// dimension capabilities are provided by the indexers
private final Map<String, ColumnCapabilities> timeAndMetricsColumnCapabilities;
private final Map<String, ColumnFormat> timeAndMetricsColumnFormats;
private final AtomicInteger numEntries = new AtomicInteger();
private final AtomicLong bytesInMemory = new AtomicLong();
private final boolean useMaxMemoryEstimates;
private final boolean useSchemaDiscovery;
private final InputRowHolder inputRowHolder = new InputRowHolder();
private volatile DateTime maxIngestedEventTime;
/**
* @param incrementalIndexSchema the schema to use for incremental index
* @param preserveExistingMetrics When set to true, for any row that already has the metric
* (with the same name defined in metricSpec), the metric aggregator in metricSpec
* is skipped and the existing metric is unchanged. If the row does not already have
* the metric, then the metric aggregator is applied on the source column as usual.
* This should only be set for DruidInputSource since that is the only case where we
* can have existing metrics. This is currently only used by auto compaction and
* should not be used for anything else.
* @param useMaxMemoryEstimates true if max values should be used to estimate memory
*/
protected IncrementalIndex(
final IncrementalIndexSchema incrementalIndexSchema,
final boolean preserveExistingMetrics,
final boolean useMaxMemoryEstimates
)
{
this.minTimestamp = incrementalIndexSchema.getMinTimestamp();
this.gran = incrementalIndexSchema.getGran();
this.rollup = incrementalIndexSchema.isRollup();
this.virtualColumns = incrementalIndexSchema.getVirtualColumns();
this.metrics = incrementalIndexSchema.getMetrics();
this.rowTransformers = new CopyOnWriteArrayList<>();
this.preserveExistingMetrics = preserveExistingMetrics;
this.useMaxMemoryEstimates = useMaxMemoryEstimates;
this.useSchemaDiscovery = incrementalIndexSchema.getDimensionsSpec()
.useSchemaDiscovery();
this.timeAndMetricsColumnCapabilities = new HashMap<>();
this.timeAndMetricsColumnFormats = new HashMap<>();
this.metricDescs = Maps.newLinkedHashMap();
this.dimensionDescs = Maps.newLinkedHashMap();
this.metadata = new Metadata(
null,
getCombiningAggregators(metrics),
incrementalIndexSchema.getTimestampSpec(),
this.gran,
this.rollup
);
initAggs(metrics, inputRowHolder);
for (AggregatorFactory metric : metrics) {
MetricDesc metricDesc = new MetricDesc(metricDescs.size(), metric);
metricDescs.put(metricDesc.getName(), metricDesc);
final ColumnCapabilities capabilities = metricDesc.getCapabilities();
timeAndMetricsColumnCapabilities.put(metricDesc.getName(), capabilities);
if (capabilities.is(ValueType.COMPLEX)) {
timeAndMetricsColumnFormats.put(
metricDesc.getName(),
new CapabilitiesBasedFormat(
ColumnCapabilitiesImpl.snapshot(
ColumnCapabilitiesImpl.copyOf(capabilities).setType(ColumnType.ofComplex(metricDesc.getType())),
ColumnCapabilitiesImpl.ALL_FALSE
)
)
);
} else {
timeAndMetricsColumnFormats.put(
metricDesc.getName(),
new CapabilitiesBasedFormat(
ColumnCapabilitiesImpl.snapshot(capabilities, ColumnCapabilitiesImpl.ALL_FALSE)
)
);
}
}
this.dimensionsSpec = incrementalIndexSchema.getDimensionsSpec();
this.dimensionDescsList = new ArrayList<>();
for (DimensionSchema dimSchema : dimensionsSpec.getDimensions()) {
addNewDimension(dimSchema.getName(), dimSchema.getDimensionHandler());
}
//__time capabilities
timeAndMetricsColumnCapabilities.put(
ColumnHolder.TIME_COLUMN_NAME,
ColumnCapabilitiesImpl.createSimpleNumericColumnCapabilities(ColumnType.LONG)
);
// This should really be more generic
List<SpatialDimensionSchema> spatialDimensions = dimensionsSpec.getSpatialDimensions();
if (!spatialDimensions.isEmpty()) {
this.rowTransformers.add(new SpatialDimensionRowTransformer(spatialDimensions));
}
}
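// Construction sketch (illustrative; assumes the IncrementalIndexSchema.Builder API available elsewhere in Druid).
// Concrete subclasses pass an IncrementalIndexSchema describing the timestamp spec, dimensions, metrics,
// granularity and rollup flag that this constructor reads.
//
//   IncrementalIndexSchema schema = new IncrementalIndexSchema.Builder()
//       .withMetrics(new CountAggregatorFactory("count"))   // CountAggregatorFactory: assumed on the classpath
//       .withRollup(true)
//       .build();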
public abstract FactsHolder getFacts();
public abstract boolean canAppendRow();
public abstract String getOutOfRowsReason();
protected abstract void initAggs(
AggregatorFactory[] metrics,
InputRowHolder rowSupplier
);
// Note: This method does not need to be thread safe.
protected abstract AddToFactsResult addToFacts(
IncrementalIndexRow key,
InputRowHolder inputRowHolder,
boolean skipMaxRowsInMemoryCheck
) throws IndexSizeExceededException;
public abstract int getLastRowIndex();
protected abstract float getMetricFloatValue(int rowOffset, int aggOffset);
protected abstract long getMetricLongValue(int rowOffset, int aggOffset);
protected abstract Object getMetricObjectValue(int rowOffset, int aggOffset);
protected abstract double getMetricDoubleValue(int rowOffset, int aggOffset);
protected abstract boolean isNull(int rowOffset, int aggOffset);
static class IncrementalIndexRowResult
{
private final IncrementalIndexRow incrementalIndexRow;
private final List<String> parseExceptionMessages;
IncrementalIndexRowResult(IncrementalIndexRow incrementalIndexRow, List<String> parseExceptionMessages)
{
this.incrementalIndexRow = incrementalIndexRow;
this.parseExceptionMessages = parseExceptionMessages;
}
IncrementalIndexRow getIncrementalIndexRow()
{
return incrementalIndexRow;
}
List<String> getParseExceptionMessages()
{
return parseExceptionMessages;
}
}
static class AddToFactsResult
{
private final int rowCount;
private final long bytesInMemory;
private final List<String> parseExceptionMessages;
public AddToFactsResult(
int rowCount,
long bytesInMemory,
List<String> parseExceptionMessages
)
{
this.rowCount = rowCount;
this.bytesInMemory = bytesInMemory;
this.parseExceptionMessages = parseExceptionMessages;
}
int getRowCount()
{
return rowCount;
}
public long getBytesInMemory()
{
return bytesInMemory;
}
public List<String> getParseExceptionMessages()
{
return parseExceptionMessages;
}
}
public static class InputRowHolder
{
@Nullable
private InputRow row;
private long rowId = -1;
public void set(final InputRow row)
{
this.row = row;
this.rowId++;
}
public void unset()
{
this.row = null;
}
public InputRow getRow()
{
return Preconditions.checkNotNull(row, "row");
}
public long getRowId()
{
return rowId;
}
}
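// InputRowHolder lifecycle sketch (illustrative): rowId starts at -1 and is incremented on every set(), so the
// first row gets id 0; calling getRow() after unset() fails the Preconditions.checkNotNull check.
//
//   InputRowHolder holder = new InputRowHolder();
//   holder.set(firstRow);    // getRowId() == 0
//   holder.set(secondRow);   // getRowId() == 1
//   holder.unset();          // getRow() would now throw a NullPointerException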
public boolean isRollup()
{
return rollup;
}
@Override
public void close()
{
}
public InputRow formatRow(InputRow row)
{
for (Function<InputRow, InputRow> rowTransformer : rowTransformers) {
row = rowTransformer.apply(row);
}
if (row == null) {
throw new IAE("Row is null? How can this be?!");
}
return row;
}
public Map<String, ColumnFormat> getColumnFormats()
{
ImmutableMap.Builder<String, ColumnFormat> builder = ImmutableMap.builder();
synchronized (dimensionDescs) {
timeAndMetricsColumnFormats.forEach(builder::put);
dimensionDescs.forEach((dimension, desc) -> builder.put(dimension, desc.getIndexer().getFormat()));
}
return builder.build();
}
@Nullable
@Override
public ColumnCapabilities getColumnCapabilities(String columnName)
{
if (timeAndMetricsColumnCapabilities.containsKey(columnName)) {
return timeAndMetricsColumnCapabilities.get(columnName);
}
synchronized (dimensionDescs) {
final DimensionDesc desc = dimensionDescs.get(columnName);
return desc != null ? desc.getCapabilities() : null;
}
}
@Nullable
public ColumnFormat getColumnFormat(String columnName)
{
if (timeAndMetricsColumnFormats.containsKey(columnName)) {
return timeAndMetricsColumnFormats.get(columnName);
}
synchronized (dimensionDescs) {
final DimensionDesc desc = dimensionDescs.get(columnName);
return desc != null ? desc.getIndexer().getFormat() : null;
}
}
/**
* Adds a new row. The row might correspond with another row that already exists, in which case this will
* update that row instead of inserting a new one.
*
* Not thread-safe.
*
* @param row the row of data to add
*
* @return the number of rows in the data set after adding the InputRow. If any parse failure occurs, a {@link ParseException} is returned in {@link IncrementalIndexAddResult}.
*
* @throws IndexSizeExceededException thrown once the index reaches the max rows limit and skipMaxRowsInMemoryCheck is set to false.
*/
public IncrementalIndexAddResult add(InputRow row) throws IndexSizeExceededException
{
return add(row, false);
}
/**
* Adds a new row. The row might correspond with another row that already exists, in which case this will
* update that row instead of inserting a new one.
*
* Not thread-safe.
*
* @param row the row of data to add
* @param skipMaxRowsInMemoryCheck whether to skip the check that the index has exceeded the max rows or max bytes in memory limit
*
* @return the number of rows in the data set after adding the InputRow. If any parse failure occurs, a {@link ParseException} is returned in {@link IncrementalIndexAddResult}.
*
* @throws IndexSizeExceededException thrown once the index reaches the max rows limit and skipMaxRowsInMemoryCheck is set to false.
*/
public IncrementalIndexAddResult add(InputRow row, boolean skipMaxRowsInMemoryCheck)
throws IndexSizeExceededException
{
IncrementalIndexRowResult incrementalIndexRowResult = toIncrementalIndexRow(row);
inputRowHolder.set(row);
final AddToFactsResult addToFactsResult = addToFacts(
incrementalIndexRowResult.getIncrementalIndexRow(),
inputRowHolder,
skipMaxRowsInMemoryCheck
);
updateMaxIngestedTime(row.getTimestamp());
@Nullable ParseException parseException = getCombinedParseException(
row,
incrementalIndexRowResult.getParseExceptionMessages(),
addToFactsResult.getParseExceptionMessages()
);
inputRowHolder.unset();
return new IncrementalIndexAddResult(
addToFactsResult.getRowCount(),
addToFactsResult.getBytesInMemory(),
parseException
);
}
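// Ingestion-loop sketch (illustrative; single writer thread as required by the class javadoc). Callers typically
// check canAppendRow() before adding and inspect the returned IncrementalIndexAddResult, which carries the row
// count, bytes in memory, and any combined ParseException. "rows" is a hypothetical iterator of InputRow.
//
//   while (rows.hasNext() && index.canAppendRow()) {
//     IncrementalIndexAddResult result = index.add(rows.next());
//     // inspect the result for parse errors and the updated row count
//   }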
@VisibleForTesting
IncrementalIndexRowResult toIncrementalIndexRow(InputRow row)
{
row = formatRow(row);
if (row.getTimestampFromEpoch() < minTimestamp) {
throw new IAE("Cannot add row[%s] because it is below the minTimestamp[%s]", row, DateTimes.utc(minTimestamp));
}
final List<String> rowDimensions = row.getDimensions();
Object[] dims;
List