org.apache.druid.segment.IndexMerger Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of druid-processing Show documentation
A module that is everything required to understands Druid Segments
There is a newer version: 31.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.segment;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.inject.ImplementedBy;
import org.apache.druid.common.utils.SerializerUtils;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.java.util.common.guava.Comparators;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.segment.incremental.IncrementalIndex;
import org.apache.druid.segment.writeout.SegmentWriteOutMediumFactory;
import org.apache.druid.utils.CollectionUtils;
import org.joda.time.Interval;

import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;

@ImplementedBy(IndexMergerV9.class)
public interface IndexMerger
{
  Logger log = new Logger(IndexMerger.class);

  SerializerUtils SERIALIZER_UTILS = new SerializerUtils();
  int INVALID_ROW = -1;
  int UNLIMITED_MAX_COLUMNS_TO_MERGE = -1;

  static List getMergedDimensionsFromQueryableIndexes(
      List indexes,
      @Nullable DimensionsSpec dimensionsSpec
  )
  {
    return getMergedDimensions(toIndexableAdapters(indexes), dimensionsSpec);
  }

  static List toIndexableAdapters(List indexes)
  {
    return indexes.stream().map(QueryableIndexIndexableAdapter::new).collect(Collectors.toList());
  }

  static List getMergedDimensions(
      List indexes,
      @Nullable DimensionsSpec dimensionsSpec
  )
  {
    if (indexes.size() == 0) {
      return ImmutableList.of();
    }
    List commonDimOrder = getLongestSharedDimOrder(indexes, dimensionsSpec);
    if (commonDimOrder == null) {
      log.warn("Indexes have incompatible dimension orders and there is no valid dimension ordering"
               + " in the ingestionSpec, using lexicographic order.");
      return getLexicographicMergedDimensions(indexes);
    } else {
      return commonDimOrder;
    }
  }

  @Nullable
  static List getLongestSharedDimOrder(
      List indexes,
      @Nullable DimensionsSpec dimensionsSpec
  )
  {
    int maxSize = 0;
    Iterable orderingCandidate = null;
    for (IndexableAdapter index : indexes) {
      int iterSize = index.getDimensionNames().size();
      if (iterSize > maxSize) {
        maxSize = iterSize;
        orderingCandidate = index.getDimensionNames();
      }
    }

    if (orderingCandidate == null) {
      return null;
    }

    if (isDimensionOrderingValid(indexes, orderingCandidate)) {
      return ImmutableList.copyOf(orderingCandidate);
    } else {
      log.info("Indexes have incompatible dimension orders, try falling back on dimension ordering from ingestionSpec");
      // Check if there is a valid dimension ordering in the ingestionSpec to fall back on
      if (dimensionsSpec == null || CollectionUtils.isNullOrEmpty(dimensionsSpec.getDimensionNames())) {
        log.info("Cannot fall back on dimension ordering from ingestionSpec as it does not exist");
        return null;
      }
      List candidate = new ArrayList<>(dimensionsSpec.getDimensionNames());
      // Remove all dimensions that does not exist within the indexes from the candidate
      Set allValidDimensions = indexes.stream()
                                         .flatMap(indexableAdapter -> indexableAdapter.getDimensionNames().stream())
                                         .collect(Collectors.toSet());
      candidate.retainAll(allValidDimensions);
      // Sanity check that there is no extra/missing columns
      if (candidate.size() != allValidDimensions.size()) {
        log.error("Dimension mismatched between ingestionSpec and indexes. ingestionSpec[%s] indexes[%s]",
                  candidate,
                  allValidDimensions);
        return null;
      }

      // Sanity check that all indexes dimension ordering is the same as the ordering in candidate
      if (!isDimensionOrderingValid(indexes, candidate)) {
        log.error("Dimension from ingestionSpec has invalid ordering");
        return null;
      }
      log.info("Dimension ordering from ingestionSpec is valid. Fall back on dimension ordering [%s]", candidate);
      return candidate;
    }
  }

  static boolean isDimensionOrderingValid(List indexes, Iterable orderingCandidate)
  {
    for (IndexableAdapter index : indexes) {
      Iterator candidateIter = orderingCandidate.iterator();
      for (String matchDim : index.getDimensionNames()) {
        boolean matched = false;
        while (candidateIter.hasNext()) {
          String nextDim = candidateIter.next();
          if (matchDim.equals(nextDim)) {
            matched = true;
            break;
          }
        }
        if (!matched) {
          return false;
        }
      }
    }
    return true;
  }

  static List getLexicographicMergedDimensions(List indexes)
  {
    return mergeIndexed(
        Lists.transform(
            indexes,
            new Function>()
            {
              @Override
              public Iterable apply(@Nullable IndexableAdapter input)
              {
                return input.getDimensionNames();
              }
            }
        )
    );
  }

  static > ArrayList mergeIndexed(List> indexedLists)
  {
    Set retVal = new TreeSet<>(Comparators.naturalNullsFirst());

    for (Iterable indexedList : indexedLists) {
      for (T val : indexedList) {
        retVal.add(val);
      }
    }

    return Lists.newArrayList(retVal);
  }

  /**
   * Equivalent to {@link #persist(IncrementalIndex, Interval, File, IndexSpec, ProgressIndicator, SegmentWriteOutMediumFactory)}
   * without a progress indicator and with interval set to {@link IncrementalIndex#getInterval()}.
   */
  @VisibleForTesting
  default File persist(
      IncrementalIndex index,
      File outDir,
      IndexSpec indexSpec,
      @Nullable SegmentWriteOutMediumFactory segmentWriteOutMediumFactory
  ) throws IOException
  {
    return persist(
        index,
        index.getInterval(),
        outDir,
        indexSpec,
        new BaseProgressIndicator(),
        segmentWriteOutMediumFactory
    );
  }

  /**
   * Equivalent to {@link #persist(IncrementalIndex, Interval, File, IndexSpec, ProgressIndicator, SegmentWriteOutMediumFactory)}
   * without a progress indicator.
   */
  default File persist(
      IncrementalIndex index,
      Interval dataInterval,
      File outDir,
      IndexSpec indexSpec,
      @Nullable SegmentWriteOutMediumFactory segmentWriteOutMediumFactory
  ) throws IOException
  {
    return persist(index, dataInterval, outDir, indexSpec, new BaseProgressIndicator(), segmentWriteOutMediumFactory);
  }

  /**
   * Persist an IncrementalIndex to disk in such a way that it can be loaded back up as a {@link QueryableIndex}.
   *
   * This is *not* thread-safe and havoc will ensue if this is called and writes are still occurring on the
   * IncrementalIndex object.
   *
   * @param index                        the IncrementalIndex to persist
   * @param dataInterval                 the Interval that the data represents. Typically, this is the same as the
   *                                     interval from the corresponding {@link org.apache.druid.timeline.SegmentId}.
   * @param outDir                       the directory to persist the data to
   * @param indexSpec                    storage and compression options
   * @param progress                     an object that will receive progress updates
   * @param segmentWriteOutMediumFactory controls allocation of temporary data structures
   *
   * @return the index output directory
   *
   * @throws IOException if an IO error occurs persisting the index
   */
  File persist(
      IncrementalIndex index,
      Interval dataInterval,
      File outDir,
      IndexSpec indexSpec,
      ProgressIndicator progress,
      @Nullable SegmentWriteOutMediumFactory segmentWriteOutMediumFactory
  ) throws IOException;

  /**
   * Merge a collection of {@link QueryableIndex}.
   *
   * Only used as a convenience method in tests. In production code, use the full version
   * {@link #mergeQueryableIndex(List, boolean, AggregatorFactory[], DimensionsSpec, File, IndexSpec, IndexSpec, ProgressIndicator, SegmentWriteOutMediumFactory, int)}.
   */
  @VisibleForTesting
  default File mergeQueryableIndex(
      List indexes,
      boolean rollup,
      AggregatorFactory[] metricAggs,
      File outDir,
      IndexSpec indexSpec,
      @Nullable SegmentWriteOutMediumFactory segmentWriteOutMediumFactory,
      int maxColumnsToMerge
  ) throws IOException
  {
    return mergeQueryableIndex(
        indexes,
        rollup,
        metricAggs,
        null,
        outDir,
        indexSpec,
        indexSpec,
        new BaseProgressIndicator(),
        segmentWriteOutMediumFactory,
        maxColumnsToMerge
    );
  }

  /**
   * Merge a collection of {@link QueryableIndex}.
   */
  File mergeQueryableIndex(
      List indexes,
      boolean rollup,
      AggregatorFactory[] metricAggs,
      @Nullable DimensionsSpec dimensionsSpec,
      File outDir,
      IndexSpec indexSpec,
      IndexSpec indexSpecForIntermediatePersists,
      ProgressIndicator progress,
      @Nullable SegmentWriteOutMediumFactory segmentWriteOutMediumFactory,
      int maxColumnsToMerge
  ) throws IOException;

  /**
   * Only used as a convenience method in tests.
   *
   * In production code, to merge multiple {@link QueryableIndex}, use
   * {@link #mergeQueryableIndex(List, boolean, AggregatorFactory[], DimensionsSpec, File, IndexSpec, IndexSpec, ProgressIndicator, SegmentWriteOutMediumFactory, int)}.
   * To merge multiple {@link IncrementalIndex}, call one of the {@link #persist} methods and then merge the resulting
   * {@link QueryableIndex}.
   */
  @VisibleForTesting
  File merge(
      List indexes,
      boolean rollup,
      AggregatorFactory[] metricAggs,
      File outDir,
      DimensionsSpec dimensionsSpec,
      IndexSpec indexSpec,
      int maxColumnsToMerge
  ) throws IOException;

  /**
   * This method applies {@link DimensionMerger#convertSortedSegmentRowValuesToMergedRowValues(int, ColumnValueSelector)} to
   * all dimension column selectors of the given sourceRowIterator, using the given index number.
   */
  static TransformableRowIterator toMergedIndexRowIterator(
      TransformableRowIterator sourceRowIterator,
      int indexNumber,
      final List mergers
  )
  {
    RowPointer sourceRowPointer = sourceRowIterator.getPointer();
    TimeAndDimsPointer markedSourceRowPointer = sourceRowIterator.getMarkedPointer();
    boolean anySelectorChanged = false;
    ColumnValueSelector[] convertedDimensionSelectors = new ColumnValueSelector[mergers.size()];
    ColumnValueSelector[] convertedMarkedDimensionSelectors = new ColumnValueSelector[mergers.size()];
    for (int i = 0; i < mergers.size(); i++) {
      ColumnValueSelector sourceDimensionSelector = sourceRowPointer.getDimensionSelector(i);
      ColumnValueSelector convertedDimensionSelector =
          mergers.get(i).convertSortedSegmentRowValuesToMergedRowValues(indexNumber, sourceDimensionSelector);
      convertedDimensionSelectors[i] = convertedDimensionSelector;
      // convertedDimensionSelector could be just the same object as sourceDimensionSelector, it means that this
      // type of column doesn't have any kind of special per-index encoding that needs to be converted to the "global"
      // encoding. E. g. it's always true for subclasses of NumericDimensionMergerV9.
      //noinspection ObjectEquality
      anySelectorChanged |= convertedDimensionSelector != sourceDimensionSelector;

      convertedMarkedDimensionSelectors[i] = mergers.get(i).convertSortedSegmentRowValuesToMergedRowValues(
          indexNumber,
          markedSourceRowPointer.getDimensionSelector(i)
      );
    }
    // If none dimensions are actually converted, don't need to transform the sourceRowIterator, adding extra
    // indirection layer. It could be just returned back from this method.
    if (!anySelectorChanged) {
      return sourceRowIterator;
    }
    return makeRowIteratorWithConvertedDimensionColumns(
        sourceRowIterator,
        convertedDimensionSelectors,
        convertedMarkedDimensionSelectors
    );
  }

  static TransformableRowIterator makeRowIteratorWithConvertedDimensionColumns(
      TransformableRowIterator sourceRowIterator,
      ColumnValueSelector[] convertedDimensionSelectors,
      ColumnValueSelector[] convertedMarkedDimensionSelectors
  )
  {
    RowPointer convertedRowPointer = sourceRowIterator.getPointer().withDimensionSelectors(convertedDimensionSelectors);
    TimeAndDimsPointer convertedMarkedRowPointer =
        sourceRowIterator.getMarkedPointer().withDimensionSelectors(convertedMarkedDimensionSelectors);
    return new ForwardingRowIterator(sourceRowIterator)
    {
      @Override
      public RowPointer getPointer()
      {
        return convertedRowPointer;
      }

      @Override
      public TimeAndDimsPointer getMarkedPointer()
      {
        return convertedMarkedRowPointer;
      }
    };
  }
}