/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.indexing.common.task;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.util.concurrent.ListenableFuture;
import org.apache.druid.data.input.InputFormat;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.InputSource;
import org.apache.druid.data.input.Rows;
import org.apache.druid.hll.HyperLogLogCollector;
import org.apache.druid.indexer.Checks;
import org.apache.druid.indexer.IngestionState;
import org.apache.druid.indexer.Property;
import org.apache.druid.indexer.TaskStatus;
import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
import org.apache.druid.indexer.partitions.PartitionsSpec;
import org.apache.druid.indexer.partitions.SecondaryPartitionType;
import org.apache.druid.indexer.report.TaskReport;
import org.apache.druid.indexing.common.TaskLockType;
import org.apache.druid.indexing.common.TaskRealtimeMetricsMonitorBuilder;
import org.apache.druid.indexing.common.TaskToolbox;
import org.apache.druid.indexing.common.actions.TaskActionClient;
import org.apache.druid.indexing.common.stats.TaskRealtimeMetricsMonitor;
import org.apache.druid.indexing.common.task.batch.parallel.PartialHashSegmentGenerateTask;
import org.apache.druid.indexing.common.task.batch.parallel.TombstoneHelper;
import org.apache.druid.indexing.common.task.batch.parallel.iterator.DefaultIndexTaskInputRowIteratorBuilder;
import org.apache.druid.indexing.common.task.batch.partition.CompletePartitionAnalysis;
import org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis;
import org.apache.druid.indexing.common.task.batch.partition.LinearPartitionAnalysis;
import org.apache.druid.indexing.common.task.batch.partition.PartitionAnalysis;
import org.apache.druid.indexing.input.DruidInputSource;
import org.apache.druid.indexing.input.TaskInputSource;
import org.apache.druid.indexing.overlord.sampler.InputSourceSampler;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.JodaUtils;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.UOE;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.Comparators;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.apache.druid.segment.IndexMerger;
import org.apache.druid.segment.IndexSpec;
import org.apache.druid.segment.SegmentSchemaMapping;
import org.apache.druid.segment.incremental.AppendableIndexSpec;
import org.apache.druid.segment.incremental.ParseExceptionHandler;
import org.apache.druid.segment.incremental.ParseExceptionReport;
import org.apache.druid.segment.incremental.RowIngestionMeters;
import org.apache.druid.segment.indexing.BatchIOConfig;
import org.apache.druid.segment.indexing.DataSchema;
import org.apache.druid.segment.indexing.IngestionSpec;
import org.apache.druid.segment.indexing.TuningConfig;
import org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec;
import org.apache.druid.segment.indexing.granularity.GranularitySpec;
import org.apache.druid.segment.realtime.ChatHandler;
import org.apache.druid.segment.realtime.SegmentGenerationMetrics;
import org.apache.druid.segment.realtime.appenderator.Appenderator;
import org.apache.druid.segment.realtime.appenderator.AppenderatorConfig;
import org.apache.druid.segment.realtime.appenderator.BaseAppenderatorDriver;
import org.apache.druid.segment.realtime.appenderator.BatchAppenderatorDriver;
import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec;
import org.apache.druid.segment.realtime.appenderator.SegmentsAndCommitMetadata;
import org.apache.druid.segment.realtime.appenderator.TransactionalSegmentPublisher;
import org.apache.druid.segment.writeout.SegmentWriteOutMediumFactory;
import org.apache.druid.server.security.Action;
import org.apache.druid.server.security.AuthorizerMapper;
import org.apache.druid.server.security.Resource;
import org.apache.druid.server.security.ResourceAction;
import org.apache.druid.server.security.ResourceType;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.partition.HashBasedNumberedShardSpec;
import org.apache.druid.timeline.partition.NumberedShardSpec;
import org.apache.druid.utils.CircularBuffer;
import org.checkerframework.checker.nullness.qual.MonotonicNonNull;
import org.joda.time.Interval;
import org.joda.time.Period;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.servlet.http.HttpServletRequest;
import javax.ws.rs.GET;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import java.util.stream.Collectors;
public class IndexTask extends AbstractBatchIndexTask implements ChatHandler, PendingSegmentAllocatingTask
{
public static final HashFunction HASH_FUNCTION = Hashing.murmur3_128();
public static final String TYPE = "index";
private static final Logger log = new Logger(IndexTask.class);
private static String makeGroupId(IndexIngestionSpec ingestionSchema, IngestionMode ingestionMode)
{
return makeGroupId(ingestionSchema.dataSchema.getDataSource(), ingestionMode);
}
private static String makeGroupId(String dataSource, IngestionMode ingestionMode)
{
if (ingestionMode == IngestionMode.APPEND) {
// Shared locking group for all tasks that append, since they are OK to run concurrently.
return StringUtils.format("%s_append_%s", TYPE, dataSource);
} else {
// Return null, one locking group per task.
return null;
}
}
private final String baseSequenceName;
private final IndexIngestionSpec ingestionSchema;
private IngestionState ingestionState;
private boolean isStandAloneTask;
@MonotonicNonNull
private ParseExceptionHandler determinePartitionsParseExceptionHandler;
@MonotonicNonNull
private ParseExceptionHandler buildSegmentsParseExceptionHandler;
@MonotonicNonNull
private AuthorizerMapper authorizerMapper;
@MonotonicNonNull
private RowIngestionMeters determinePartitionsMeters;
@MonotonicNonNull
private RowIngestionMeters buildSegmentsMeters;
@Nullable
private String errorMsg;
private TaskReport.ReportMap completionReports;
@JsonCreator
public IndexTask(
@JsonProperty("id") final String id,
@JsonProperty("resource") final TaskResource taskResource,
@JsonProperty("spec") final IndexIngestionSpec ingestionSchema,
@JsonProperty("context") final Map context
)
{
this(
id,
makeGroupId(
ingestionSchema,
computeBatchIngestionMode(ingestionSchema.getIOConfig())
),
taskResource,
ingestionSchema.dataSchema.getDataSource(),
null,
ingestionSchema,
context,
-1,
true
);
}
/**
* @param isStandAloneTask whether IndexTask.run() runs on its own or as part of another task; when this is not a
* standalone task, writing completion reports and cleanup are skipped
*/
public IndexTask(
String id,
String groupId,
TaskResource resource,
String dataSource,
@Nullable String baseSequenceName,
IndexIngestionSpec ingestionSchema,
Map<String, Object> context,
int maxAllowedLockCount,
boolean isStandAloneTask
)
{
super(
getOrMakeId(id, TYPE, dataSource),
groupId,
resource,
dataSource,
context,
maxAllowedLockCount,
computeBatchIngestionMode(ingestionSchema.getIOConfig())
);
this.baseSequenceName = baseSequenceName == null ? getId() : baseSequenceName;
this.ingestionSchema = ingestionSchema;
this.ingestionState = IngestionState.NOT_STARTED;
this.isStandAloneTask = isStandAloneTask;
}
@Override
public String getType()
{
return TYPE;
}
@Override
public boolean isReady(TaskActionClient taskActionClient) throws Exception
{
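// Only dynamic (LINEAR) and hashed (HASH) secondary partitioning are accepted here; any other partitionsSpec type is rejected up front.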
final IndexTuningConfig tuningConfig = getIngestionSchema().getTuningConfig();
if (tuningConfig != null && tuningConfig.getPartitionsSpec() != null) {
if (tuningConfig.getPartitionsSpec().getType() != SecondaryPartitionType.LINEAR
&& tuningConfig.getPartitionsSpec().getType() != SecondaryPartitionType.HASH) {
throw new UOE("partitionsSpec[%s] is not supported", tuningConfig.getPartitionsSpec().getClass().getName());
}
}
return determineLockGranularityAndTryLock(
taskActionClient,
ingestionSchema.dataSchema.getGranularitySpec().inputIntervals()
);
}
@Override
public boolean requireLockExistingSegments()
{
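// Existing segments must be locked for overwrites and for guaranteed (perfect) rollup; a best-effort append can skip it.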
return isGuaranteedRollup(getIngestionMode(), ingestionSchema.tuningConfig)
|| (getIngestionMode() != IngestionMode.APPEND);
}
@Override
public List<DataSegment> findSegmentsToLock(TaskActionClient taskActionClient, List<Interval> intervals)
throws IOException
{
return findInputSegments(
getDataSource(),
taskActionClient,
intervals
);
}
@Override
public boolean isPerfectRollup()
{
return isGuaranteedRollup(getIngestionMode(), ingestionSchema.tuningConfig);
}
@Nullable
@Override
public Granularity getSegmentGranularity()
{
final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
if (granularitySpec instanceof ArbitraryGranularitySpec) {
return null;
} else {
return granularitySpec.getSegmentGranularity();
}
}
@Override
public String getTaskAllocatorId()
{
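// The group id doubles as the pending-segment allocator id, so tasks in the same group share allocations.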
return getGroupId();
}
@Nonnull
@JsonIgnore
@Override
public Set<ResourceAction> getInputSourceResources()
{
return getIngestionSchema().getIOConfig().getInputSource() != null ?
getIngestionSchema().getIOConfig().getInputSource().getTypes()
.stream()
.map(i -> new ResourceAction(new Resource(i, ResourceType.EXTERNAL), Action.READ))
.collect(Collectors.toSet()) :
ImmutableSet.of();
}
@Nullable
@JsonIgnore
public TaskReport.ReportMap getCompletionReports()
{
return completionReports;
}
@GET
@Path("/unparseableEvents")
@Produces(MediaType.APPLICATION_JSON)
public Response getUnparseableEvents(
@Context final HttpServletRequest req,
@QueryParam("full") String full
)
{
IndexTaskUtils.datasourceAuthorizationCheck(req, Action.READ, getDataSource(), authorizerMapper);
return Response.ok(doGetUnparseableEvents(full != null)).build();
}
public Map<String, Object> doGetUnparseableEvents(boolean isFullReport)
{
final Map<String, Object> events = new HashMap<>();
if (addDeterminePartitionStatsToReport(isFullReport, ingestionState)) {
events.put(
RowIngestionMeters.DETERMINE_PARTITIONS,
IndexTaskUtils.getReportListFromSavedParseExceptions(
determinePartitionsParseExceptionHandler.getSavedParseExceptionReports()
)
);
}
if (addBuildSegmentStatsToReport(isFullReport, ingestionState)) {
events.put(
RowIngestionMeters.BUILD_SEGMENTS,
IndexTaskUtils.getReportListFromSavedParseExceptions(
buildSegmentsParseExceptionHandler.getSavedParseExceptionReports()
)
);
}
return events;
}
public Map<String, Object> doGetRowStats(boolean isFullReport)
{
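// Row stats are split by ingestion phase (determine partitions vs. build segments), each reported as totals plus moving averages.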
Map<String, Object> returnMap = new HashMap<>();
Map<String, Object> totalsMap = new HashMap<>();
Map<String, Object> averagesMap = new HashMap<>();
if (addDeterminePartitionStatsToReport(isFullReport, ingestionState)) {
totalsMap.put(
RowIngestionMeters.DETERMINE_PARTITIONS,
determinePartitionsMeters.getTotals()
);
averagesMap.put(
RowIngestionMeters.DETERMINE_PARTITIONS,
determinePartitionsMeters.getMovingAverages()
);
}
if (addBuildSegmentStatsToReport(isFullReport, ingestionState)) {
totalsMap.put(
RowIngestionMeters.BUILD_SEGMENTS,
buildSegmentsMeters.getTotals()
);
averagesMap.put(
RowIngestionMeters.BUILD_SEGMENTS,
buildSegmentsMeters.getMovingAverages()
);
}
returnMap.put("totals", totalsMap);
returnMap.put("movingAverages", averagesMap);
return returnMap;
}
@GET
@Path("/rowStats")
@Produces(MediaType.APPLICATION_JSON)
public Response getRowStats(
@Context final HttpServletRequest req,
@QueryParam("full") String full
)
{
IndexTaskUtils.datasourceAuthorizationCheck(req, Action.READ, getDataSource(), authorizerMapper);
return Response.ok(doGetRowStats(full != null)).build();
}
@GET
@Path("/liveReports")
@Produces(MediaType.APPLICATION_JSON)
public Response getLiveReports(
@Context final HttpServletRequest req,
@QueryParam("full") String full
)
{
IndexTaskUtils.datasourceAuthorizationCheck(req, Action.READ, getDataSource(), authorizerMapper);
final TaskReport.ReportMap liveReports = buildLiveIngestionStatsReport(
ingestionState,
getTaskCompletionUnparseableEvents(),
doGetRowStats(full != null)
);
return Response.ok(liveReports).build();
}
@JsonProperty("spec")
public IndexIngestionSpec getIngestionSchema()
{
return ingestionSchema;
}
@Override
public TaskStatus runTask(final TaskToolbox toolbox)
{
try {
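// Overall flow: register a chat handler, set up row meters and parse-exception handlers, determine intervals and
// shardSpecs (locking any discovered intervals), then build and publish the segments.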
// emit metric for sequential batch ingestion mode:
emitMetric(toolbox.getEmitter(), "ingest/count", 1);
log.debug("Found chat handler of class[%s]", toolbox.getChatHandlerProvider().getClass().getName());
if (toolbox.getChatHandlerProvider().get(getId()).isPresent()) {
// This is a workaround for ParallelIndexSupervisorTask to avoid double registering when it runs in the
// sequential mode. See ParallelIndexSupervisorTask.runSequential().
// Note that none of the HTTP endpoints are available in this case. This works only for
// ParallelIndexSupervisorTask because it doesn't support APIs for live ingestion reports.
log.warn("Chat handler is already registered. Skipping chat handler registration.");
} else {
toolbox.getChatHandlerProvider().register(getId(), this, false);
}
this.authorizerMapper = toolbox.getAuthorizerMapper();
this.determinePartitionsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
this.buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
this.determinePartitionsParseExceptionHandler = new ParseExceptionHandler(
determinePartitionsMeters,
ingestionSchema.getTuningConfig().isLogParseExceptions(),
ingestionSchema.getTuningConfig().getMaxParseExceptions(),
ingestionSchema.getTuningConfig().getMaxSavedParseExceptions()
);
this.buildSegmentsParseExceptionHandler = new ParseExceptionHandler(
buildSegmentsMeters,
ingestionSchema.getTuningConfig().isLogParseExceptions(),
ingestionSchema.getTuningConfig().getMaxParseExceptions(),
ingestionSchema.getTuningConfig().getMaxSavedParseExceptions()
);
final boolean determineIntervals = ingestionSchema.getDataSchema()
.getGranularitySpec()
.inputIntervals()
.isEmpty();
final InputSource inputSource = ingestionSchema.getIOConfig()
.getNonNullInputSource(toolbox);
final File tmpDir = toolbox.getIndexingTmpDir();
ingestionState = IngestionState.DETERMINE_PARTITIONS;
// Initialize maxRowsPerSegment and maxTotalRows lazily
final IndexTuningConfig tuningConfig = ingestionSchema.tuningConfig;
final PartitionsSpec partitionsSpec = tuningConfig.getGivenOrDefaultPartitionsSpec();
final PartitionAnalysis partitionAnalysis = determineShardSpecs(
toolbox,
inputSource,
tmpDir,
partitionsSpec
);
final List<Interval> allocateIntervals = new ArrayList<>(partitionAnalysis.getAllIntervalsToIndex());
final DataSchema dataSchema;
if (determineIntervals) {
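// Intervals were discovered from the input, so they must be locked now before segments can be allocated in them.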
final boolean gotLocks = determineLockGranularityAndTryLock(
toolbox.getTaskActionClient(),
allocateIntervals
);
if (!gotLocks) {
throw new ISE("Failed to get locks for intervals[%s]", allocateIntervals);
}
dataSchema = ingestionSchema.getDataSchema().withGranularitySpec(
ingestionSchema.getDataSchema()
.getGranularitySpec()
.withIntervals(JodaUtils.condenseIntervals(allocateIntervals))
);
} else {
dataSchema = ingestionSchema.getDataSchema();
}
ingestionState = IngestionState.BUILD_SEGMENTS;
return generateAndPublishSegments(
toolbox,
dataSchema,
inputSource,
tmpDir,
partitionAnalysis
);
}
catch (Exception e) {
log.error(e, "Encountered exception in %s.", ingestionState);
errorMsg = Throwables.getStackTraceAsString(e);
updateAndWriteCompletionReports(toolbox);
return TaskStatus.failure(
getId(),
errorMsg
);
}
finally {
toolbox.getChatHandlerProvider().unregister(getId());
}
}
private void updateAndWriteCompletionReports(TaskToolbox toolbox)
{
updateAndWriteCompletionReports(toolbox, null, null);
}
private void updateAndWriteCompletionReports(TaskToolbox toolbox, Long segmentsRead, Long segmentsPublished)
{
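// Reports are persisted only when this task runs standalone; when embedded in another task, report writing is left to the parent.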
completionReports = buildIngestionStatsAndContextReport(ingestionState, errorMsg, segmentsRead, segmentsPublished);
if (isStandAloneTask) {
toolbox.getTaskReportFileWriter().write(getId(), completionReports);
}
}
@Override
protected Map<String, Object> getTaskCompletionUnparseableEvents()
{
Map<String, Object> unparseableEventsMap = new HashMap<>();
CircularBuffer<ParseExceptionReport> determinePartitionsParseExceptionReports =
determinePartitionsParseExceptionHandler.getSavedParseExceptionReports();
CircularBuffer<ParseExceptionReport> buildSegmentsParseExceptionReports =
buildSegmentsParseExceptionHandler.getSavedParseExceptionReports();
if (determinePartitionsParseExceptionReports != null || buildSegmentsParseExceptionReports != null) {
unparseableEventsMap.put(
RowIngestionMeters.DETERMINE_PARTITIONS,
IndexTaskUtils.getReportListFromSavedParseExceptions(determinePartitionsParseExceptionReports)
);
unparseableEventsMap.put(
RowIngestionMeters.BUILD_SEGMENTS,
IndexTaskUtils.getReportListFromSavedParseExceptions(buildSegmentsParseExceptionReports)
);
}
return unparseableEventsMap;
}
@Override
protected Map<String, Object> getTaskCompletionRowStats()
{
Map<String, Object> metrics = new HashMap<>();
metrics.put(
RowIngestionMeters.DETERMINE_PARTITIONS,
determinePartitionsMeters.getTotals()
);
metrics.put(
RowIngestionMeters.BUILD_SEGMENTS,
buildSegmentsMeters.getTotals()
);
return metrics;
}
/**
* Determines intervals and shardSpecs for the input data. This method first checks whether it must determine
* intervals and shardSpecs by itself. Intervals must be determined if they are not specified in
* {@link GranularitySpec}. ShardSpecs must be determined if perfect rollup must be guaranteed even though the
* number of shards is not specified in {@link IndexTuningConfig}.
*
* If neither intervals nor shardSpecs have to be determined, this method simply returns {@link ShardSpecs} for the
* given intervals. Here, if {@link HashedPartitionsSpec#numShards} is not specified, {@link NumberedShardSpec} is
* used.
*
* If either intervals or shardSpecs need to be determined, this method reads the entire input to determine the
* missing one. If perfect rollup must be guaranteed, {@link HashBasedNumberedShardSpec} is used for hash
* partitioning of the input data. In the future we may also want to support single-dimension partitioning.
*
* @return a map indicating how many shardSpecs need to be created per interval.
*/
private PartitionAnalysis determineShardSpecs(
final TaskToolbox toolbox,
final InputSource inputSource,
final File tmpDir,
@Nonnull final PartitionsSpec partitionsSpec
) throws IOException
{
final ObjectMapper jsonMapper = toolbox.getJsonMapper();
final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
// Must determine intervals if unknown, since we acquire all locks before processing any data.
final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
// Must determine partitions if rollup is guaranteed and the user didn't provide a specific value.
final boolean determineNumPartitions = partitionsSpec.needsDeterminePartitions(false);
// If we were given the number of shards per interval and the intervals themselves, we don't need to scan the data.
if (!determineNumPartitions && !determineIntervals) {
log.info("Skipping determine partition scan");
if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
return PartialHashSegmentGenerateTask.createHashPartitionAnalysisFromPartitionsSpec(
granularitySpec,
(HashedPartitionsSpec) partitionsSpec,
null // not overriding numShards
);
} else if (partitionsSpec.getType() == SecondaryPartitionType.LINEAR) {
return createLinearPartitionAnalysis(granularitySpec, (DynamicPartitionsSpec) partitionsSpec);
} else {
throw new UOE("%s", partitionsSpec.getClass().getName());
}
} else {
// determine intervals containing data and prime HLL collectors
log.info("Determining intervals and shardSpecs");
return createShardSpecsFromInput(
jsonMapper,
ingestionSchema,
inputSource,
tmpDir,
granularitySpec,
partitionsSpec,
determineIntervals
);
}
}
private static boolean addDeterminePartitionStatsToReport(boolean isFullReport, IngestionState ingestionState)
{
return isFullReport
|| ingestionState == IngestionState.DETERMINE_PARTITIONS;
}
private static LinearPartitionAnalysis createLinearPartitionAnalysis(
GranularitySpec granularitySpec,
@Nonnull DynamicPartitionsSpec partitionsSpec
)
{
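// Dynamic (linear) partitioning always starts with a single bucket per interval; more segments are created at ingest time as row limits are reached.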
final Iterable<Interval> intervals = granularitySpec.sortedBucketIntervals();
final int numBucketsPerInterval = 1;
final LinearPartitionAnalysis partitionAnalysis = new LinearPartitionAnalysis(partitionsSpec);
intervals.forEach(interval -> partitionAnalysis.updateBucket(interval, numBucketsPerInterval));
return partitionAnalysis;
}
private PartitionAnalysis createShardSpecsFromInput(
ObjectMapper jsonMapper,
IndexIngestionSpec ingestionSchema,
InputSource inputSource,
File tmpDir,
GranularitySpec granularitySpec,
@Nonnull PartitionsSpec partitionsSpec,
boolean determineIntervals
) throws IOException
{
assert partitionsSpec.getType() != SecondaryPartitionType.RANGE;
long determineShardSpecsStartMillis = System.currentTimeMillis();
final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = collectIntervalsAndShardSpecs(
jsonMapper,
ingestionSchema,
inputSource,
tmpDir,
granularitySpec,
partitionsSpec,
determineIntervals
);
final PartitionAnalysis partitionAnalysis;
if (partitionsSpec.getType() == SecondaryPartitionType.LINEAR) {
partitionAnalysis = new LinearPartitionAnalysis((DynamicPartitionsSpec) partitionsSpec);
} else if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
partitionAnalysis = new HashPartitionAnalysis((HashedPartitionsSpec) partitionsSpec);
} else {
throw new UOE("%s", partitionsSpec.getClass().getName());
}
for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : hllCollectors.entrySet()) {
final Interval interval = entry.getKey();
final int numBucketsPerInterval;
if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
final HashedPartitionsSpec hashedPartitionsSpec = (HashedPartitionsSpec) partitionsSpec;
final HyperLogLogCollector collector = entry.getValue().orNull();
if (partitionsSpec.needsDeterminePartitions(false)) {
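// Size the hash buckets from the HLL cardinality estimate: one bucket per maxRowsPerSegment rows (or the default), rounded up.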
final long numRows = Preconditions.checkNotNull(collector, "HLL collector").estimateCardinalityRound();
final int nonNullMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() == null
? PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT
: partitionsSpec.getMaxRowsPerSegment();
numBucketsPerInterval = (int) Math.ceil((double) numRows / nonNullMaxRowsPerSegment);
log.info(
"Estimated [%,d] rows of data for interval [%s], creating [%,d] shards",
numRows,
interval,
numBucketsPerInterval
);
} else {
numBucketsPerInterval = hashedPartitionsSpec.getNumShards() == null ? 1 : hashedPartitionsSpec.getNumShards();
log.info("Creating [%,d] buckets for interval [%s]", numBucketsPerInterval, interval);
}
} else {
numBucketsPerInterval = 1;
}
partitionAnalysis.updateBucket(interval, numBucketsPerInterval);
}
log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
return partitionAnalysis;
}
private Map<Interval, Optional<HyperLogLogCollector>> collectIntervalsAndShardSpecs(
ObjectMapper jsonMapper,
IndexIngestionSpec ingestionSchema,
InputSource inputSource,
File tmpDir,
GranularitySpec granularitySpec,
@Nonnull PartitionsSpec partitionsSpec,
boolean determineIntervals
) throws IOException
{
final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = new TreeMap<>(
Comparators.intervalsByStartThenEnd()
);
final Granularity queryGranularity = granularitySpec.getQueryGranularity();
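// Single pass over the input: bucket each row's timestamp into an interval and, if partitions must be determined,
// feed that interval's HLL collector.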
try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
tmpDir,
ingestionSchema.getDataSchema(),
inputSource,
inputSource.needsFormat() ? getInputFormat(ingestionSchema) : null,
allowNonNullRowsWithinInputIntervalsOf(granularitySpec),
determinePartitionsMeters,
determinePartitionsParseExceptionHandler
)) {
while (inputRowIterator.hasNext()) {
final InputRow inputRow = inputRowIterator.next();
final Interval interval;
if (determineIntervals) {
interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
} else {
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
// this interval must exist since it passed the rowFilter
assert optInterval.isPresent();
interval = optInterval.get();
}
if (partitionsSpec.needsDeterminePartitions(false)) {
hllCollectors.computeIfAbsent(interval, intv -> Optional.of(HyperLogLogCollector.makeLatestCollector()));
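// Estimate post-rollup cardinality by adding the hashed (query-granular timestamp, dimensions) group key to the interval's HLL collector.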
List<Object> groupKey = Rows.toGroupKey(
queryGranularity.bucketStart(inputRow.getTimestamp()).getMillis(),
inputRow
);
hllCollectors.get(interval).get()
.add(HASH_FUNCTION.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());