/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.indexing.common.task;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.util.concurrent.ListenableFuture;
import org.apache.druid.data.input.InputFormat;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.InputSource;
import org.apache.druid.data.input.Rows;
import org.apache.druid.hll.HyperLogLogCollector;
import org.apache.druid.indexer.Checks;
import org.apache.druid.indexer.IngestionState;
import org.apache.druid.indexer.Property;
import org.apache.druid.indexer.TaskStatus;
import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
import org.apache.druid.indexer.partitions.PartitionsSpec;
import org.apache.druid.indexer.partitions.SecondaryPartitionType;
import org.apache.druid.indexer.report.TaskReport;
import org.apache.druid.indexing.common.TaskLockType;
import org.apache.druid.indexing.common.TaskRealtimeMetricsMonitorBuilder;
import org.apache.druid.indexing.common.TaskToolbox;
import org.apache.druid.indexing.common.actions.TaskActionClient;
import org.apache.druid.indexing.common.stats.TaskRealtimeMetricsMonitor;
import org.apache.druid.indexing.common.task.batch.parallel.PartialHashSegmentGenerateTask;
import org.apache.druid.indexing.common.task.batch.parallel.TombstoneHelper;
import org.apache.druid.indexing.common.task.batch.parallel.iterator.DefaultIndexTaskInputRowIteratorBuilder;
import org.apache.druid.indexing.common.task.batch.partition.CompletePartitionAnalysis;
import org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis;
import org.apache.druid.indexing.common.task.batch.partition.LinearPartitionAnalysis;
import org.apache.druid.indexing.common.task.batch.partition.PartitionAnalysis;
import org.apache.druid.indexing.input.DruidInputSource;
import org.apache.druid.indexing.input.TaskInputSource;
import org.apache.druid.indexing.overlord.sampler.InputSourceSampler;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.JodaUtils;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.UOE;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.Comparators;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.apache.druid.segment.IndexMerger;
import org.apache.druid.segment.IndexSpec;
import org.apache.druid.segment.SegmentSchemaMapping;
import org.apache.druid.segment.incremental.AppendableIndexSpec;
import org.apache.druid.segment.incremental.ParseExceptionHandler;
import org.apache.druid.segment.incremental.ParseExceptionReport;
import org.apache.druid.segment.incremental.RowIngestionMeters;
import org.apache.druid.segment.indexing.BatchIOConfig;
import org.apache.druid.segment.indexing.DataSchema;
import org.apache.druid.segment.indexing.IngestionSpec;
import org.apache.druid.segment.indexing.TuningConfig;
import org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec;
import org.apache.druid.segment.indexing.granularity.GranularitySpec;
import org.apache.druid.segment.realtime.ChatHandler;
import org.apache.druid.segment.realtime.SegmentGenerationMetrics;
import org.apache.druid.segment.realtime.appenderator.Appenderator;
import org.apache.druid.segment.realtime.appenderator.AppenderatorConfig;
import org.apache.druid.segment.realtime.appenderator.BaseAppenderatorDriver;
import org.apache.druid.segment.realtime.appenderator.BatchAppenderatorDriver;
import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec;
import org.apache.druid.segment.realtime.appenderator.SegmentsAndCommitMetadata;
import org.apache.druid.segment.realtime.appenderator.TransactionalSegmentPublisher;
import org.apache.druid.segment.writeout.SegmentWriteOutMediumFactory;
import org.apache.druid.server.security.Action;
import org.apache.druid.server.security.AuthorizerMapper;
import org.apache.druid.server.security.Resource;
import org.apache.druid.server.security.ResourceAction;
import org.apache.druid.server.security.ResourceType;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.partition.HashBasedNumberedShardSpec;
import org.apache.druid.timeline.partition.NumberedShardSpec;
import org.apache.druid.utils.CircularBuffer;
import org.checkerframework.checker.nullness.qual.MonotonicNonNull;
import org.joda.time.Interval;
import org.joda.time.Period;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.servlet.http.HttpServletRequest;
import javax.ws.rs.GET;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import java.util.stream.Collectors;
public class IndexTask extends AbstractBatchIndexTask implements ChatHandler, PendingSegmentAllocatingTask
{
public static final HashFunction HASH_FUNCTION = Hashing.murmur3_128();
public static final String TYPE = "index";
private static final Logger log = new Logger(IndexTask.class);
private static String makeGroupId(IndexIngestionSpec ingestionSchema, IngestionMode ingestionMode)
{
return makeGroupId(ingestionSchema.dataSchema.getDataSource(), ingestionMode);
}
private static String makeGroupId(String dataSource, IngestionMode ingestionMode)
{
if (ingestionMode == IngestionMode.APPEND) {
// Shared locking group for all tasks that append, since they are OK to run concurrently.
return StringUtils.format("%s_append_%s", TYPE, dataSource);
} else {
// Return null, one locking group per task.
return null;
}
}
private final String baseSequenceName;
private final IndexIngestionSpec ingestionSchema;
private IngestionState ingestionState;
private boolean isStandAloneTask;
@MonotonicNonNull
private ParseExceptionHandler determinePartitionsParseExceptionHandler;
@MonotonicNonNull
private ParseExceptionHandler buildSegmentsParseExceptionHandler;
@MonotonicNonNull
private AuthorizerMapper authorizerMapper;
@MonotonicNonNull
private RowIngestionMeters determinePartitionsMeters;
@MonotonicNonNull
private RowIngestionMeters buildSegmentsMeters;
@Nullable
private String errorMsg;
private TaskReport.ReportMap completionReports;
@JsonCreator
public IndexTask(
@JsonProperty("id") final String id,
@JsonProperty("resource") final TaskResource taskResource,
@JsonProperty("spec") final IndexIngestionSpec ingestionSchema,
@JsonProperty("context") final Map context
)
{
this(
id,
makeGroupId(
ingestionSchema,
computeBatchIngestionMode(ingestionSchema.getIOConfig())
),
taskResource,
ingestionSchema.dataSchema.getDataSource(),
null,
ingestionSchema,
context,
-1,
true
);
}
/**
* @param isStandAloneTask whether IndexTask.run() runs on its own or as part of another task; when this is not a
* standalone task, writing completion reports and cleanup are skipped
*/
public IndexTask(
String id,
String groupId,
TaskResource resource,
String dataSource,
@Nullable String baseSequenceName,
IndexIngestionSpec ingestionSchema,
Map<String, Object> context,
int maxAllowedLockCount,
boolean isStandAloneTask
)
{
super(
getOrMakeId(id, TYPE, dataSource),
groupId,
resource,
dataSource,
context,
maxAllowedLockCount,
computeBatchIngestionMode(ingestionSchema.getIOConfig())
);
this.baseSequenceName = baseSequenceName == null ? getId() : baseSequenceName;
this.ingestionSchema = ingestionSchema;
this.ingestionState = IngestionState.NOT_STARTED;
this.isStandAloneTask = isStandAloneTask;
}
@Override
public String getType()
{
return TYPE;
}
@Override
public boolean isReady(TaskActionClient taskActionClient) throws Exception
{
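// Only dynamic (LINEAR) and hashed (HASH) secondary partitioning are accepted here; any other partitionsSpec type is rejected up front.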
final IndexTuningConfig tuningConfig = getIngestionSchema().getTuningConfig();
if (tuningConfig != null && tuningConfig.getPartitionsSpec() != null) {
if (tuningConfig.getPartitionsSpec().getType() != SecondaryPartitionType.LINEAR
&& tuningConfig.getPartitionsSpec().getType() != SecondaryPartitionType.HASH) {
throw new UOE("partitionsSpec[%s] is not supported", tuningConfig.getPartitionsSpec().getClass().getName());
}
}
return determineLockGranularityAndTryLock(
taskActionClient,
ingestionSchema.dataSchema.getGranularitySpec().inputIntervals()
);
}
@Override
public boolean requireLockExistingSegments()
{
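// Existing segments must be locked for overwrites and for guaranteed (perfect) rollup; a best-effort append can skip it.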
return isGuaranteedRollup(getIngestionMode(), ingestionSchema.tuningConfig)
|| (getIngestionMode() != IngestionMode.APPEND);
}
@Override
public List<DataSegment> findSegmentsToLock(TaskActionClient taskActionClient, List<Interval> intervals)
throws IOException
{
return findInputSegments(
getDataSource(),
taskActionClient,
intervals
);
}
@Override
public boolean isPerfectRollup()
{
return isGuaranteedRollup(getIngestionMode(), ingestionSchema.tuningConfig);
}
@Nullable
@Override
public Granularity getSegmentGranularity()
{
final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
if (granularitySpec instanceof ArbitraryGranularitySpec) {
return null;
} else {
return granularitySpec.getSegmentGranularity();
}
}
@Override
public String getTaskAllocatorId()
{
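// The group id doubles as the pending-segment allocator id, so tasks in the same group share allocations.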
return getGroupId();
}
@Nonnull
@JsonIgnore
@Override
public Set<ResourceAction> getInputSourceResources()
{
return getIngestionSchema().getIOConfig().getInputSource() != null ?
getIngestionSchema().getIOConfig().getInputSource().getTypes()
.stream()
.map(i -> new ResourceAction(new Resource(i, ResourceType.EXTERNAL), Action.READ))
.collect(Collectors.toSet()) :
ImmutableSet.of();
}
@Nullable
@JsonIgnore
public TaskReport.ReportMap getCompletionReports()
{
return completionReports;
}
@GET
@Path("/unparseableEvents")
@Produces(MediaType.APPLICATION_JSON)
public Response getUnparseableEvents(
@Context final HttpServletRequest req,
@QueryParam("full") String full
)
{
IndexTaskUtils.datasourceAuthorizationCheck(req, Action.READ, getDataSource(), authorizerMapper);
return Response.ok(doGetUnparseableEvents(full != null)).build();
}
public Map<String, Object> doGetUnparseableEvents(boolean isFullReport)
{
final Map<String, Object> events = new HashMap<>();
if (addDeterminePartitionStatsToReport(isFullReport, ingestionState)) {
events.put(
RowIngestionMeters.DETERMINE_PARTITIONS,
IndexTaskUtils.getReportListFromSavedParseExceptions(
determinePartitionsParseExceptionHandler.getSavedParseExceptionReports()
)
);
}
if (addBuildSegmentStatsToReport(isFullReport, ingestionState)) {
events.put(
RowIngestionMeters.BUILD_SEGMENTS,
IndexTaskUtils.getReportListFromSavedParseExceptions(
buildSegmentsParseExceptionHandler.getSavedParseExceptionReports()
)
);
}
return events;
}
public Map<String, Object> doGetRowStats(boolean isFullReport)
{
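// Row stats are split by ingestion phase (determine partitions vs. build segments), each reported as totals plus moving averages.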
Map<String, Object> returnMap = new HashMap<>();
Map<String, Object> totalsMap = new HashMap<>();
Map<String, Object> averagesMap = new HashMap<>();
if (addDeterminePartitionStatsToReport(isFullReport, ingestionState)) {
totalsMap.put(
RowIngestionMeters.DETERMINE_PARTITIONS,
determinePartitionsMeters.getTotals()
);
averagesMap.put(
RowIngestionMeters.DETERMINE_PARTITIONS,
determinePartitionsMeters.getMovingAverages()
);
}
if (addBuildSegmentStatsToReport(isFullReport, ingestionState)) {
totalsMap.put(
RowIngestionMeters.BUILD_SEGMENTS,
buildSegmentsMeters.getTotals()
);
averagesMap.put(
RowIngestionMeters.BUILD_SEGMENTS,
buildSegmentsMeters.getMovingAverages()
);
}
returnMap.put("totals", totalsMap);
returnMap.put("movingAverages", averagesMap);
return returnMap;
}
@GET
@Path("/rowStats")
@Produces(MediaType.APPLICATION_JSON)
public Response getRowStats(
@Context final HttpServletRequest req,
@QueryParam("full") String full
)
{
IndexTaskUtils.datasourceAuthorizationCheck(req, Action.READ, getDataSource(), authorizerMapper);
return Response.ok(doGetRowStats(full != null)).build();
}
@GET
@Path("/liveReports")
@Produces(MediaType.APPLICATION_JSON)
public Response getLiveReports(
@Context final HttpServletRequest req,
@QueryParam("full") String full
)
{
IndexTaskUtils.datasourceAuthorizationCheck(req, Action.READ, getDataSource(), authorizerMapper);
final TaskReport.ReportMap liveReports = buildLiveIngestionStatsReport(
ingestionState,
getTaskCompletionUnparseableEvents(),
doGetRowStats(full != null)
);
return Response.ok(liveReports).build();
}
@JsonProperty("spec")
public IndexIngestionSpec getIngestionSchema()
{
return ingestionSchema;
}
@Override
public TaskStatus runTask(final TaskToolbox toolbox)
{
try {
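// Overall flow: register a chat handler, set up row meters and parse-exception handlers, determine intervals and
// shardSpecs (locking any discovered intervals), then build and publish the segments.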
// emit metric for sequential batch ingestion mode:
emitMetric(toolbox.getEmitter(), "ingest/count", 1);
log.debug("Found chat handler of class[%s]", toolbox.getChatHandlerProvider().getClass().getName());
if (toolbox.getChatHandlerProvider().get(getId()).isPresent()) {
// This is a workaround for ParallelIndexSupervisorTask to avoid double registering when it runs in the
// sequential mode. See ParallelIndexSupervisorTask.runSequential().
// Note that none of the HTTP endpoints are available in this case. This works only for
// ParallelIndexSupervisorTask because it doesn't support APIs for live ingestion reports.
log.warn("Chat handler is already registered. Skipping chat handler registration.");
} else {
toolbox.getChatHandlerProvider().register(getId(), this, false);
}
this.authorizerMapper = toolbox.getAuthorizerMapper();
this.determinePartitionsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
this.buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
this.determinePartitionsParseExceptionHandler = new ParseExceptionHandler(
determinePartitionsMeters,
ingestionSchema.getTuningConfig().isLogParseExceptions(),
ingestionSchema.getTuningConfig().getMaxParseExceptions(),
ingestionSchema.getTuningConfig().getMaxSavedParseExceptions()
);
this.buildSegmentsParseExceptionHandler = new ParseExceptionHandler(
buildSegmentsMeters,
ingestionSchema.getTuningConfig().isLogParseExceptions(),
ingestionSchema.getTuningConfig().getMaxParseExceptions(),
ingestionSchema.getTuningConfig().getMaxSavedParseExceptions()
);
final boolean determineIntervals = ingestionSchema.getDataSchema()
.getGranularitySpec()
.inputIntervals()
.isEmpty();
final InputSource inputSource = ingestionSchema.getIOConfig()
.getNonNullInputSource(toolbox);
final File tmpDir = toolbox.getIndexingTmpDir();
ingestionState = IngestionState.DETERMINE_PARTITIONS;
// Initialize maxRowsPerSegment and maxTotalRows lazily
final IndexTuningConfig tuningConfig = ingestionSchema.tuningConfig;
final PartitionsSpec partitionsSpec = tuningConfig.getGivenOrDefaultPartitionsSpec();
final PartitionAnalysis partitionAnalysis = determineShardSpecs(
toolbox,
inputSource,
tmpDir,
partitionsSpec
);
final List<Interval> allocateIntervals = new ArrayList<>(partitionAnalysis.getAllIntervalsToIndex());
final DataSchema dataSchema;
if (determineIntervals) {
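// Intervals were discovered from the input, so they must be locked now before segments can be allocated in them.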
final boolean gotLocks = determineLockGranularityAndTryLock(
toolbox.getTaskActionClient(),
allocateIntervals
);
if (!gotLocks) {
throw new ISE("Failed to get locks for intervals[%s]", allocateIntervals);
}
dataSchema = ingestionSchema.getDataSchema().withGranularitySpec(
ingestionSchema.getDataSchema()
.getGranularitySpec()
.withIntervals(JodaUtils.condenseIntervals(allocateIntervals))
);
} else {
dataSchema = ingestionSchema.getDataSchema();
}
ingestionState = IngestionState.BUILD_SEGMENTS;
return generateAndPublishSegments(
toolbox,
dataSchema,
inputSource,
tmpDir,
partitionAnalysis
);
}
catch (Exception e) {
log.error(e, "Encountered exception in %s.", ingestionState);
errorMsg = Throwables.getStackTraceAsString(e);
updateAndWriteCompletionReports(toolbox);
return TaskStatus.failure(
getId(),
errorMsg
);
}
finally {
toolbox.getChatHandlerProvider().unregister(getId());
}
}
private void updateAndWriteCompletionReports(TaskToolbox toolbox)
{
updateAndWriteCompletionReports(toolbox, null, null);
}
private void updateAndWriteCompletionReports(TaskToolbox toolbox, Long segmentsRead, Long segmentsPublished)
{
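// Reports are persisted only when this task runs standalone; when embedded in another task, report writing is left to the parent.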
completionReports = buildIngestionStatsAndContextReport(ingestionState, errorMsg, segmentsRead, segmentsPublished);
if (isStandAloneTask) {
toolbox.getTaskReportFileWriter().write(getId(), completionReports);
}
}
@Override
protected Map<String, Object> getTaskCompletionUnparseableEvents()
{
Map<String, Object> unparseableEventsMap = new HashMap<>();
CircularBuffer<ParseExceptionReport> determinePartitionsParseExceptionReports =
determinePartitionsParseExceptionHandler.getSavedParseExceptionReports();
CircularBuffer<ParseExceptionReport> buildSegmentsParseExceptionReports =
buildSegmentsParseExceptionHandler.getSavedParseExceptionReports();
if (determinePartitionsParseExceptionReports != null || buildSegmentsParseExceptionReports != null) {
unparseableEventsMap.put(
RowIngestionMeters.DETERMINE_PARTITIONS,
IndexTaskUtils.getReportListFromSavedParseExceptions(determinePartitionsParseExceptionReports)
);
unparseableEventsMap.put(
RowIngestionMeters.BUILD_SEGMENTS,
IndexTaskUtils.getReportListFromSavedParseExceptions(buildSegmentsParseExceptionReports)
);
}
return unparseableEventsMap;
}
@Override
protected Map<String, Object> getTaskCompletionRowStats()
{
Map<String, Object> metrics = new HashMap<>();
metrics.put(
RowIngestionMeters.DETERMINE_PARTITIONS,
determinePartitionsMeters.getTotals()
);
metrics.put(
RowIngestionMeters.BUILD_SEGMENTS,
buildSegmentsMeters.getTotals()
);
return metrics;
}
/**
* Determines intervals and shardSpecs for the input data. This method first checks whether it must determine
* intervals and shardSpecs by itself. Intervals must be determined if they are not specified in
* {@link GranularitySpec}. ShardSpecs must be determined if perfect rollup must be guaranteed even though the
* number of shards is not specified in {@link IndexTuningConfig}.
*
* If neither intervals nor shardSpecs have to be determined, this method simply returns {@link ShardSpecs} for the
* given intervals. Here, if {@link HashedPartitionsSpec#numShards} is not specified, {@link NumberedShardSpec} is
* used.
*
* If either intervals or shardSpecs need to be determined, this method reads the entire input to determine the
* missing one. If perfect rollup must be guaranteed, {@link HashBasedNumberedShardSpec} is used for hash
* partitioning of the input data. In the future we may also want to support single-dimension partitioning.
*
* @return a map indicating how many shardSpecs need to be created per interval.
*/
private PartitionAnalysis determineShardSpecs(
final TaskToolbox toolbox,
final InputSource inputSource,
final File tmpDir,
@Nonnull final PartitionsSpec partitionsSpec
) throws IOException
{
final ObjectMapper jsonMapper = toolbox.getJsonMapper();
final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
// Must determine intervals if unknown, since we acquire all locks before processing any data.
final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
// Must determine partitions if rollup is guaranteed and the user didn't provide a specific value.
final boolean determineNumPartitions = partitionsSpec.needsDeterminePartitions(false);
// If we were given the number of shards per interval and the intervals themselves, we don't need to scan the data.
if (!determineNumPartitions && !determineIntervals) {
log.info("Skipping determine partition scan");
if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
return PartialHashSegmentGenerateTask.createHashPartitionAnalysisFromPartitionsSpec(
granularitySpec,
(HashedPartitionsSpec) partitionsSpec,
null // not overriding numShards
);
} else if (partitionsSpec.getType() == SecondaryPartitionType.LINEAR) {
return createLinearPartitionAnalysis(granularitySpec, (DynamicPartitionsSpec) partitionsSpec);
} else {
throw new UOE("%s", partitionsSpec.getClass().getName());
}
} else {
// determine intervals containing data and prime HLL collectors
log.info("Determining intervals and shardSpecs");
return createShardSpecsFromInput(
jsonMapper,
ingestionSchema,
inputSource,
tmpDir,
granularitySpec,
partitionsSpec,
determineIntervals
);
}
}
private static boolean addDeterminePartitionStatsToReport(boolean isFullReport, IngestionState ingestionState)
{
return isFullReport
|| ingestionState == IngestionState.DETERMINE_PARTITIONS;
}
private static LinearPartitionAnalysis createLinearPartitionAnalysis(
GranularitySpec granularitySpec,
@Nonnull DynamicPartitionsSpec partitionsSpec
)
{
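// Dynamic (linear) partitioning always starts with a single bucket per interval; more segments are created at ingest time as row limits are reached.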
final Iterable<Interval> intervals = granularitySpec.sortedBucketIntervals();
final int numBucketsPerInterval = 1;
final LinearPartitionAnalysis partitionAnalysis = new LinearPartitionAnalysis(partitionsSpec);
intervals.forEach(interval -> partitionAnalysis.updateBucket(interval, numBucketsPerInterval));
return partitionAnalysis;
}
private PartitionAnalysis createShardSpecsFromInput(
ObjectMapper jsonMapper,
IndexIngestionSpec ingestionSchema,
InputSource inputSource,
File tmpDir,
GranularitySpec granularitySpec,
@Nonnull PartitionsSpec partitionsSpec,
boolean determineIntervals
) throws IOException
{
assert partitionsSpec.getType() != SecondaryPartitionType.RANGE;
long determineShardSpecsStartMillis = System.currentTimeMillis();
final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = collectIntervalsAndShardSpecs(
jsonMapper,
ingestionSchema,
inputSource,
tmpDir,
granularitySpec,
partitionsSpec,
determineIntervals
);
final PartitionAnalysis partitionAnalysis;
if (partitionsSpec.getType() == SecondaryPartitionType.LINEAR) {
partitionAnalysis = new LinearPartitionAnalysis((DynamicPartitionsSpec) partitionsSpec);
} else if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
partitionAnalysis = new HashPartitionAnalysis((HashedPartitionsSpec) partitionsSpec);
} else {
throw new UOE("%s", partitionsSpec.getClass().getName());
}
for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : hllCollectors.entrySet()) {
final Interval interval = entry.getKey();
final int numBucketsPerInterval;
if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
final HashedPartitionsSpec hashedPartitionsSpec = (HashedPartitionsSpec) partitionsSpec;
final HyperLogLogCollector collector = entry.getValue().orNull();
if (partitionsSpec.needsDeterminePartitions(false)) {
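// Size the hash buckets from the HLL cardinality estimate: one bucket per maxRowsPerSegment rows (or the default), rounded up.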
final long numRows = Preconditions.checkNotNull(collector, "HLL collector").estimateCardinalityRound();
final int nonNullMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() == null
? PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT
: partitionsSpec.getMaxRowsPerSegment();
numBucketsPerInterval = (int) Math.ceil((double) numRows / nonNullMaxRowsPerSegment);
log.info(
"Estimated [%,d] rows of data for interval [%s], creating [%,d] shards",
numRows,
interval,
numBucketsPerInterval
);
} else {
numBucketsPerInterval = hashedPartitionsSpec.getNumShards() == null ? 1 : hashedPartitionsSpec.getNumShards();
log.info("Creating [%,d] buckets for interval [%s]", numBucketsPerInterval, interval);
}
} else {
numBucketsPerInterval = 1;
}
partitionAnalysis.updateBucket(interval, numBucketsPerInterval);
}
log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
return partitionAnalysis;
}
private Map<Interval, Optional<HyperLogLogCollector>> collectIntervalsAndShardSpecs(
ObjectMapper jsonMapper,
IndexIngestionSpec ingestionSchema,
InputSource inputSource,
File tmpDir,
GranularitySpec granularitySpec,
@Nonnull PartitionsSpec partitionsSpec,
boolean determineIntervals
) throws IOException
{
final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = new TreeMap<>(
Comparators.intervalsByStartThenEnd()
);
final Granularity queryGranularity = granularitySpec.getQueryGranularity();
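// Single pass over the input: bucket each row's timestamp into an interval and, if partitions must be determined,
// feed that interval's HLL collector.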
try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
tmpDir,
ingestionSchema.getDataSchema(),
inputSource,
inputSource.needsFormat() ? getInputFormat(ingestionSchema) : null,
allowNonNullRowsWithinInputIntervalsOf(granularitySpec),
determinePartitionsMeters,
determinePartitionsParseExceptionHandler
)) {
while (inputRowIterator.hasNext()) {
final InputRow inputRow = inputRowIterator.next();
final Interval interval;
if (determineIntervals) {
interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
} else {
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
// this interval must exist since it passed the rowFilter
assert optInterval.isPresent();
interval = optInterval.get();
}
if (partitionsSpec.needsDeterminePartitions(false)) {
hllCollectors.computeIfAbsent(interval, intv -> Optional.of(HyperLogLogCollector.makeLatestCollector()));
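// Estimate post-rollup cardinality by adding the hashed (query-granular timestamp, dimensions) group key to the interval's HLL collector.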
List<Object> groupKey = Rows.toGroupKey(
queryGranularity.bucketStart(inputRow.getTimestamp()).getMillis(),
inputRow
);
hllCollectors.get(interval).get()
.add(HASH_FUNCTION.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());