// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.indexing.common.task;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.util.concurrent.ListenableFuture;
import io.druid.data.input.Firehose;
import io.druid.data.input.FirehoseFactory;
import io.druid.data.input.InputRow;
import io.druid.data.input.Rows;
import io.druid.hll.HyperLogLogCollector;
import io.druid.indexing.appenderator.ActionBasedSegmentAllocator;
import io.druid.indexing.appenderator.ActionBasedUsedSegmentChecker;
import io.druid.indexing.common.TaskLock;
import io.druid.indexing.common.TaskStatus;
import io.druid.indexing.common.TaskToolbox;
import io.druid.indexing.common.actions.SegmentTransactionalInsertAction;
import io.druid.indexing.common.actions.TaskActionClient;
import io.druid.indexing.firehose.IngestSegmentFirehoseFactory;
import io.druid.java.util.common.ISE;
import io.druid.java.util.common.JodaUtils;
import io.druid.java.util.common.StringUtils;
import io.druid.java.util.common.granularity.Granularity;
import io.druid.java.util.common.guava.Comparators;
import io.druid.java.util.common.logger.Logger;
import io.druid.java.util.common.parsers.ParseException;
import io.druid.query.DruidMetrics;
import io.druid.segment.IndexSpec;
import io.druid.segment.indexing.DataSchema;
import io.druid.segment.indexing.IOConfig;
import io.druid.segment.indexing.IngestionSpec;
import io.druid.segment.indexing.RealtimeIOConfig;
import io.druid.segment.indexing.TuningConfig;
import io.druid.segment.indexing.granularity.GranularitySpec;
import io.druid.segment.realtime.FireDepartment;
import io.druid.segment.realtime.FireDepartmentMetrics;
import io.druid.segment.realtime.RealtimeMetricsMonitor;
import io.druid.segment.realtime.appenderator.Appenderator;
import io.druid.segment.realtime.appenderator.AppenderatorConfig;
import io.druid.segment.realtime.appenderator.BaseAppenderatorDriver;
import io.druid.segment.realtime.appenderator.AppenderatorDriverAddResult;
import io.druid.segment.realtime.appenderator.Appenderators;
import io.druid.segment.realtime.appenderator.BatchAppenderatorDriver;
import io.druid.segment.realtime.appenderator.SegmentAllocator;
import io.druid.segment.realtime.appenderator.SegmentIdentifier;
import io.druid.segment.realtime.appenderator.SegmentsAndMetadata;
import io.druid.segment.realtime.appenderator.TransactionalSegmentPublisher;
import io.druid.segment.writeout.SegmentWriteOutMediumFactory;
import io.druid.timeline.DataSegment;
import io.druid.timeline.partition.HashBasedNumberedShardSpec;
import io.druid.timeline.partition.NoneShardSpec;
import io.druid.timeline.partition.NumberedShardSpec;
import io.druid.timeline.partition.ShardSpec;
import org.codehaus.plexus.util.FileUtils;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.joda.time.Period;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
public class IndexTask extends AbstractTask
{
// Class-wide logger for IndexTask.
private static final Logger log = new Logger(IndexTask.class);
// 128-bit murmur3 hash function.
// NOTE(review): no use of this field is visible in this part of the file — confirm where it is consumed.
private static final HashFunction hashFunction = Hashing.murmur3_128();
// Task type name: returned by getType(), passed to getOrMakeId(), and used as the prefix of the shared append group id.
private static final String TYPE = "index";
/**
 * Derives the locking group id for a task directly from its ingestion spec.
 *
 * @param ingestionSchema spec whose ioConfig supplies the append flag and whose dataSchema supplies the dataSource
 *
 * @return the group id computed by {@link #makeGroupId(boolean, String)}; may be null
 */
private static String makeGroupId(IndexIngestionSpec ingestionSchema)
{
  final boolean appending = ingestionSchema.ioConfig.appendToExisting;
  final String dataSource = ingestionSchema.dataSchema.getDataSource();
  return makeGroupId(appending, dataSource);
}
/**
 * Builds the locking group id for an index task.
 *
 * @param isAppendToExisting whether the task appends to existing data
 * @param dataSource         name of the dataSource being written
 *
 * @return a group id shared by all appending tasks of the dataSource, or null when the task is not appending
 */
private static String makeGroupId(boolean isAppendToExisting, String dataSource)
{
  if (!isAppendToExisting) {
    // Null means there is no shared group: one locking group per task.
    return null;
  }
  // Appending tasks are OK to run concurrently, so they all share a single locking group.
  return StringUtils.format("%s_append_%s", TYPE, dataSource);
}
// Full ingestion spec of this task (data schema, IO config, tuning config).
// The field itself is hidden from Jackson; serialization goes through the "spec" property
// declared on the @JsonCreator constructor and on getIngestionSchema().
@JsonIgnore
private final IndexIngestionSpec ingestionSchema;
/**
 * JSON entry point. Derives the group id and the dataSource from the ingestion spec and delegates to the
 * package-private constructor.
 *
 * @param id              task id from the "id" property; it is passed on to getOrMakeId() by the delegated constructor
 * @param taskResource    task resource from the "resource" property
 * @param ingestionSchema ingestion spec from the "spec" property; it is dereferenced here, so it must not be null
 * @param context         task context from the "context" property
 */
@JsonCreator
public IndexTask(
    @JsonProperty("id") final String id,
    @JsonProperty("resource") final TaskResource taskResource,
    @JsonProperty("spec") final IndexIngestionSpec ingestionSchema,
    // Type arguments restored; the raw Map was an extraction artifact.
    // NOTE(review): assumes the superclass context is Map<String, Object> — confirm against AbstractTask.
    @JsonProperty("context") final Map<String, Object> context
)
{
  this(
      id,
      makeGroupId(ingestionSchema),
      taskResource,
      ingestionSchema.dataSchema.getDataSource(),
      ingestionSchema,
      context
  );
}
/**
 * Package-private constructor that takes the group id and dataSource explicitly instead of deriving them from
 * the ingestion spec.
 *
 * @param id              task id; resolved through getOrMakeId(id, TYPE, dataSource) before reaching the superclass
 * @param groupId         locking group id, passed to the superclass as given (may be null, see makeGroupId)
 * @param resource        task resource
 * @param dataSource      name of the dataSource this task writes to
 * @param ingestionSchema ingestion spec kept by this task
 * @param context         task context
 */
IndexTask(
    String id,
    String groupId,
    TaskResource resource,
    String dataSource,
    IndexIngestionSpec ingestionSchema,
    // Type arguments restored; the raw Map was an extraction artifact.
    // NOTE(review): assumes the superclass context is Map<String, Object> — confirm against AbstractTask.
    Map<String, Object> context
)
{
  super(
      getOrMakeId(id, TYPE, dataSource),
      groupId,
      resource,
      dataSource,
      context
  );
  this.ingestionSchema = ingestionSchema;
}
/**
 * Returns the priority of this task: the context value stored under {@code Tasks.PRIORITY_KEY}, with
 * {@code Tasks.DEFAULT_BATCH_INDEX_TASK_PRIORITY} supplied as the default.
 */
@Override
public int getPriority()
{
  final int priority = getContextValue(Tasks.PRIORITY_KEY, Tasks.DEFAULT_BATCH_INDEX_TASK_PRIORITY);
  return priority;
}
/**
 * Returns the type name of this task, which is the constant {@code TYPE} ("index").
 */
@Override
public String getType()
{
return TYPE;
}
@Override
public boolean isReady(TaskActionClient taskActionClient) throws Exception
{
final Optional> intervals = ingestionSchema.getDataSchema()
.getGranularitySpec()
.bucketIntervals();
if (intervals.isPresent()) {
return isReady(taskActionClient, intervals.get());
} else {
return true;
}
}
/**
 * Checks readiness for the given intervals. If the task currently holds no locks, it tries to acquire exclusive
 * locks for the intervals; any exception from that attempt is treated as "not ready" rather than as a failure.
 * If the task already holds at least one lock, it is reported ready without a further attempt.
 *
 * @param actionClient client used to list and acquire task locks
 * @param intervals    intervals for which exclusive locks are requested
 *
 * @return false if acquiring the locks threw an exception, true otherwise
 *
 * @throws IOException declared for the lock lookup via getTaskLocks
 */
static boolean isReady(TaskActionClient actionClient, SortedSet<Interval> intervals) throws IOException
{
  final List<TaskLock> locks = getTaskLocks(actionClient);
  if (locks.isEmpty()) {
    try {
      Tasks.tryAcquireExclusiveLocks(actionClient, intervals);
    }
    catch (Exception e) {
      // Deliberately best-effort: a failed attempt only means "not ready yet". The cause is logged at debug
      // level so that a task stuck in the pending state can still be diagnosed.
      log.debug(e, "Could not acquire exclusive locks for intervals %s; task is not ready", intervals);
      return false;
    }
  }
  return true;
}
/**
 * Returns the ingestion spec of this task; serialized by Jackson as the "spec" property.
 */
@JsonProperty("spec")
public IndexIngestionSpec getIngestionSchema()
{
return ingestionSchema;
}
/**
 * Executes the task: determines shardSpecs (and intervals, if the granularitySpec does not declare them),
 * resolves the segment version for each locked interval, then generates and publishes the segments.
 *
 * @param toolbox task toolbox providing the object mapper, the task action client and the firehose temp dir
 *
 * @return success status if generateAndPublishSegments returned true, failure status otherwise
 */
@Override
public TaskStatus run(final TaskToolbox toolbox) throws Exception
{
  final boolean determineIntervals = !ingestionSchema.getDataSchema()
                                                     .getGranularitySpec()
                                                     .bucketIntervals()
                                                     .isPresent();

  final FirehoseFactory firehoseFactory = ingestionSchema.getIOConfig().getFirehoseFactory();

  if (firehoseFactory instanceof IngestSegmentFirehoseFactory) {
    // pass toolbox to Firehose
    ((IngestSegmentFirehoseFactory) firehoseFactory).setTaskToolbox(toolbox);
  }

  final File firehoseTempDir = toolbox.getFirehoseTemporaryDir();
  // Firehose temporary directory is automatically removed when this IndexTask completes.
  FileUtils.forceMkdir(firehoseTempDir);

  final ShardSpecs shardSpecs = determineShardSpecs(toolbox, firehoseFactory, firehoseTempDir);

  final DataSchema dataSchema;
  // Interval -> segment version. Type arguments restored: with raw Map/SortedSet the stream lambdas below
  // cannot be typed.
  final Map<Interval, String> versions;
  if (determineIntervals) {
    // Intervals were discovered from the input, so the locks can only be acquired now.
    final SortedSet<Interval> intervals = new TreeSet<>(Comparators.intervalsByStartThenEnd());
    intervals.addAll(shardSpecs.getIntervals());
    final Map<Interval, TaskLock> locks = Tasks.tryAcquireExclusiveLocks(toolbox.getTaskActionClient(), intervals);
    versions = locks.entrySet().stream()
                    .collect(Collectors.toMap(Entry::getKey, entry -> entry.getValue().getVersion()));

    // Record the discovered intervals in the schema that is used for segment generation.
    dataSchema = ingestionSchema.getDataSchema().withGranularitySpec(
        ingestionSchema.getDataSchema()
                       .getGranularitySpec()
                       .withIntervals(
                           JodaUtils.condenseIntervals(
                               shardSpecs.getIntervals()
                           )
                       )
    );
  } else {
    // Intervals were declared in the spec; reuse the versions of the locks this task already holds.
    versions = getTaskLocks(toolbox.getTaskActionClient())
        .stream()
        .collect(Collectors.toMap(TaskLock::getInterval, TaskLock::getVersion));
    dataSchema = ingestionSchema.getDataSchema();
  }

  if (generateAndPublishSegments(toolbox, dataSchema, shardSpecs, versions, firehoseFactory, firehoseTempDir)) {
    return TaskStatus.success(getId());
  } else {
    return TaskStatus.failure(getId());
  }
}
/**
 * Looks up the version of the first entry in {@code versions} whose key interval contains {@code interval}.
 *
 * @param versions map of locked interval to version
 * @param interval interval to find a version for
 *
 * @return the version of the first matching entry, in the map's iteration order
 *
 * @throws ISE if no key interval contains {@code interval}
 */
private static String findVersion(Map<Interval, String> versions, Interval interval)
{
  // Type arguments restored: with a raw Map, entry.getKey() is an Object and contains(...) cannot be resolved.
  return versions.entrySet().stream()
                 .filter(entry -> entry.getKey().contains(interval))
                 .map(Entry::getValue)
                 .findFirst()
                 .orElseThrow(() -> new ISE("Cannot find a version for interval[%s]", interval));
}
/**
 * Tells whether perfect rollup is requested by the tuning config, after validating that the request is not
 * combined with extendable shardSpecs or with appending to existing data.
 *
 * @param ioConfig     IO config supplying the appendToExisting flag
 * @param tuningConfig tuning config supplying the forceGuaranteedRollup and forceExtendableShardSpecs flags
 *
 * @return the value of {@code tuningConfig.isForceGuaranteedRollup()}
 *
 * @throws IllegalStateException if guaranteed rollup is combined with either conflicting flag
 */
private static boolean isGuaranteedRollup(IndexIOConfig ioConfig, IndexTuningConfig tuningConfig)
{
  final boolean forceGuaranteedRollup = tuningConfig.isForceGuaranteedRollup();
  // De Morgan form of !(rollup && (extendable || append)); the conflicting flags are only read when rollup is on.
  final boolean consistent = !forceGuaranteedRollup
                             || !(tuningConfig.isForceExtendableShardSpecs() || ioConfig.isAppendToExisting());
  Preconditions.checkState(consistent, "Perfect rollup cannot be guaranteed with extendable shardSpecs");
  return forceGuaranteedRollup;
}
/**
 * Tells whether extendable shardSpecs are to be used, which is exactly when rollup is not guaranteed.
 * The configuration check of {@link #isGuaranteedRollup} applies here as well.
 */
private static boolean isExtendableShardSpecs(IndexIOConfig ioConfig, IndexTuningConfig tuningConfig)
{
  final boolean guaranteedRollup = isGuaranteedRollup(ioConfig, tuningConfig);
  return !guaranteedRollup;
}
/**
 * Works out the intervals and shardSpecs for the input data.
 *
 * Two things may have to be discovered by scanning the input:
 * <ul>
 * <li>the intervals, when the {@link GranularitySpec} does not declare bucket intervals;</li>
 * <li>the number of partitions, when perfect rollup must be guaranteed but numShards is not set in the
 * {@link IndexTuningConfig}.</li>
 * </ul>
 *
 * When neither is needed, the shardSpecs are built for the declared intervals without reading any input.
 * Otherwise the input is read through the firehose to determine the missing information.
 *
 * @param toolbox         toolbox supplying the JSON object mapper
 * @param firehoseFactory factory of the firehose used to read the input when a scan is required
 * @param firehoseTempDir temporary directory handed to the firehose
 *
 * @return generated {@link ShardSpecs} representing a map of intervals and corresponding shard specs
 */
private ShardSpecs determineShardSpecs(
    final TaskToolbox toolbox,
    final FirehoseFactory firehoseFactory,
    final File firehoseTempDir
) throws IOException
{
  final ObjectMapper mapper = toolbox.getObjectMapper();
  final IndexTuningConfig tuning = ingestionSchema.getTuningConfig();
  final IndexIOConfig io = ingestionSchema.getIOConfig();
  final GranularitySpec granularity = ingestionSchema.getDataSchema().getGranularitySpec();

  // Intervals must be known before any data is processed, because all locks are acquired up front.
  final boolean determineIntervals = !granularity.bucketIntervals().isPresent();

  // The partition count must be computed when rollup is guaranteed and no explicit numShards was given.
  final boolean determineNumPartitions = tuning.getNumShards() == null
                                         && isGuaranteedRollup(io, tuning);

  if (determineIntervals || determineNumPartitions) {
    // Scan the input to find the intervals that contain data and to prime the HLL collectors.
    return createShardSpecsFromInput(
        mapper,
        ingestionSchema,
        firehoseFactory,
        firehoseTempDir,
        granularity,
        tuning,
        determineIntervals,
        determineNumPartitions
    );
  }

  // Both the intervals and the shard count are already known, so the data does not have to be scanned.
  log.info("Skipping determine partition scan");
  return createShardSpecWithoutInputScan(mapper, granularity, io, tuning);
}
private static ShardSpecs createShardSpecWithoutInputScan(
ObjectMapper jsonMapper,
GranularitySpec granularitySpec,
IndexIOConfig ioConfig,
IndexTuningConfig tuningConfig
)
{
final Map> shardSpecs = new HashMap<>();
final SortedSet intervals = granularitySpec.bucketIntervals().get();
if (isGuaranteedRollup(ioConfig, tuningConfig)) {
// Overwrite mode, guaranteed rollup: shardSpecs must be known in advance.
final int numShards = tuningConfig.getNumShards() == null ? 1 : tuningConfig.getNumShards();
final BiFunction shardSpecCreateFn = getShardSpecCreateFunction(
numShards,
jsonMapper
);
for (Interval interval : intervals) {
final List intervalShardSpecs = IntStream.range(0, numShards)
.mapToObj(
shardId -> shardSpecCreateFn.apply(shardId, numShards)
)
.collect(Collectors.toList());
shardSpecs.put(interval, intervalShardSpecs);
}
} else {
for (Interval interval : intervals) {
shardSpecs.put(interval, ImmutableList.of());
}
}
return new ShardSpecs(shardSpecs);
}
private static ShardSpecs createShardSpecsFromInput(
ObjectMapper jsonMapper,
IndexIngestionSpec ingestionSchema,
FirehoseFactory firehoseFactory,
File firehoseTempDir,
GranularitySpec granularitySpec,
IndexTuningConfig tuningConfig,
boolean determineIntervals,
boolean determineNumPartitions
) throws IOException
{
log.info("Determining intervals and shardSpecs");
long determineShardSpecsStartMillis = System.currentTimeMillis();
final Map> hllCollectors = collectIntervalsAndShardSpecs(
jsonMapper,
ingestionSchema,
firehoseFactory,
firehoseTempDir,
granularitySpec,
determineIntervals,
determineNumPartitions
);
final Map> intervalToShardSpecs = new HashMap<>();
final int defaultNumShards = tuningConfig.getNumShards() == null ? 1 : tuningConfig.getNumShards();
for (final Map.Entry> entry : hllCollectors.entrySet()) {
final Interval interval = entry.getKey();
final HyperLogLogCollector collector = entry.getValue().orNull();
final int numShards;
if (determineNumPartitions) {
final long numRows = collector.estimateCardinalityRound();
numShards = (int) Math.ceil((double) numRows / tuningConfig.getTargetPartitionSize());
log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numShards);
} else {
numShards = defaultNumShards;
log.info("Creating [%,d] shards for interval [%s]", numShards, interval);
}
if (isGuaranteedRollup(ingestionSchema.getIOConfig(), ingestionSchema.getTuningConfig())) {
// Overwrite mode, guaranteed rollup: shardSpecs must be known in advance.
final BiFunction shardSpecCreateFn = getShardSpecCreateFunction(
numShards,
jsonMapper
);
final List intervalShardSpecs = IntStream.range(0, numShards)
.mapToObj(
shardId -> shardSpecCreateFn.apply(shardId, numShards)
).collect(Collectors.toList());
intervalToShardSpecs.put(interval, intervalShardSpecs);
} else {
intervalToShardSpecs.put(interval, ImmutableList.of());
}
}
log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
return new ShardSpecs(intervalToShardSpecs);
}
private static Map> collectIntervalsAndShardSpecs(
ObjectMapper jsonMapper,
IndexIngestionSpec ingestionSchema,
FirehoseFactory firehoseFactory,
File firehoseTempDir,
GranularitySpec granularitySpec,
boolean determineIntervals,
boolean determineNumPartitions
) throws IOException
{
final Map> hllCollectors = new TreeMap<>(
Comparators.intervalsByStartThenEnd()
);
int thrownAway = 0;
int unparseable = 0;
final Granularity queryGranularity = granularitySpec.getQueryGranularity();
try (
final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser(), firehoseTempDir)
) {
while (firehose.hasMore()) {
try {
final InputRow inputRow = firehose.nextRow();
// The null inputRow means the caller must skip this row.
if (inputRow == null) {
continue;
}
final Interval interval;
if (determineIntervals) {
interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
} else {
final Optional optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
if (!optInterval.isPresent()) {
thrownAway++;
continue;
}
interval = optInterval.get();
}
if (determineNumPartitions) {
if (!hllCollectors.containsKey(interval)) {
hllCollectors.put(interval, Optional.of(HyperLogLogCollector.makeLatestCollector()));
}
List