/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.indexing.common.task;
import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import io.druid.common.utils.JodaUtils;
import io.druid.data.input.Committer;
import io.druid.data.input.Firehose;
import io.druid.data.input.FirehoseFactory;
import io.druid.data.input.InputRow;
import io.druid.data.input.Rows;
import io.druid.guice.annotations.Smile;
import io.druid.hll.HyperLogLogCollector;
import io.druid.indexing.appenderator.ActionBasedSegmentAllocator;
import io.druid.indexing.appenderator.ActionBasedUsedSegmentChecker;
import io.druid.indexing.common.TaskLock;
import io.druid.indexing.common.TaskStatus;
import io.druid.indexing.common.TaskToolbox;
import io.druid.indexing.common.actions.LockAcquireAction;
import io.druid.indexing.common.actions.LockTryAcquireAction;
import io.druid.indexing.common.actions.SegmentTransactionalInsertAction;
import io.druid.indexing.common.actions.TaskActionClient;
import io.druid.java.util.common.ISE;
import io.druid.java.util.common.granularity.Granularity;
import io.druid.java.util.common.guava.Comparators;
import io.druid.java.util.common.logger.Logger;
import io.druid.java.util.common.parsers.ParseException;
import io.druid.query.DruidMetrics;
import io.druid.segment.IndexSpec;
import io.druid.segment.indexing.DataSchema;
import io.druid.segment.indexing.IOConfig;
import io.druid.segment.indexing.IngestionSpec;
import io.druid.segment.indexing.RealtimeIOConfig;
import io.druid.segment.indexing.TuningConfig;
import io.druid.segment.indexing.granularity.GranularitySpec;
import io.druid.segment.realtime.FireDepartment;
import io.druid.segment.realtime.FireDepartmentMetrics;
import io.druid.segment.realtime.RealtimeMetricsMonitor;
import io.druid.segment.realtime.appenderator.Appenderator;
import io.druid.segment.realtime.appenderator.AppenderatorConfig;
import io.druid.segment.realtime.appenderator.Appenderators;
import io.druid.segment.realtime.appenderator.FiniteAppenderatorDriver;
import io.druid.segment.realtime.appenderator.SegmentAllocator;
import io.druid.segment.realtime.appenderator.SegmentIdentifier;
import io.druid.segment.realtime.appenderator.SegmentsAndMetadata;
import io.druid.segment.realtime.appenderator.TransactionalSegmentPublisher;
import io.druid.segment.realtime.firehose.ReplayableFirehoseFactory;
import io.druid.segment.realtime.plumber.Committers;
import io.druid.segment.realtime.plumber.NoopSegmentHandoffNotifierFactory;
import io.druid.timeline.DataSegment;
import io.druid.timeline.partition.HashBasedNumberedShardSpec;
import io.druid.timeline.partition.NoneShardSpec;
import io.druid.timeline.partition.NumberedShardSpec;
import io.druid.timeline.partition.ShardSpec;
import io.druid.timeline.partition.ShardSpecLookup;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.joda.time.Period;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
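/**
 * Task that reads rows from a firehose and indexes them into segments, optionally scanning the data first to
 * determine the intervals and the number of shards per interval (hash-based partitioning).
 */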
public class IndexTask extends AbstractTask
{
private static final Logger log = new Logger(IndexTask.class);
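// hash function used to feed dimension group keys into the HyperLogLog collectors for cardinality estimation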
private static final HashFunction hashFunction = Hashing.murmur3_128();
private static String makeId(String id, IndexIngestionSpec ingestionSchema)
{
return id != null ? id : String.format("index_%s_%s", makeDataSource(ingestionSchema), new DateTime());
}
private static String makeDataSource(IndexIngestionSpec ingestionSchema)
{
return ingestionSchema.getDataSchema().getDataSource();
}
@JsonIgnore
private final IndexIngestionSpec ingestionSchema;
private final ObjectMapper smileMapper;
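/**
 * Jackson-injected constructor; the Smile ObjectMapper is used when wrapping the firehose in a
 * ReplayableFirehoseFactory.
 */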
@JsonCreator
public IndexTask(
@JsonProperty("id") final String id,
@JsonProperty("resource") final TaskResource taskResource,
@JsonProperty("spec") final IndexIngestionSpec ingestionSchema,
@JsonProperty("context") final Map context,
@Smile @JacksonInject final ObjectMapper smileMapper
)
{
super(makeId(id, ingestionSchema), null, taskResource, makeDataSource(ingestionSchema), context);
this.ingestionSchema = ingestionSchema;
this.smileMapper = smileMapper;
}
@Override
public String getType()
{
return "index";
}
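/**
 * The task is ready if no intervals are specified, or if a lock covering the umbrella of the specified intervals
 * can be acquired.
 */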
@Override
public boolean isReady(TaskActionClient taskActionClient) throws Exception
{
Optional<SortedSet<Interval>> intervals = ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals();
if (intervals.isPresent()) {
Interval interval = JodaUtils.umbrellaInterval(intervals.get());
return taskActionClient.submit(new LockTryAcquireAction(interval)) != null;
} else {
return true;
}
}
@JsonProperty("spec")
public IndexIngestionSpec getIngestionSchema()
{
return ingestionSchema;
}
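/**
 * Runs the indexing: wraps the firehose so it can be replayed, determines the shard specs (scanning the data if
 * intervals or numShards were not provided), acquires a lock covering the data intervals, and then builds and
 * publishes the segments.
 */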
@Override
public TaskStatus run(final TaskToolbox toolbox) throws Exception
{
final boolean determineIntervals = !ingestionSchema.getDataSchema()
.getGranularitySpec()
.bucketIntervals()
.isPresent();
final FirehoseFactory delegateFirehoseFactory = ingestionSchema.getIOConfig().getFirehoseFactory();
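// Wrap the delegate firehose in a ReplayableFirehoseFactory unless caching is explicitly skipped or the factory
// is already replayable; the firehose may need to be read more than once (shard spec determination and ingestion).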
final FirehoseFactory firehoseFactory;
if (ingestionSchema.getIOConfig().isSkipFirehoseCaching()
|| delegateFirehoseFactory instanceof ReplayableFirehoseFactory) {
firehoseFactory = delegateFirehoseFactory;
} else {
firehoseFactory = new ReplayableFirehoseFactory(
delegateFirehoseFactory,
ingestionSchema.getTuningConfig().isReportParseExceptions(),
null,
null,
smileMapper
);
}
final Map<Interval, List<ShardSpec>> shardSpecs = determineShardSpecs(toolbox, firehoseFactory);
final String version;
final DataSchema dataSchema;
if (determineIntervals) {
Interval interval = JodaUtils.umbrellaInterval(shardSpecs.keySet());
TaskLock lock = toolbox.getTaskActionClient().submit(new LockAcquireAction(interval));
version = lock.getVersion();
dataSchema = ingestionSchema.getDataSchema().withGranularitySpec(
ingestionSchema.getDataSchema()
.getGranularitySpec()
.withIntervals(
JodaUtils.condenseIntervals(
shardSpecs.keySet()
)
)
);
} else {
version = Iterables.getOnlyElement(getTaskLocks(toolbox)).getVersion();
dataSchema = ingestionSchema.getDataSchema();
}
if (generateAndPublishSegments(toolbox, dataSchema, shardSpecs, version, firehoseFactory)) {
return TaskStatus.success(getId());
} else {
return TaskStatus.failure(getId());
}
}
/**
* Determines the number of shards for each interval using a hash of queryGranularity timestamp + all dimensions (i.e.,
* hash-based partitioning). In the future we may want to also support single-dimension partitioning.
*/
private Map<Interval, List<ShardSpec>> determineShardSpecs(
final TaskToolbox toolbox,
final FirehoseFactory firehoseFactory
) throws IOException
{
final ObjectMapper jsonMapper = toolbox.getObjectMapper();
final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
final Granularity queryGranularity = granularitySpec.getQueryGranularity();
final boolean determineNumPartitions = ingestionSchema.getTuningConfig().getNumShards() == null;
final boolean determineIntervals = !ingestionSchema.getDataSchema()
.getGranularitySpec()
.bucketIntervals()
.isPresent();
final Map<Interval, List<ShardSpec>> shardSpecs = Maps.newHashMap();
// if we were given the number of shards per interval and the intervals, we don't need to scan the data
if (!determineNumPartitions && !determineIntervals) {
log.info("numShards and intervals provided, skipping determine partition scan");
final SortedSet<Interval> intervals = ingestionSchema.getDataSchema()
.getGranularitySpec()
.bucketIntervals()
.get();
final int numShards = ingestionSchema.getTuningConfig().getNumShards();
for (Interval interval : intervals) {
final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
if (numShards > 1) {
for (int i = 0; i < numShards; i++) {
intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
}
} else {
intervalShardSpecs.add(NoneShardSpec.instance());
}
shardSpecs.put(interval, intervalShardSpecs);
}
return shardSpecs;
}
// determine intervals containing data and prime HLL collectors
final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = Maps.newHashMap();
int thrownAway = 0;
log.info("Determining intervals and shardSpecs");
long determineShardSpecsStartMillis = System.currentTimeMillis();
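// Scan the firehose once, bucketing each row's timestamp into an interval and (if numShards was not provided)
// adding its group key to a per-interval HyperLogLog collector to estimate cardinality.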
try (final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
while (firehose.hasMore()) {
final InputRow inputRow = firehose.nextRow();
final Interval interval;
if (determineIntervals) {
interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
} else {
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
if (!optInterval.isPresent()) {
thrownAway++;
continue;
}
interval = optInterval.get();
}
if (!determineNumPartitions) {
// we don't need to determine partitions but we still need to determine intervals, so add an Optional.absent()
// for the interval and don't instantiate an HLL collector
if (!hllCollectors.containsKey(interval)) {
hllCollectors.put(interval, Optional.<HyperLogLogCollector>absent());
}
continue;
}
if (!hllCollectors.containsKey(interval)) {
hllCollectors.put(interval, Optional.of(HyperLogLogCollector.makeLatestCollector()));
}
List