/*
 * Licensed to Metamarkets Group Inc. (Metamarkets) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Metamarkets licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package io.druid.indexing.common.task;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.metamx.common.ISE;
import com.metamx.common.guava.Comparators;
import com.metamx.common.logger.Logger;
import io.druid.data.input.Committer;
import io.druid.data.input.Firehose;
import io.druid.data.input.FirehoseFactory;
import io.druid.data.input.InputRow;
import io.druid.data.input.Rows;
import io.druid.granularity.QueryGranularity;
import io.druid.indexing.common.TaskLock;
import io.druid.indexing.common.TaskStatus;
import io.druid.indexing.common.TaskToolbox;
import io.druid.indexing.common.index.YeOldePlumberSchool;
import io.druid.query.aggregation.hyperloglog.HyperLogLogCollector;
import io.druid.segment.IndexSpec;
import io.druid.segment.indexing.DataSchema;
import io.druid.segment.indexing.IOConfig;
import io.druid.segment.indexing.IngestionSpec;
import io.druid.segment.indexing.RealtimeTuningConfig;
import io.druid.segment.indexing.TuningConfig;
import io.druid.segment.indexing.granularity.GranularitySpec;
import io.druid.segment.loading.DataSegmentPusher;
import io.druid.segment.realtime.FireDepartmentMetrics;
import io.druid.segment.realtime.plumber.Committers;
import io.druid.segment.realtime.plumber.Plumber;
import io.druid.timeline.DataSegment;
import io.druid.timeline.partition.HashBasedNumberedShardSpec;
import io.druid.timeline.partition.NoneShardSpec;
import io.druid.timeline.partition.ShardSpec;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.concurrent.CopyOnWriteArrayList;
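
/**
 * Batch ingestion task that reads every row for its interval from a firehose, assigns rows to
 * bucket intervals and shards according to the granularity spec and tuning config, builds one
 * segment per shard, and publishes the resulting segments through the task toolbox.
 */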
public class IndexTask extends AbstractFixedIntervalTask
{
  private static final Logger log = new Logger(IndexTask.class);

  private static HashFunction hashFunction = Hashing.murmur3_128();

  /**
   * Should we index this inputRow? Decision is based on our interval and shardSpec.
   *
   * @param shardSpec  the shard spec for the segment being built
   * @param interval   the interval this task is indexing
   * @param inputRow   the row to check
   * @param rollupGran the query granularity used to truncate the row timestamp before the shard check
   *
   * @return true if the row's timestamp falls within the interval and the row belongs to the shard
   */
  private static boolean shouldIndex(
      final ShardSpec shardSpec,
      final Interval interval,
      final InputRow inputRow,
      final QueryGranularity rollupGran
  )
  {
    return interval.contains(inputRow.getTimestampFromEpoch())
           && shardSpec.isInChunk(rollupGran.truncate(inputRow.getTimestampFromEpoch()), inputRow);
  }

  private static String makeId(String id, IndexIngestionSpec ingestionSchema)
  {
    if (id == null) {
      return String.format("index_%s_%s", makeDataSource(ingestionSchema), new DateTime().toString());
    }

    return id;
  }

  private static String makeDataSource(IndexIngestionSpec ingestionSchema)
  {
    return ingestionSchema.getDataSchema().getDataSource();
  }
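
  // Computes the task's overall interval from the granularity spec: it spans from the start of the
  // first bucket interval to the end of the last.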
  private static Interval makeInterval(IndexIngestionSpec ingestionSchema)
  {
    GranularitySpec spec = ingestionSchema.getDataSchema().getGranularitySpec();

    return new Interval(
        spec.bucketIntervals().get().first().getStart(),
        spec.bucketIntervals().get().last().getEnd()
    );
  }
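
  // Builds a RealtimeTuningConfig that carries over only the shard spec, row flush boundary, index spec,
  // and buildV9Directly flag; the remaining constructor arguments are left as nulls and defaults.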
  static RealtimeTuningConfig convertTuningConfig(
      ShardSpec shardSpec,
      int rowFlushBoundary,
      IndexSpec indexSpec,
      boolean buildV9Directly
  )
  {
    return new RealtimeTuningConfig(
        rowFlushBoundary,
        null,
        null,
        null,
        null,
        null,
        null,
        shardSpec,
        indexSpec,
        buildV9Directly,
        0,
        0,
        true,
        null
    );
  }

  @JsonIgnore
  private final IndexIngestionSpec ingestionSchema;

  private final ObjectMapper jsonMapper;

  @JsonCreator
  public IndexTask(
      @JsonProperty("id") String id,
      @JsonProperty("resource") TaskResource taskResource,
      @JsonProperty("spec") IndexIngestionSpec ingestionSchema,
      @JacksonInject ObjectMapper jsonMapper,
      @JsonProperty("context") Map<String, Object> context
  )
  {
    super(
        // _not_ the version, just something uniqueish
        makeId(id, ingestionSchema),
        taskResource,
        makeDataSource(ingestionSchema),
        makeInterval(ingestionSchema),
        context
    );

    this.ingestionSchema = ingestionSchema;
    this.jsonMapper = jsonMapper;
  }

  @Override
  public String getType()
  {
    return "index";
  }

  @JsonProperty("spec")
  public IndexIngestionSpec getIngestionSchema()
  {
    return ingestionSchema;
  }
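
  // Overall flow: intersect the configured bucket intervals with the intervals that actually contain data,
  // work out the shard specs for each bucket (hash-partitioned by target size, by an explicit numShards,
  // or a single NoneShardSpec), generate one segment per shard, and finally publish all segments.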
  @Override
  public TaskStatus run(TaskToolbox toolbox) throws Exception
  {
    final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
    final int targetPartitionSize = ingestionSchema.getTuningConfig().getTargetPartitionSize();

    final TaskLock myLock = Iterables.getOnlyElement(getTaskLocks(toolbox));
    final Set<DataSegment> segments = Sets.newHashSet();

    final Set<Interval> validIntervals = Sets.intersection(granularitySpec.bucketIntervals().get(), getDataIntervals());
    if (validIntervals.isEmpty()) {
      throw new ISE("No valid data intervals found. Check your configs!");
    }

    for (final Interval bucket : validIntervals) {
      final List<ShardSpec> shardSpecs;
      if (targetPartitionSize > 0) {
        shardSpecs = determinePartitions(bucket, targetPartitionSize, granularitySpec.getQueryGranularity());
      } else {
        int numShards = ingestionSchema.getTuningConfig().getNumShards();
        if (numShards > 0) {
          shardSpecs = Lists.newArrayList();
          for (int i = 0; i < numShards; i++) {
            shardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
          }
        } else {
          shardSpecs = ImmutableList.<ShardSpec>of(new NoneShardSpec());
        }
      }
      for (final ShardSpec shardSpec : shardSpecs) {
        final DataSegment segment = generateSegment(
            toolbox,
            ingestionSchema.getDataSchema(),
            shardSpec,
            bucket,
            myLock.getVersion()
        );
        segments.add(segment);
      }
    }
    toolbox.publishSegments(segments);
    return TaskStatus.success(getId());
  }
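
  // Makes a single pass over the firehose to find which granularity buckets contain at least one row;
  // rows that do not fall into any configured bucket are counted and logged as a warning.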
  private SortedSet<Interval> getDataIntervals() throws IOException
  {
    final FirehoseFactory firehoseFactory = ingestionSchema.getIOConfig().getFirehoseFactory();
    final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();

    SortedSet<Interval> retVal = Sets.newTreeSet(Comparators.intervalsByStartThenEnd());
    int unparsed = 0;
    try (Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
      while (firehose.hasMore()) {
        final InputRow inputRow = firehose.nextRow();
        DateTime dt = new DateTime(inputRow.getTimestampFromEpoch());
        Optional<Interval> interval = granularitySpec.bucketInterval(dt);
        if (interval.isPresent()) {
          retVal.add(interval.get());
        } else {
          unparsed++;
        }
      }
    }
    if (unparsed > 0) {
      log.warn("Unable to find a matching interval for [%,d] events", unparsed);
    }

    return retVal;
  }
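
  // Scans the rows for the given interval and uses a HyperLogLog cardinality estimate to decide
  // how the interval should be partitioned relative to targetPartitionSize.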
  private List<ShardSpec> determinePartitions(
      final Interval interval,
      final int targetPartitionSize,
      final QueryGranularity queryGranularity
  ) throws IOException
  {
    log.info("Determining partitions for interval[%s] with targetPartitionSize[%d]", interval, targetPartitionSize);

    final FirehoseFactory firehoseFactory = ingestionSchema.getIOConfig().getFirehoseFactory();

    // The implementation of this determine partitions stuff is less than optimal. Should be done better.
    // Use HLL to estimate number of rows
    HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();

    // Load data
    try (Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
      while (firehose.hasMore()) {
        final InputRow inputRow = firehose.nextRow();
        if (interval.contains(inputRow.getTimestampFromEpoch())) {
          final List