Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
io.druid.indexer.HadoopDruidIndexerConfig Maven / Gradle / Ivy
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.indexer;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.inject.Binder;
import com.google.inject.Injector;
import com.google.inject.Key;
import com.google.inject.Module;
import io.druid.data.input.InputRow;
import io.druid.data.input.impl.InputRowParser;
import io.druid.guice.GuiceInjectors;
import io.druid.guice.JsonConfigProvider;
import io.druid.guice.annotations.Self;
import io.druid.indexer.partitions.PartitionsSpec;
import io.druid.indexer.path.PathSpec;
import io.druid.initialization.Initialization;
import io.druid.java.util.common.DateTimes;
import io.druid.java.util.common.JodaUtils;
import io.druid.java.util.common.StringUtils;
import io.druid.java.util.common.granularity.Granularity;
import io.druid.java.util.common.guava.FunctionalIterable;
import io.druid.java.util.common.jackson.JacksonUtils;
import io.druid.java.util.common.logger.Logger;
import io.druid.segment.IndexIO;
import io.druid.segment.IndexMerger;
import io.druid.segment.IndexMergerV9;
import io.druid.segment.IndexSpec;
import io.druid.segment.indexing.granularity.GranularitySpec;
import io.druid.segment.loading.DataSegmentPusher;
import io.druid.server.DruidNode;
import io.druid.timeline.DataSegment;
import io.druid.timeline.partition.ShardSpec;
import io.druid.timeline.partition.ShardSpecLookup;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.joda.time.format.ISODateTimeFormat;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
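/**
 * Configuration for a Hadoop-based Druid indexing job, wrapping a {@link HadoopIngestionSpec}.
 * <p>
 * Instances can be created from a spec, a map, a local file, a JSON string, a path on a distributed file system or a
 * Hadoop {@link Configuration}. The class also offers helpers for resolving the {@link Bucket} of an input row and
 * for building the intermediate paths used by the job.
 */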
public class HadoopDruidIndexerConfig
{
// Logger for this class.
private static final Logger log = new Logger(HadoopDruidIndexerConfig.class);
// Guice injector created once in the static initializer below; it supplies the shared instances declared here.
private static final Injector injector;
// Hadoop Configuration key under which the JSON form of this config is stored
// (written by intoConfiguration, read back by fromConfiguration).
public static final String CONFIG_PROPERTY = "druid.indexer.config";
// Charset looked up under the name "Unicode".
// NOTE(review): presumably a UTF-16 variant on the JVM in use -- confirm before relying on byte layout.
public static final Charset JAVA_NATIVE_CHARSET = Charset.forName("Unicode");
// Splitter/joiner pair for tab-separated values.
public static final Splitter TAB_SPLITTER = Splitter.on("\t");
public static final Joiner TAB_JOINER = Joiner.on("\t");
// The following instances are obtained from the injector in the static initializer.
public static final ObjectMapper JSON_MAPPER;
public static final IndexIO INDEX_IO;
public static final IndexMerger INDEX_MERGER_V9;
public static final HadoopKerberosConfig HADOOP_KERBEROS_CONFIG;
public static final DataSegmentPusher DATA_SEGMENT_PUSHER;
// Returned by getWorkingPath() when the tuning config does not specify a working path.
private static final String DEFAULT_WORKING_PATH = "/tmp/druid-indexing";
static {
  // Build the injector once per JVM: startup injector plus a module describing this process and the
  // Hadoop indexing module.
  final Injector startupInjector = GuiceInjectors.makeStartupInjector();
  final Module hadoopIndexerModule = new Module()
  {
    @Override
    public void configure(Binder binder)
    {
      // Bind the "self" node identity used by this indexer process.
      final Key<DruidNode> selfNodeKey = Key.get(DruidNode.class, Self.class);
      final DruidNode selfNode = new DruidNode("hadoop-indexer", null, null, null, true, false);
      JsonConfigProvider.bindInstance(binder, selfNodeKey, selfNode);
      // Kerberos settings are read from properties under this prefix.
      JsonConfigProvider.bind(binder, "druid.hadoop.security.kerberos", HadoopKerberosConfig.class);
    }
  };
  injector = Initialization.makeInjectorWithModules(
      startupInjector,
      ImmutableList.of(hadoopIndexerModule, new IndexingHadoopModule())
  );

  // Resolve the shared instances exposed as public constants.
  JSON_MAPPER = injector.getInstance(ObjectMapper.class);
  INDEX_IO = injector.getInstance(IndexIO.class);
  INDEX_MERGER_V9 = injector.getInstance(IndexMergerV9.class);
  HADOOP_KERBEROS_CONFIG = injector.getInstance(HadoopKerberosConfig.class);
  DATA_SEGMENT_PUSHER = injector.getInstance(DataSegmentPusher.class);
}
/**
 * Counter identifiers for the index job.
 * NOTE(review): the counters are not used inside this class; their exact semantics are defined by the callers.
 */
public enum IndexJobCounters
{
// Presumably counts input rows rejected as invalid -- confirm against the code that increments it.
INVALID_ROW_COUNTER
}
/**
 * Creates a config that wraps the given ingestion spec.
 *
 * @param spec the ingestion spec to wrap
 *
 * @return a new config built by the constructor of this class
 */
public static HadoopDruidIndexerConfig fromSpec(HadoopIngestionSpec spec)
{
  final HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(spec);
  return config;
}
/**
 * Creates a config from a JSON-like map.
 * <p>
 * When the map has a top-level {@code "spec"} key it is converted directly into a
 * {@link HadoopDruidIndexerConfig}; otherwise the whole map is converted into a {@link HadoopIngestionSpec}
 * which is then wrapped.
 *
 * @param argSpec map form of either a serialized config or a bare ingestion spec
 *
 * @return the resulting config
 */
public static HadoopDruidIndexerConfig fromMap(Map<String, Object> argSpec)
{
  // Eventually PathSpec needs to get rid of its Hadoop dependency, then maybe this can be ingested directly without
  // the Map<> intermediary
  if (argSpec.containsKey("spec")) {
    return HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(
        argSpec,
        HadoopDruidIndexerConfig.class
    );
  }
  return new HadoopDruidIndexerConfig(
      HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(
          argSpec,
          HadoopIngestionSpec.class
      )
  );
}
/**
 * Reads a JSON map from a local file and builds a config from it via {@link #fromMap(Map)}.
 *
 * @param file file containing the JSON document
 *
 * @return the resulting config
 */
@SuppressWarnings("unchecked")
public static HadoopDruidIndexerConfig fromFile(File file)
{
  try {
    final Map<String, Object> parsedSpec = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(
        file,
        JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT
    );
    return fromMap(parsedSpec);
  }
  catch (IOException e) {
    // Read/parse failures are rethrown unchecked.
    throw Throwables.propagate(e);
  }
}
/**
 * Parses a JSON string into a map and builds a config from it via {@link #fromMap(Map)}.
 *
 * @param str JSON document
 *
 * @return the resulting config
 */
@SuppressWarnings("unchecked")
public static HadoopDruidIndexerConfig fromString(String str)
{
  // This is a map to try and prevent dependency screwbally-ness
  try {
    final Map<String, Object> parsedSpec = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(
        str,
        JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT
    );
    return fromMap(parsedSpec);
  }
  catch (IOException e) {
    // Parse failures are rethrown unchecked.
    throw Throwables.propagate(e);
  }
}
/**
 * Reads a UTF-8 JSON document from a Hadoop file system path and builds a config from it via
 * {@link #fromMap(Map)}.
 * <p>
 * The file system is resolved from the path using a fresh {@link Configuration}. The reader over the opened
 * stream is always closed, including when parsing or conversion fails.
 *
 * @param path location of the JSON document
 *
 * @return the resulting config
 */
@SuppressWarnings("unchecked")
public static HadoopDruidIndexerConfig fromDistributedFileSystem(String path)
{
  try {
    final Path pt = new Path(path);
    final FileSystem fs = pt.getFileSystem(new Configuration());
    // try-with-resources guarantees the underlying stream is released on every exit path.
    try (Reader reader = new InputStreamReader(fs.open(pt), StandardCharsets.UTF_8)) {
      return fromMap(
          HadoopDruidIndexerConfig.JSON_MAPPER.readValue(
              reader, JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT
          )
      );
    }
  }
  catch (Exception e) {
    throw Throwables.propagate(e);
  }
}
/**
 * Restores a config from the JSON stored in a Hadoop {@link Configuration} under {@link #CONFIG_PROPERTY} and
 * runs {@link #verify()} on it before returning.
 *
 * @param conf Hadoop configuration holding the serialized config
 *
 * @return the verified config
 */
public static HadoopDruidIndexerConfig fromConfiguration(Configuration conf)
{
  final String serializedConfig = conf.get(HadoopDruidIndexerConfig.CONFIG_PROPERTY);
  final HadoopDruidIndexerConfig config = fromString(serializedConfig);
  config.verify();
  return config;
}
private HadoopIngestionSpec schema;
private PathSpec pathSpec;
private final Map shardSpecLookups = Maps.newHashMap();
private final Map> hadoopShardSpecLookup = Maps.newHashMap();
private final Granularity rollupGran;
private final List allowedHadoopPrefix;
/**
 * Creates a config around the given ingestion spec.
 * <p>
 * Besides storing the spec, this converts the IO config's path spec into a {@link PathSpec}, builds the per-bucket
 * shard spec lookups from the tuning config, captures the query granularity and assembles the list of allowed
 * Hadoop property prefixes.
 *
 * @param spec the ingestion spec (JSON property {@code "spec"})
 */
@JsonCreator
public HadoopDruidIndexerConfig(
    final @JsonProperty("spec") HadoopIngestionSpec spec
)
{
  this.schema = spec;
  this.pathSpec = JSON_MAPPER.convertValue(spec.getIOConfig().getPathSpec(), PathSpec.class);
  for (Map.Entry<Long, List<HadoopyShardSpec>> entry : spec.getTuningConfig().getShardSpecs().entrySet()) {
    // Buckets without any shard specs get no lookup entries.
    if (entry.getValue() == null || entry.getValue().isEmpty()) {
      continue;
    }
    // The first shard spec of the bucket is used to create the lookup over all of the bucket's shard specs.
    final ShardSpec actualSpec = entry.getValue().get(0).getActualSpec();
    shardSpecLookups.put(
        entry.getKey(), actualSpec.getLookup(
            Lists.transform(
                entry.getValue(), new Function<HadoopyShardSpec, ShardSpec>()
                {
                  @Override
                  public ShardSpec apply(HadoopyShardSpec input)
                  {
                    return input.getActualSpec();
                  }
                }
            )
        )
    );

    // Reverse mapping from the actual shard spec back to its Hadoop wrapper.
    Map<ShardSpec, HadoopyShardSpec> innerHadoopShardSpecLookup = Maps.newHashMap();
    for (HadoopyShardSpec hadoopyShardSpec : entry.getValue()) {
      innerHadoopShardSpecLookup.put(hadoopyShardSpec.getActualSpec(), hadoopyShardSpec);
    }
    hadoopShardSpecLookup.put(entry.getKey(), innerHadoopShardSpecLookup);
  }
  this.rollupGran = spec.getDataSchema().getGranularitySpec().getQueryGranularity();

  // User-specified list plus our additional bonus list.
  this.allowedHadoopPrefix = new ArrayList<>();
  this.allowedHadoopPrefix.add("druid.storage");
  this.allowedHadoopPrefix.add("druid.javascript");
  this.allowedHadoopPrefix.addAll(DATA_SEGMENT_PUSHER.getAllowedPropertyPrefixesForHadoop());
  this.allowedHadoopPrefix.addAll(spec.getTuningConfig().getUserAllowedHadoopPrefix());
}
/**
 * @return the wrapped ingestion spec; serialized under the JSON property {@code "spec"}
 */
@JsonProperty(value = "spec")
public HadoopIngestionSpec getSchema()
{
  return this.schema;
}
/**
 * @return the {@link PathSpec} converted from the IO config; excluded from JSON serialization
 */
@JsonIgnore
public PathSpec getPathSpec()
{
  return this.pathSpec;
}
/**
 * @return the data source name from the spec's data schema
 */
public String getDataSource()
{
  final String dataSource = schema.getDataSchema().getDataSource();
  return dataSource;
}
/**
 * @return the granularity spec from the spec's data schema
 */
public GranularitySpec getGranularitySpec()
{
  final GranularitySpec granularitySpec = schema.getDataSchema().getGranularitySpec();
  return granularitySpec;
}
/**
 * Replaces the granularity spec by swapping in a copy of the ingestion spec that carries it, then re-derives
 * the {@link PathSpec} from the new spec's IO config.
 *
 * @param granularitySpec the granularity spec to use from now on
 */
public void setGranularitySpec(GranularitySpec granularitySpec)
{
  final HadoopIngestionSpec updatedSpec = schema.withDataSchema(
      schema.getDataSchema().withGranularitySpec(granularitySpec)
  );
  this.schema = updatedSpec;
  this.pathSpec = JSON_MAPPER.convertValue(updatedSpec.getIOConfig().getPathSpec(), PathSpec.class);
}
/**
 * @return the partitions spec from the tuning config
 */
public PartitionsSpec getPartitionsSpec()
{
  final PartitionsSpec partitionsSpec = schema.getTuningConfig().getPartitionsSpec();
  return partitionsSpec;
}
/**
 * @return the index spec from the tuning config
 */
public IndexSpec getIndexSpec()
{
  final IndexSpec indexSpec = schema.getTuningConfig().getIndexSpec();
  return indexSpec;
}
/**
 * @return the tuning config's overwriteFiles flag
 */
public boolean isOverwriteFiles()
{
  final boolean overwriteFiles = schema.getTuningConfig().isOverwriteFiles();
  return overwriteFiles;
}
/**
 * @return the tuning config's ignoreInvalidRows flag
 */
public boolean isIgnoreInvalidRows()
{
  final boolean ignoreInvalidRows = schema.getTuningConfig().isIgnoreInvalidRows();
  return ignoreInvalidRows;
}
public void setShardSpecs(Map> shardSpecs)
{
this.schema = schema.withTuningConfig(schema.getTuningConfig().withShardSpecs(shardSpecs));
this.pathSpec = JSON_MAPPER.convertValue(schema.getIOConfig().getPathSpec(), PathSpec.class);
}
public Optional> getIntervals()
{
Optional> setOptional = schema.getDataSchema().getGranularitySpec().bucketIntervals();
if (setOptional.isPresent()) {
return Optional.of((List) JodaUtils.condenseIntervals(setOptional.get()));
} else {
return Optional.absent();
}
}
/**
 * @return the partitions spec's isDeterminingPartitions flag
 */
public boolean isDeterminingPartitions()
{
  final boolean determining = schema.getTuningConfig().getPartitionsSpec().isDeterminingPartitions();
  return determining;
}
/**
 * @return the target partition size from the partitions spec
 */
public Long getTargetPartitionSize()
{
  final Long targetPartitionSize = schema.getTuningConfig().getPartitionsSpec().getTargetPartitionSize();
  return targetPartitionSize;
}
/**
 * @return the tuning config's forceExtendableShardSpecs flag
 */
public boolean isForceExtendableShardSpecs()
{
  final boolean forceExtendable = schema.getTuningConfig().isForceExtendableShardSpecs();
  return forceExtendable;
}
/**
 * @return the maximum partition size from the partitions spec
 */
public long getMaxPartitionSize()
{
  final long maxPartitionSize = schema.getTuningConfig().getPartitionsSpec().getMaxPartitionSize();
  return maxPartitionSize;
}
/**
 * @return true when the IO config has a non-null metadata update spec
 */
public boolean isUpdaterJobSpecSet()
{
  final Object metadataUpdateSpec = schema.getIOConfig().getMetadataUpdateSpec();
  return metadataUpdateSpec != null;
}
/**
 * @return the tuning config's combineText flag
 */
public boolean isCombineText()
{
  final boolean combineText = schema.getTuningConfig().isCombineText();
  return combineText;
}
/**
 * @return the input row parser from the spec's data schema
 */
public InputRowParser getParser()
{
  final InputRowParser parser = schema.getDataSchema().getParser();
  return parser;
}
/**
 * Looks up the shard spec of a bucket: the tuning config's shard specs for the bucket's time (in millis),
 * indexed by the bucket's partition number.
 *
 * @param bucket the bucket to resolve
 *
 * @return the shard spec at position {@code bucket.partitionNum} for that bucket time
 */
public HadoopyShardSpec getShardSpec(Bucket bucket)
{
  final List<HadoopyShardSpec> specsForBucketTime =
      schema.getTuningConfig().getShardSpecs().get(bucket.time.getMillis());
  return specsForBucketTime.get(bucket.partitionNum);
}
/**
 * @param bucket the bucket whose time selects the shard spec list
 *
 * @return the number of shard specs registered in the tuning config for the bucket's time (in millis)
 */
public int getShardSpecCount(Bucket bucket)
{
  final List<HadoopyShardSpec> specsForBucketTime =
      schema.getTuningConfig().getShardSpecs().get(bucket.time.getMillis());
  return specsForBucketTime.size();
}
/**
 * Delegates to the {@link PathSpec} to add this job's input paths.
 * <p>
 * The job should already have its Configuration set (by calling {@link #addJobProperties(Job)} or via injected
 * system properties) before this method is called, because the {@link PathSpec} may create objects which depend
 * on the values of these configurations.
 *
 * @param job the job to add input paths to
 *
 * @return the job returned by the path spec
 *
 * @throws IOException if the path spec fails while adding the input paths
 */
public Job addInputPaths(Job job) throws IOException
{
  final Job jobWithInputs = pathSpec.addInputPaths(this, job);
  return jobWithInputs;
}
/********************************************
Granularity/Bucket Helper Methods
********************************************/
/**
 * Get the proper bucket for some input row.
 * <p>
 * The row's timestamp selects the segment-granular interval; the shard spec lookup registered for that
 * interval's start then picks the shard spec using the row and its timestamp truncated to the query granularity.
 * <p>
 * NOTE(review): when the row falls in an interval for which no shard specs were registered (the constructor skips
 * null/empty lists), the lookup is null and this method throws a NullPointerException.
 *
 * @param inputRow an InputRow
 *
 * @return the Bucket that this row belongs to, or absent when the granularity spec has no interval for the row
 */
public Optional<Bucket> getBucket(InputRow inputRow)
{
  final Optional<Interval> timeBucket = schema.getDataSchema().getGranularitySpec().bucketInterval(
      DateTimes.utc(inputRow.getTimestampFromEpoch())
  );
  if (!timeBucket.isPresent()) {
    return Optional.absent();
  }
  final DateTime bucketStart = timeBucket.get().getStart();
  final ShardSpec actualSpec = shardSpecLookups.get(bucketStart.getMillis())
                                               .getShardSpec(
                                                   rollupGran.bucketStart(inputRow.getTimestamp()).getMillis(),
                                                   inputRow
                                               );
  // Map the actual shard spec back to its Hadoop wrapper to obtain the shard number.
  final HadoopyShardSpec hadoopyShardSpec = hadoopShardSpecLookup.get(bucketStart.getMillis()).get(actualSpec);

  return Optional.of(
      new Bucket(
          hadoopyShardSpec.getShardNum(),
          bucketStart,
          actualSpec.getPartitionNum()
      )
  );
}
public Optional> getSegmentGranularIntervals()
{
return Optional.fromNullable(
(Set) schema.getDataSchema()
.getGranularitySpec()
.bucketIntervals()
.orNull()
);
}
/**
 * @return the input intervals reported by the granularity spec
 */
public List<Interval> getInputIntervals()
{
  return schema.getDataSchema()
               .getGranularitySpec()
               .inputIntervals();
}
public Optional> getAllBuckets()
{
Optional> intervals = getSegmentGranularIntervals();
if (intervals.isPresent()) {
return Optional.of(
(Iterable) FunctionalIterable
.create(intervals.get())
.transformCat(
new Function>()
{
@Override
public Iterable apply(Interval input)
{
final DateTime bucketTime = input.getStart();
final List specs = schema.getTuningConfig().getShardSpecs().get(bucketTime.getMillis());
if (specs == null) {
return ImmutableList.of();
}
return FunctionalIterable
.create(specs)
.transform(
new Function()
{
int i = 0;
@Override
public Bucket apply(HadoopyShardSpec input)
{
return new Bucket(input.getShardNum(), bucketTime, i++);
}
}
);
}
}
)
);
} else {
return Optional.absent();
}
}
/**
 * @return the working path from the tuning config, or the built-in default when none is configured
 */
public String getWorkingPath()
{
  final String configuredPath = schema.getTuningConfig().getWorkingPath();
  if (configuredPath != null) {
    return configuredPath;
  }
  return DEFAULT_WORKING_PATH;
}
/******************************************
Path helper logic
******************************************/
/**
 * Make the intermediate path for this job run, in the form
 * {@code <workingPath>/<dataSource>/<version without ':'>_<uniqueId>}.
 *
 * @return the intermediate path for this job run.
 */
public Path makeIntermediatePath()
{
  final String workingPath = getWorkingPath();
  final String dataSource = schema.getDataSchema().getDataSource();
  // Colons are stripped from the version before it is used as a path component.
  final String versionWithoutColons = schema.getTuningConfig().getVersion().replace(":", "");
  final String intermediate = StringUtils.format(
      "%s/%s/%s_%s",
      workingPath,
      dataSource,
      versionWithoutColons,
      schema.getUniqueId()
  );
  return new Path(intermediate);
}
/**
 * Builds the path of the {@code partitions.json} file for a bucket interval. The file lives under the
 * intermediate path in a directory named after the interval's start and end in ISO basic date-time format.
 *
 * @param bucketInterval the interval the partition info belongs to
 *
 * @return the partition info path for the interval
 */
public Path makeSegmentPartitionInfoPath(Interval bucketInterval)
{
  final Path intermediatePath = makeIntermediatePath();
  final String start = ISODateTimeFormat.basicDateTime().print(bucketInterval.getStart());
  final String end = ISODateTimeFormat.basicDateTime().print(bucketInterval.getEnd());
  final String location = StringUtils.format("%s/%s_%s/partitions.json", intermediatePath, start, end);
  return new Path(location);
}
/**
 * @return the path of {@code intervals.json} directly under the intermediate path
 */
public Path makeIntervalInfoPath()
{
  final String location = StringUtils.format("%s/intervals.json", makeIntermediatePath());
  return new Path(location);
}
/**
 * @return the {@code segmentDescriptorInfo} directory under the intermediate path
 */
public Path makeDescriptorInfoDir()
{
  final Path intermediatePath = makeIntermediatePath();
  return new Path(intermediatePath, "segmentDescriptorInfo");
}
/**
 * @return the {@code groupedData} directory under the intermediate path
 */
public Path makeGroupedDataDir()
{
  final Path intermediatePath = makeIntermediatePath();
  return new Path(intermediatePath, "groupedData");
}
/**
 * Builds the descriptor file path for a segment: {@code <segment identifier without ':'>.json} inside the
 * descriptor info directory.
 *
 * @param segment the segment whose descriptor path is wanted
 *
 * @return the descriptor path for the segment
 */
public Path makeDescriptorInfoPath(DataSegment segment)
{
  final Path descriptorDir = makeDescriptorInfoDir();
  // Colons are stripped from the identifier before it is used as a file name.
  final String identifierWithoutColons = segment.getIdentifier().replace(":", "");
  return new Path(descriptorDir, StringUtils.format("%s.json", identifierWithoutColons));
}
/**
 * Copies the tuning config's job properties into the job's {@link Configuration}.
 *
 * @param job the job whose configuration receives the properties
 */
public void addJobProperties(Job job)
{
  final Configuration jobConf = job.getConfiguration();
  addJobProperties(jobConf);
}
/**
 * Sets every job property from the tuning config on the given {@link Configuration}.
 *
 * @param conf the configuration that receives the properties
 */
public void addJobProperties(Configuration conf)
{
  for (final Map.Entry<String, String> entry : schema.getTuningConfig().getJobProperties().entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
}
/**
 * Serializes this config to JSON and stores it in the job's {@link Configuration} under
 * {@link #CONFIG_PROPERTY}.
 *
 * @param job the job whose configuration receives the serialized config
 */
public void intoConfiguration(Job job)
{
  final Configuration jobConf = job.getConfiguration();
  final String serializedConfig;
  try {
    serializedConfig = HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(this);
  }
  catch (IOException e) {
    // Serialization failures are rethrown unchecked.
    throw Throwables.propagate(e);
  }
  jobConf.set(HadoopDruidIndexerConfig.CONFIG_PROPERTY, serializedConfig);
}
/**
 * Logs the pretty-printed JSON form of this config, then checks that the parts required to run are present:
 * dataSource, parseSpec, timestampSpec, granularitySpec, the path spec, workingPath, segmentOutputPath and
 * version.
 * <p>
 * NOTE(review): the workingPath check looks at the raw tuning config value, so a null value fails here even
 * though {@link #getWorkingPath()} would fall back to a default -- confirm this is intended.
 *
 * @throws NullPointerException if any of the checked values is null
 */
public void verify()
{
  final String prettyConfig;
  try {
    prettyConfig = JSON_MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(this);
  }
  catch (IOException e) {
    throw Throwables.propagate(e);
  }
  log.info("Running with config:%n%s", prettyConfig);

  // Data schema requirements.
  Preconditions.checkNotNull(schema.getDataSchema().getDataSource(), "dataSource");
  Preconditions.checkNotNull(schema.getDataSchema().getParser().getParseSpec(), "parseSpec");
  Preconditions.checkNotNull(schema.getDataSchema().getParser().getParseSpec().getTimestampSpec(), "timestampSpec");
  Preconditions.checkNotNull(schema.getDataSchema().getGranularitySpec(), "granularitySpec");

  // Input, working and output locations, and the version.
  Preconditions.checkNotNull(pathSpec, "inputSpec");
  Preconditions.checkNotNull(schema.getTuningConfig().getWorkingPath(), "workingPath");
  Preconditions.checkNotNull(schema.getIOConfig().getSegmentOutputPath(), "segmentOutputPath");
  Preconditions.checkNotNull(schema.getTuningConfig().getVersion(), "version");
}
/**
 * @return the property prefixes collected in the constructor: {@code druid.storage}, {@code druid.javascript},
 * the prefixes reported by the data segment pusher and the user-specified ones. The returned list is the
 * internal instance, not a copy.
 */
public List<String> getAllowedHadoopPrefix()
{
  return allowedHadoopPrefix;
}
}