// org.apache.druid.indexer.HadoopDruidIndexerConfig (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.indexer;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.inject.Injector;
import com.google.inject.Key;
import com.google.inject.Module;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.impl.InputRowParser;
import org.apache.druid.guice.GuiceInjectors;
import org.apache.druid.guice.JsonConfigProvider;
import org.apache.druid.guice.annotations.Self;
import org.apache.druid.indexer.partitions.DimensionBasedPartitionsSpec;
import org.apache.druid.indexer.path.PathSpec;
import org.apache.druid.initialization.Initialization;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.JodaUtils;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.FunctionalIterable;
import org.apache.druid.java.util.common.jackson.JacksonUtils;
import org.apache.druid.segment.IndexIO;
import org.apache.druid.segment.IndexMerger;
import org.apache.druid.segment.IndexMergerV9;
import org.apache.druid.segment.IndexSpec;
import org.apache.druid.segment.indexing.granularity.GranularitySpec;
import org.apache.druid.segment.loading.DataSegmentPusher;
import org.apache.druid.server.DruidNode;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.partition.ShardSpec;
import org.apache.druid.timeline.partition.ShardSpecLookup;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.joda.time.format.ISODateTimeFormat;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
/**
*
*/
public class HadoopDruidIndexerConfig
{
private static final Injector INJECTOR;
static final String CONFIG_PROPERTY = "druid.indexer.config";
static final Charset JAVA_NATIVE_CHARSET = Charset.forName("Unicode");
static final Splitter TAB_SPLITTER = Splitter.on("\t");
static final Joiner TAB_JOINER = Joiner.on("\t");
public static final ObjectMapper JSON_MAPPER;
public static final IndexIO INDEX_IO;
static final IndexMerger INDEX_MERGER_V9; // storeEmptyColumns is off for this indexMerger
static final HadoopKerberosConfig HADOOP_KERBEROS_CONFIG;
static final DataSegmentPusher DATA_SEGMENT_PUSHER;
private static final String DEFAULT_WORKING_PATH = "/tmp/druid-indexing";
/**
* Hadoop tasks running in an Indexer process need a reference to the Properties instance created
* in PropertiesModule so that the task sees properties that were specified in Druid's config files.
*
* This is not strictly necessary for Peon-based tasks which have all properties, including config file properties,
* specified on their command line by ForkingTaskRunner (so they could use System.getProperties() only),
* but we always use the injected Properties for consistency.
*/
public static final Properties PROPERTIES;
static {
INJECTOR = Initialization.makeInjectorWithModules(
GuiceInjectors.makeStartupInjector(),
ImmutableList.of(
(Module) binder -> {
JsonConfigProvider.bindInstance(
binder,
Key.get(DruidNode.class, Self.class),
new DruidNode("hadoop-indexer", null, false, null, null, true, false)
);
JsonConfigProvider.bind(binder, "druid.hadoop.security.kerberos", HadoopKerberosConfig.class);
},
new IndexingHadoopModule()
)
);
JSON_MAPPER = INJECTOR.getInstance(ObjectMapper.class);
INDEX_IO = INJECTOR.getInstance(IndexIO.class);
INDEX_MERGER_V9 = INJECTOR.getInstance(IndexMergerV9.class);
HADOOP_KERBEROS_CONFIG = INJECTOR.getInstance(HadoopKerberosConfig.class);
DATA_SEGMENT_PUSHER = INJECTOR.getInstance(DataSegmentPusher.class);
PROPERTIES = INJECTOR.getInstance(Properties.class);
}
public enum IndexJobCounters
{
INVALID_ROW_COUNTER,
ROWS_PROCESSED_COUNTER,
ROWS_PROCESSED_WITH_ERRORS_COUNTER,
ROWS_UNPARSEABLE_COUNTER,
ROWS_THROWN_AWAY_COUNTER
}
public static HadoopDruidIndexerConfig fromSpec(HadoopIngestionSpec spec)
{
return new HadoopDruidIndexerConfig(spec);
}
private static HadoopDruidIndexerConfig fromMap(Map argSpec)
{
// Eventually PathSpec needs to get rid of its Hadoop dependency, then maybe this can be ingested directly without
// the Map<> intermediary
if (argSpec.containsKey("spec")) {
return HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(
argSpec,
HadoopDruidIndexerConfig.class
);
}
return new HadoopDruidIndexerConfig(
HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(
argSpec,
HadoopIngestionSpec.class
)
);
}
@SuppressWarnings("unchecked")
public static HadoopDruidIndexerConfig fromFile(File file)
{
try {
return fromMap(
HadoopDruidIndexerConfig.JSON_MAPPER.readValue(file, JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT)
);
}
catch (IOException e) {
throw new RuntimeException(e);
}
}
@SuppressWarnings("unchecked")
public static HadoopDruidIndexerConfig fromString(String str)
{
// This is a map to try and prevent dependency screwbally-ness
try {
return fromMap(
HadoopDruidIndexerConfig.JSON_MAPPER.readValue(str, JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT)
);
}
catch (IOException e) {
throw new RuntimeException(e);
}
}
@SuppressWarnings("unchecked")
public static HadoopDruidIndexerConfig fromDistributedFileSystem(String path)
{
try {
Path pt = new Path(path);
FileSystem fs = pt.getFileSystem(new Configuration());
Reader reader = new InputStreamReader(fs.open(pt), StandardCharsets.UTF_8);
return fromMap(
HadoopDruidIndexerConfig.JSON_MAPPER.readValue(reader, JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT)
);
}
catch (Exception e) {
throw new RuntimeException(e);
}
}
public static HadoopDruidIndexerConfig fromConfiguration(Configuration conf)
{
final HadoopDruidIndexerConfig retVal = fromString(conf.get(HadoopDruidIndexerConfig.CONFIG_PROPERTY));
retVal.verify();
return retVal;
}
private HadoopIngestionSpec schema;
private PathSpec pathSpec;
private String hadoopJobIdFileName;
private final Map shardSpecLookups = new HashMap<>();
private final Map> hadoopShardSpecLookup = new HashMap<>();
private final Granularity rollupGran;
private final List allowedHadoopPrefix;
@JsonCreator
public HadoopDruidIndexerConfig(
final @JsonProperty("spec") HadoopIngestionSpec spec
)
{
this.schema = spec;
this.pathSpec = JSON_MAPPER.convertValue(spec.getIOConfig().getPathSpec(), PathSpec.class);
for (Map.Entry> entry : spec.getTuningConfig().getShardSpecs().entrySet()) {
if (entry.getValue() == null || entry.getValue().isEmpty()) {
continue;
}
final ShardSpec actualSpec = entry.getValue().get(0).getActualSpec();
shardSpecLookups.put(
entry.getKey(), actualSpec.getLookup(
Lists.transform(
entry.getValue(), HadoopyShardSpec::getActualSpec
)
)
);
Map innerHadoopShardSpecLookup = new HashMap<>();
for (HadoopyShardSpec hadoopyShardSpec : entry.getValue()) {
innerHadoopShardSpecLookup.put(hadoopyShardSpec.getActualSpec(), hadoopyShardSpec);
}
hadoopShardSpecLookup.put(entry.getKey(), innerHadoopShardSpecLookup);
}
this.rollupGran = spec.getDataSchema().getGranularitySpec().getQueryGranularity();
// User-specified list plus our additional bonus list.
this.allowedHadoopPrefix = new ArrayList<>();
this.allowedHadoopPrefix.add("druid.storage");
this.allowedHadoopPrefix.add("druid.javascript");
this.allowedHadoopPrefix.addAll(DATA_SEGMENT_PUSHER.getAllowedPropertyPrefixesForHadoop());
this.allowedHadoopPrefix.addAll(spec.getTuningConfig().getUserAllowedHadoopPrefix());
}
@JsonProperty(value = "spec")
public HadoopIngestionSpec getSchema()
{
return schema;
}
@JsonIgnore
public PathSpec getPathSpec()
{
return pathSpec;
}
public String getDataSource()
{
return schema.getDataSchema().getDataSource();
}
public GranularitySpec getGranularitySpec()
{
return schema.getDataSchema().getGranularitySpec();
}
public void setGranularitySpec(GranularitySpec granularitySpec)
{
this.schema = schema.withDataSchema(schema.getDataSchema().withGranularitySpec(granularitySpec));
this.pathSpec = JSON_MAPPER.convertValue(schema.getIOConfig().getPathSpec(), PathSpec.class);
}
public DimensionBasedPartitionsSpec getPartitionsSpec()
{
return schema.getTuningConfig().getPartitionsSpec();
}
public IndexSpec getIndexSpec()
{
return schema.getTuningConfig().getIndexSpec();
}
public IndexSpec getIndexSpecForIntermediatePersists()
{
return schema.getTuningConfig().getIndexSpecForIntermediatePersists();
}
boolean isOverwriteFiles()
{
return schema.getTuningConfig().isOverwriteFiles();
}
public void setShardSpecs(Map> shardSpecs)
{
this.schema = schema.withTuningConfig(schema.getTuningConfig().withShardSpecs(shardSpecs));
this.pathSpec = JSON_MAPPER.convertValue(schema.getIOConfig().getPathSpec(), PathSpec.class);
}
public Optional> getIntervals()
{
Iterable bucketIntervals = schema.getDataSchema().getGranularitySpec().sortedBucketIntervals();
if (bucketIntervals.iterator().hasNext()) {
return Optional.of(JodaUtils.condenseIntervals(bucketIntervals));
} else {
return Optional.absent();
}
}
boolean isDeterminingPartitions()
{
return schema.getTuningConfig().getPartitionsSpec().needsDeterminePartitions(true);
}
public int getTargetPartitionSize()
{
DimensionBasedPartitionsSpec spec = schema.getTuningConfig().getPartitionsSpec();
if (spec.getTargetRowsPerSegment() != null) {
return spec.getTargetRowsPerSegment();
}
final Integer targetPartitionSize = spec.getMaxRowsPerSegment();
return targetPartitionSize == null ? -1 : targetPartitionSize;
}
boolean isForceExtendableShardSpecs()
{
return schema.getTuningConfig().isForceExtendableShardSpecs();
}
public boolean isUpdaterJobSpecSet()
{
return (schema.getIOConfig().getMetadataUpdateSpec() != null);
}
public boolean isCombineText()
{
return schema.getTuningConfig().isCombineText();
}
public InputRowParser getParser()
{
return Preconditions.checkNotNull(schema.getDataSchema().getParser(), "inputRowParser");
}
public HadoopyShardSpec getShardSpec(Bucket bucket)
{
return schema.getTuningConfig().getShardSpecs().get(bucket.time.getMillis()).get(bucket.partitionNum);
}
int getShardSpecCount(Bucket bucket)
{
return schema.getTuningConfig().getShardSpecs().get(bucket.time.getMillis()).size();
}
public boolean isLogParseExceptions()
{
return schema.getTuningConfig().isLogParseExceptions();
}
public int getMaxParseExceptions()
{
return schema.getTuningConfig().getMaxParseExceptions();
}
public Map getAllowedProperties()
{
Map allowedPropertiesMap = new HashMap<>();
for (String propName : PROPERTIES.stringPropertyNames()) {
for (String prefix : allowedHadoopPrefix) {
if (propName.equals(prefix) || propName.startsWith(prefix + ".")) {
allowedPropertiesMap.put(propName, PROPERTIES.getProperty(propName));
break;
}
}
}
return allowedPropertiesMap;
}
boolean isUseYarnRMJobStatusFallback()
{
return schema.getTuningConfig().isUseYarnRMJobStatusFallback();
}
void setHadoopJobIdFileName(String hadoopJobIdFileName)
{
this.hadoopJobIdFileName = hadoopJobIdFileName;
}
String getHadoopJobIdFileName()
{
return hadoopJobIdFileName;
}
/**
* Job instance should have Configuration set (by calling {@link #addJobProperties(Job)}
* or via injected system properties) before this method is called. The {@link PathSpec} may
* create objects which depend on the values of these configurations.
*/
public Job addInputPaths(Job job) throws IOException
{
return pathSpec.addInputPaths(this, job);
}
/********************************************
Granularity/Bucket Helper Methods
********************************************/
/**
* Get the proper bucket for some input row.
*
* @param inputRow an InputRow
* @return the Bucket that this row belongs to
*/
Optional getBucket(InputRow inputRow)
{
final Optional timeBucket = schema.getDataSchema().getGranularitySpec().bucketInterval(
DateTimes.utc(inputRow.getTimestampFromEpoch())
);
if (!timeBucket.isPresent()) {
return Optional.absent();
}
final DateTime bucketStart = timeBucket.get().getStart();
final ShardSpec actualSpec = shardSpecLookups.get(bucketStart.getMillis())
.getShardSpec(
rollupGran.bucketStart(inputRow.getTimestamp()).getMillis(),
inputRow
);
final HadoopyShardSpec hadoopyShardSpec = hadoopShardSpecLookup.get(bucketStart.getMillis()).get(actualSpec);
return Optional.of(
new Bucket(
hadoopyShardSpec.getShardNum(),
bucketStart,
actualSpec.getPartitionNum()
)
);
}
Iterable getSegmentGranularIntervals()
{
return
schema.getDataSchema()
.getGranularitySpec()
.sortedBucketIntervals();
}
public List getInputIntervals()
{
return schema.getDataSchema()
.getGranularitySpec()
.inputIntervals();
}
Optional> getAllBuckets()
{
Iterable intervals = getSegmentGranularIntervals();
if (intervals.iterator().hasNext()) {
return Optional.of(
FunctionalIterable
.create(intervals)
.transformCat(
input -> {
final DateTime bucketTime = input.getStart();
final List specs = schema.getTuningConfig()
.getShardSpecs()
.get(bucketTime.getMillis());
if (specs == null) {
return ImmutableList.of();
}
return FunctionalIterable
.create(specs)
.transform(
new Function()
{
int i = 0;
@Override
public Bucket apply(HadoopyShardSpec input)
{
return new Bucket(input.getShardNum(), bucketTime, i++);
}
}
);
}
)
);
} else {
return Optional.absent();
}
}
public String getWorkingPath()
{
final String workingPath = schema.getTuningConfig().getWorkingPath();
return workingPath == null ? DEFAULT_WORKING_PATH : workingPath;
}
/******************************************
Path helper logic
******************************************/
/**
* Make the intermediate path for this job run.
*
* @return the intermediate path for this job run.
*/
Path makeIntermediatePath()
{
return new Path(
StringUtils.format(
"%s/%s/%s_%s",
getWorkingPath(),
schema.getDataSchema().getDataSource(),
StringUtils.removeChar(schema.getTuningConfig().getVersion(), ':'),
schema.getUniqueId()
)
);
}
Path makeSegmentPartitionInfoPath(Interval bucketInterval)
{
return new Path(
StringUtils.format(
"%s/%s_%s/partitions.json",
makeIntermediatePath(),
ISODateTimeFormat.basicDateTime().print(bucketInterval.getStart()),
ISODateTimeFormat.basicDateTime().print(bucketInterval.getEnd())
)
);
}
Path makeIntervalInfoPath()
{
return new Path(
StringUtils.format(
"%s/intervals.json",
makeIntermediatePath()
)
);
}
Path makeDescriptorInfoDir()
{
return new Path(makeIntermediatePath(), "segmentDescriptorInfo");
}
Path makeGroupedDataDir()
{
return new Path(makeIntermediatePath(), "groupedData");
}
Path makeDescriptorInfoPath(DataSegment segment)
{
return new Path(makeDescriptorInfoDir(), StringUtils.removeChar(segment.getId() + ".json", ':'));
}
void addJobProperties(Job job)
{
addJobProperties(job.getConfiguration());
}
void addJobProperties(Configuration conf)
{
for (final Map.Entry entry : schema.getTuningConfig().getJobProperties().entrySet()) {
conf.set(entry.getKey(), entry.getValue());
}
}
public void intoConfiguration(Job job)
{
Configuration conf = job.getConfiguration();
try {
conf.set(HadoopDruidIndexerConfig.CONFIG_PROPERTY, HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(this));
}
catch (IOException e) {
throw new RuntimeException(e);
}
}
public void verify()
{
Preconditions.checkNotNull(schema.getDataSchema().getDataSource(), "dataSource");
Preconditions.checkNotNull(schema.getDataSchema().getParser(), "inputRowParser");
Preconditions.checkNotNull(schema.getDataSchema().getParser().getParseSpec(), "parseSpec");
Preconditions.checkNotNull(schema.getDataSchema().getGranularitySpec(), "granularitySpec");
Preconditions.checkNotNull(pathSpec, "inputSpec");
Preconditions.checkNotNull(schema.getTuningConfig().getWorkingPath(), "workingPath");
Preconditions.checkNotNull(schema.getIOConfig().getSegmentOutputPath(), "segmentOutputPath");
Preconditions.checkNotNull(schema.getTuningConfig().getVersion(), "version");
}
}