org.apache.druid.indexer.HadoopDruidIndexerConfig

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexer;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.inject.Injector;
import com.google.inject.Key;
import com.google.inject.Module;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.impl.InputRowParser;
import org.apache.druid.guice.GuiceInjectors;
import org.apache.druid.guice.JsonConfigProvider;
import org.apache.druid.guice.annotations.Self;
import org.apache.druid.indexer.partitions.DimensionBasedPartitionsSpec;
import org.apache.druid.indexer.path.PathSpec;
import org.apache.druid.initialization.Initialization;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.JodaUtils;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.FunctionalIterable;
import org.apache.druid.java.util.common.jackson.JacksonUtils;
import org.apache.druid.segment.IndexIO;
import org.apache.druid.segment.IndexMerger;
import org.apache.druid.segment.IndexMergerV9;
import org.apache.druid.segment.IndexSpec;
import org.apache.druid.segment.indexing.granularity.GranularitySpec;
import org.apache.druid.segment.loading.DataSegmentPusher;
import org.apache.druid.server.DruidNode;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.partition.ShardSpec;
import org.apache.druid.timeline.partition.ShardSpecLookup;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.joda.time.format.ISODateTimeFormat;

import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

/**
 *
 */
public class HadoopDruidIndexerConfig
{
  private static final Injector INJECTOR;

  static final String CONFIG_PROPERTY = "druid.indexer.config";
  static final Charset JAVA_NATIVE_CHARSET = Charset.forName("Unicode");
  static final Splitter TAB_SPLITTER = Splitter.on("\t");
  static final Joiner TAB_JOINER = Joiner.on("\t");
  public static final ObjectMapper JSON_MAPPER;
  public static final IndexIO INDEX_IO;
  static final IndexMerger INDEX_MERGER_V9; // storeEmptyColumns is off for this indexMerger
  static final HadoopKerberosConfig HADOOP_KERBEROS_CONFIG;
  static final DataSegmentPusher DATA_SEGMENT_PUSHER;
  private static final String DEFAULT_WORKING_PATH = "/tmp/druid-indexing";

  /**
   * Hadoop tasks running in an Indexer process need a reference to the Properties instance created
   * in PropertiesModule so that the task sees properties that were specified in Druid's config files.
   *
   * This is not strictly necessary for Peon-based tasks, which have all properties, including config file
   * properties, specified on their command line by ForkingTaskRunner (so they could use System.getProperties()
   * only), but we always use the injected Properties for consistency.
   */
  public static final Properties PROPERTIES;

  static {
    INJECTOR = Initialization.makeInjectorWithModules(
        GuiceInjectors.makeStartupInjector(),
        ImmutableList.of(
            (Module) binder -> {
              JsonConfigProvider.bindInstance(
                  binder,
                  Key.get(DruidNode.class, Self.class),
                  new DruidNode("hadoop-indexer", null, false, null, null, true, false)
              );
              JsonConfigProvider.bind(binder, "druid.hadoop.security.kerberos", HadoopKerberosConfig.class);
            },
            new IndexingHadoopModule()
        )
    );
    JSON_MAPPER = INJECTOR.getInstance(ObjectMapper.class);
    INDEX_IO = INJECTOR.getInstance(IndexIO.class);
    INDEX_MERGER_V9 = INJECTOR.getInstance(IndexMergerV9.class);
    HADOOP_KERBEROS_CONFIG = INJECTOR.getInstance(HadoopKerberosConfig.class);
    DATA_SEGMENT_PUSHER = INJECTOR.getInstance(DataSegmentPusher.class);
    PROPERTIES = INJECTOR.getInstance(Properties.class);
  }

  public enum IndexJobCounters
  {
    INVALID_ROW_COUNTER,
    ROWS_PROCESSED_COUNTER,
    ROWS_PROCESSED_WITH_ERRORS_COUNTER,
    ROWS_UNPARSEABLE_COUNTER,
    ROWS_THROWN_AWAY_COUNTER
  }

  public static HadoopDruidIndexerConfig fromSpec(HadoopIngestionSpec spec)
  {
    return new HadoopDruidIndexerConfig(spec);
  }

  private static HadoopDruidIndexerConfig fromMap(Map<String, Object> argSpec)
  {
    // Eventually PathSpec needs to get rid of its Hadoop dependency; then maybe this can be ingested directly
    // without the Map<> intermediary.
    if (argSpec.containsKey("spec")) {
      return HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(argSpec, HadoopDruidIndexerConfig.class);
    }
    return new HadoopDruidIndexerConfig(
        HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(argSpec, HadoopIngestionSpec.class)
    );
  }

  @SuppressWarnings("unchecked")
  public static HadoopDruidIndexerConfig fromFile(File file)
  {
    try {
      return fromMap(
          HadoopDruidIndexerConfig.JSON_MAPPER.readValue(file, JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT)
      );
    }
    catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @SuppressWarnings("unchecked")
  public static HadoopDruidIndexerConfig fromString(String str)
  {
    // This is a map to try and prevent dependency screwbally-ness
    try {
      return fromMap(
          HadoopDruidIndexerConfig.JSON_MAPPER.readValue(str, JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT)
      );
    }
    catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @SuppressWarnings("unchecked")
  public static HadoopDruidIndexerConfig fromDistributedFileSystem(String path)
  {
    try {
      Path pt = new Path(path);
      FileSystem fs = pt.getFileSystem(new Configuration());
      Reader reader = new InputStreamReader(fs.open(pt), StandardCharsets.UTF_8);
      return fromMap(
          HadoopDruidIndexerConfig.JSON_MAPPER.readValue(reader, JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT)
      );
    }
    catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public static HadoopDruidIndexerConfig fromConfiguration(Configuration conf)
  {
    final HadoopDruidIndexerConfig retVal = fromString(conf.get(HadoopDruidIndexerConfig.CONFIG_PROPERTY));
    retVal.verify();
    return retVal;
  }
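  /*
   * Usage sketch (an illustrative addition, not part of the original source): all of the
   * factory methods above funnel into the same Jackson deserialization path, so a local
   * spec file, a raw JSON string, a file on HDFS, and a copy serialized into a Hadoop
   * Configuration all yield equivalent configs. The spec file path is hypothetical:
   *
   *   HadoopDruidIndexerConfig config =
   *       HadoopDruidIndexerConfig.fromFile(new File("/tmp/wikipedia-index-spec.json"));
   *   config.verify();  // fail fast on a missing dataSource, parser, workingPath, or version
   */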
  private HadoopIngestionSpec schema;
  private PathSpec pathSpec;
  private String hadoopJobIdFileName;
  private final Map<Long, ShardSpecLookup> shardSpecLookups = new HashMap<>();
  private final Map<Long, Map<ShardSpec, HadoopyShardSpec>> hadoopShardSpecLookup = new HashMap<>();
  private final Granularity rollupGran;
  private final List<String> allowedHadoopPrefix;

  @JsonCreator
  public HadoopDruidIndexerConfig(
      final @JsonProperty("spec") HadoopIngestionSpec spec
  )
  {
    this.schema = spec;
    this.pathSpec = JSON_MAPPER.convertValue(spec.getIOConfig().getPathSpec(), PathSpec.class);
    for (Map.Entry<Long, List<HadoopyShardSpec>> entry : spec.getTuningConfig().getShardSpecs().entrySet()) {
      if (entry.getValue() == null || entry.getValue().isEmpty()) {
        continue;
      }
      final ShardSpec actualSpec = entry.getValue().get(0).getActualSpec();
      shardSpecLookups.put(
          entry.getKey(),
          actualSpec.getLookup(Lists.transform(entry.getValue(), HadoopyShardSpec::getActualSpec))
      );
      Map<ShardSpec, HadoopyShardSpec> innerHadoopShardSpecLookup = new HashMap<>();
      for (HadoopyShardSpec hadoopyShardSpec : entry.getValue()) {
        innerHadoopShardSpecLookup.put(hadoopyShardSpec.getActualSpec(), hadoopyShardSpec);
      }
      hadoopShardSpecLookup.put(entry.getKey(), innerHadoopShardSpecLookup);
    }
    this.rollupGran = spec.getDataSchema().getGranularitySpec().getQueryGranularity();

    // User-specified list plus our additional bonus list.
    this.allowedHadoopPrefix = new ArrayList<>();
    this.allowedHadoopPrefix.add("druid.storage");
    this.allowedHadoopPrefix.add("druid.javascript");
    this.allowedHadoopPrefix.addAll(DATA_SEGMENT_PUSHER.getAllowedPropertyPrefixesForHadoop());
    this.allowedHadoopPrefix.addAll(spec.getTuningConfig().getUserAllowedHadoopPrefix());
  }
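  /*
   * Illustrative note (not part of the original source): both lookup maps built by the
   * constructor are keyed by the epoch millis of a segment bucket's interval start. For a
   * tuning config that assigns two HadoopyShardSpecs to the bucket starting at
   * 2020-01-01T00:00:00Z (epoch millis 1577836800000), the maps would hold:
   *
   *   shardSpecLookups:      1577836800000 -> ShardSpecLookup over [actualSpec0, actualSpec1]
   *   hadoopShardSpecLookup: 1577836800000 -> {actualSpec0=hadoopySpec0, actualSpec1=hadoopySpec1}
   */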
  @JsonProperty(value = "spec")
  public HadoopIngestionSpec getSchema()
  {
    return schema;
  }

  @JsonIgnore
  public PathSpec getPathSpec()
  {
    return pathSpec;
  }

  public String getDataSource()
  {
    return schema.getDataSchema().getDataSource();
  }

  public GranularitySpec getGranularitySpec()
  {
    return schema.getDataSchema().getGranularitySpec();
  }

  public void setGranularitySpec(GranularitySpec granularitySpec)
  {
    this.schema = schema.withDataSchema(schema.getDataSchema().withGranularitySpec(granularitySpec));
    this.pathSpec = JSON_MAPPER.convertValue(schema.getIOConfig().getPathSpec(), PathSpec.class);
  }

  public DimensionBasedPartitionsSpec getPartitionsSpec()
  {
    return schema.getTuningConfig().getPartitionsSpec();
  }

  public IndexSpec getIndexSpec()
  {
    return schema.getTuningConfig().getIndexSpec();
  }

  public IndexSpec getIndexSpecForIntermediatePersists()
  {
    return schema.getTuningConfig().getIndexSpecForIntermediatePersists();
  }

  boolean isOverwriteFiles()
  {
    return schema.getTuningConfig().isOverwriteFiles();
  }

  public void setShardSpecs(Map<Long, List<HadoopyShardSpec>> shardSpecs)
  {
    this.schema = schema.withTuningConfig(schema.getTuningConfig().withShardSpecs(shardSpecs));
    this.pathSpec = JSON_MAPPER.convertValue(schema.getIOConfig().getPathSpec(), PathSpec.class);
  }

  public Optional<List<Interval>> getIntervals()
  {
    Iterable<Interval> bucketIntervals = schema.getDataSchema().getGranularitySpec().sortedBucketIntervals();
    if (bucketIntervals.iterator().hasNext()) {
      return Optional.of(JodaUtils.condenseIntervals(bucketIntervals));
    } else {
      return Optional.absent();
    }
  }

  boolean isDeterminingPartitions()
  {
    return schema.getTuningConfig().getPartitionsSpec().needsDeterminePartitions(true);
  }

  public int getTargetPartitionSize()
  {
    DimensionBasedPartitionsSpec spec = schema.getTuningConfig().getPartitionsSpec();

    if (spec.getTargetRowsPerSegment() != null) {
      return spec.getTargetRowsPerSegment();
    }

    final Integer targetPartitionSize = spec.getMaxRowsPerSegment();
    return targetPartitionSize == null ? -1 : targetPartitionSize;
  }

  boolean isForceExtendableShardSpecs()
  {
    return schema.getTuningConfig().isForceExtendableShardSpecs();
  }

  public boolean isUpdaterJobSpecSet()
  {
    return (schema.getIOConfig().getMetadataUpdateSpec() != null);
  }

  public boolean isCombineText()
  {
    return schema.getTuningConfig().isCombineText();
  }

  public InputRowParser getParser()
  {
    return Preconditions.checkNotNull(schema.getDataSchema().getParser(), "inputRowParser");
  }

  public HadoopyShardSpec getShardSpec(Bucket bucket)
  {
    return schema.getTuningConfig().getShardSpecs().get(bucket.time.getMillis()).get(bucket.partitionNum);
  }

  int getShardSpecCount(Bucket bucket)
  {
    return schema.getTuningConfig().getShardSpecs().get(bucket.time.getMillis()).size();
  }

  public boolean isLogParseExceptions()
  {
    return schema.getTuningConfig().isLogParseExceptions();
  }

  public int getMaxParseExceptions()
  {
    return schema.getTuningConfig().getMaxParseExceptions();
  }

  public Map<String, String> getAllowedProperties()
  {
    Map<String, String> allowedPropertiesMap = new HashMap<>();
    for (String propName : PROPERTIES.stringPropertyNames()) {
      for (String prefix : allowedHadoopPrefix) {
        if (propName.equals(prefix) || propName.startsWith(prefix + ".")) {
          allowedPropertiesMap.put(propName, PROPERTIES.getProperty(propName));
          break;
        }
      }
    }
    return allowedPropertiesMap;
  }
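  /*
   * Illustrative example (not part of the original source): getAllowedProperties() forwards
   * a property when its name equals an allowed prefix or starts with that prefix plus a dot,
   * so a prefix only matches whole dotted segments. Assuming PROPERTIES contained:
   *
   *   druid.storage.type=hdfs          -> forwarded ("druid.storage" + ".")
   *   druid.storage=local              -> forwarded (exact match on the prefix)
   *   druid.storageXXL.type=...        -> dropped   (not a whole dotted segment)
   *   druid.metadata.storage.type=...  -> dropped   (no allowed prefix matches)
   */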
  boolean isUseYarnRMJobStatusFallback()
  {
    return schema.getTuningConfig().isUseYarnRMJobStatusFallback();
  }

  void setHadoopJobIdFileName(String hadoopJobIdFileName)
  {
    this.hadoopJobIdFileName = hadoopJobIdFileName;
  }

  String getHadoopJobIdFileName()
  {
    return hadoopJobIdFileName;
  }

  /**
   * Job instance should have Configuration set (by calling {@link #addJobProperties(Job)}
   * or via injected system properties) before this method is called. The {@link PathSpec} may
   * create objects which depend on the values of these configurations.
   */
  public Job addInputPaths(Job job) throws IOException
  {
    return pathSpec.addInputPaths(this, job);
  }

  /********************************************
   Granularity/Bucket Helper Methods
   ********************************************/

  /**
   * Get the proper bucket for some input row.
   *
   * @param inputRow an InputRow
   *
   * @return the Bucket that this row belongs to
   */
  Optional<Bucket> getBucket(InputRow inputRow)
  {
    final Optional<Interval> timeBucket = schema.getDataSchema().getGranularitySpec().bucketInterval(
        DateTimes.utc(inputRow.getTimestampFromEpoch())
    );
    if (!timeBucket.isPresent()) {
      return Optional.absent();
    }
    final DateTime bucketStart = timeBucket.get().getStart();
    final ShardSpec actualSpec = shardSpecLookups.get(bucketStart.getMillis())
                                                 .getShardSpec(
                                                     rollupGran.bucketStart(inputRow.getTimestamp()).getMillis(),
                                                     inputRow
                                                 );
    final HadoopyShardSpec hadoopyShardSpec = hadoopShardSpecLookup.get(bucketStart.getMillis()).get(actualSpec);

    return Optional.of(
        new Bucket(
            hadoopyShardSpec.getShardNum(),
            bucketStart,
            actualSpec.getPartitionNum()
        )
    );
  }

  Iterable<Interval> getSegmentGranularIntervals()
  {
    return schema.getDataSchema()
                 .getGranularitySpec()
                 .sortedBucketIntervals();
  }

  public List<Interval> getInputIntervals()
  {
    return schema.getDataSchema()
                 .getGranularitySpec()
                 .inputIntervals();
  }

  Optional<Iterable<Bucket>> getAllBuckets()
  {
    Iterable<Interval> intervals = getSegmentGranularIntervals();
    if (intervals.iterator().hasNext()) {
      return Optional.of(
          FunctionalIterable
              .create(intervals)
              .transformCat(
                  input -> {
                    final DateTime bucketTime = input.getStart();
                    final List<HadoopyShardSpec> specs = schema.getTuningConfig()
                                                               .getShardSpecs()
                                                               .get(bucketTime.getMillis());
                    if (specs == null) {
                      return ImmutableList.of();
                    }

                    return FunctionalIterable
                        .create(specs)
                        .transform(
                            new Function<HadoopyShardSpec, Bucket>()
                            {
                              int i = 0;

                              @Override
                              public Bucket apply(HadoopyShardSpec input)
                              {
                                return new Bucket(input.getShardNum(), bucketTime, i++);
                              }
                            }
                        );
                  }
              )
      );
    } else {
      return Optional.absent();
    }
  }

  public String getWorkingPath()
  {
    final String workingPath = schema.getTuningConfig().getWorkingPath();
    return workingPath == null ? DEFAULT_WORKING_PATH : workingPath;
  }

  /******************************************
   Path helper logic
   ******************************************/

  /**
   * Make the intermediate path for this job run.
   *
   * @return the intermediate path for this job run.
   */
  Path makeIntermediatePath()
  {
    return new Path(
        StringUtils.format(
            "%s/%s/%s_%s",
            getWorkingPath(),
            schema.getDataSchema().getDataSource(),
            StringUtils.removeChar(schema.getTuningConfig().getVersion(), ':'),
            schema.getUniqueId()
        )
    );
  }
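  /*
   * Illustrative layout (not part of the original source): with workingPath
   * "/tmp/druid-indexing", dataSource "wikipedia", version "2020-01-01T00:00:00.000Z",
   * and uniqueId "abc123" (all hypothetical values), makeIntermediatePath() yields
   *
   *   /tmp/druid-indexing/wikipedia/2020-01-01T000000.000Z_abc123
   *
   * since colons are stripped from the version to keep the string safe as an HDFS path.
   */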
  Path makeSegmentPartitionInfoPath(Interval bucketInterval)
  {
    return new Path(
        StringUtils.format(
            "%s/%s_%s/partitions.json",
            makeIntermediatePath(),
            ISODateTimeFormat.basicDateTime().print(bucketInterval.getStart()),
            ISODateTimeFormat.basicDateTime().print(bucketInterval.getEnd())
        )
    );
  }

  Path makeIntervalInfoPath()
  {
    return new Path(
        StringUtils.format("%s/intervals.json", makeIntermediatePath())
    );
  }

  Path makeDescriptorInfoDir()
  {
    return new Path(makeIntermediatePath(), "segmentDescriptorInfo");
  }

  Path makeGroupedDataDir()
  {
    return new Path(makeIntermediatePath(), "groupedData");
  }

  Path makeDescriptorInfoPath(DataSegment segment)
  {
    return new Path(makeDescriptorInfoDir(), StringUtils.removeChar(segment.getId() + ".json", ':'));
  }

  void addJobProperties(Job job)
  {
    addJobProperties(job.getConfiguration());
  }

  void addJobProperties(Configuration conf)
  {
    for (final Map.Entry<String, String> entry : schema.getTuningConfig().getJobProperties().entrySet()) {
      conf.set(entry.getKey(), entry.getValue());
    }
  }

  public void intoConfiguration(Job job)
  {
    Configuration conf = job.getConfiguration();

    try {
      conf.set(
          HadoopDruidIndexerConfig.CONFIG_PROPERTY,
          HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(this)
      );
    }
    catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  public void verify()
  {
    Preconditions.checkNotNull(schema.getDataSchema().getDataSource(), "dataSource");
    Preconditions.checkNotNull(schema.getDataSchema().getParser(), "inputRowParser");
    Preconditions.checkNotNull(schema.getDataSchema().getParser().getParseSpec(), "parseSpec");
    Preconditions.checkNotNull(schema.getDataSchema().getGranularitySpec(), "granularitySpec");
    Preconditions.checkNotNull(pathSpec, "inputSpec");
    Preconditions.checkNotNull(schema.getTuningConfig().getWorkingPath(), "workingPath");
    Preconditions.checkNotNull(schema.getIOConfig().getSegmentOutputPath(), "segmentOutputPath");
    Preconditions.checkNotNull(schema.getTuningConfig().getVersion(), "version");
  }
}
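/*
 * End-to-end driver sketch (an illustrative addition, not part of the original source).
 * Per the addInputPaths javadoc, job properties must be applied before input paths are
 * resolved; a driver in this package (the methods are package-private) would typically
 * sequence the calls like this, with the job name being hypothetical:
 *
 *   Job job = Job.getInstance(new Configuration(), "druid-hadoop-index");
 *   config.addJobProperties(job);   // apply tuning-config jobProperties first
 *   config.intoConfiguration(job);  // serialize the whole spec under druid.indexer.config
 *   config.addInputPaths(job);      // the PathSpec may read the properties set above
 */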




