/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.indexer;
import com.fasterxml.jackson.core.type.TypeReference;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.io.Closeables;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.Rows;
import org.apache.druid.hll.HyperLogLogCollector;
import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
import org.apache.druid.indexer.partitions.PartitionsSpec;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec;
import org.apache.druid.timeline.partition.HashBasedNumberedShardSpec;
import org.apache.druid.timeline.partition.HashPartitionFunction;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.joda.time.DateTime;
import org.joda.time.DateTimeComparator;
import org.joda.time.Interval;
import javax.annotation.Nullable;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
* Determines appropriate ShardSpecs for a job by estimating the cardinality of the data set with HyperLogLog.
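*
* The job runs a single group-by MapReduce pass: the mapper maintains one HyperLogLogCollector per segment
* interval and feeds it the hashed group key of every row, and the reduce side writes an approximate row count
* per interval. {@link #run()} then reads those counts and derives a number of hash-based shards for each
* interval from the configured target partition size.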
*/
public class DetermineHashedPartitionsJob implements Jobby
{
private static final Logger log = new Logger(DetermineHashedPartitionsJob.class);
private final HadoopDruidIndexerConfig config;
private String failureCause;
private Job groupByJob;
private long startTime;
public DetermineHashedPartitionsJob(
HadoopDruidIndexerConfig config
)
{
this.config = config;
}
@Override
public boolean run()
{
try {
/*
* Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
* in the final segment.
*/
startTime = System.currentTimeMillis();
groupByJob = Job.getInstance(
new Configuration(),
StringUtils.format("%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals())
);
JobHelper.injectSystemProperties(groupByJob.getConfiguration(), config);
config.addJobProperties(groupByJob);
groupByJob.setMapperClass(DetermineCardinalityMapper.class);
groupByJob.setMapOutputKeyClass(LongWritable.class);
groupByJob.setMapOutputValueClass(BytesWritable.class);
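// Map output: key = segment bucket start time in millis (LongWritable), value = a serialized
// HyperLogLogCollector for that bucket (BytesWritable).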
groupByJob.setReducerClass(DetermineCardinalityReducer.class);
groupByJob.setOutputKeyClass(NullWritable.class);
groupByJob.setOutputValueClass(NullWritable.class);
groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
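// A single reducer aggregates everything when the intervals must be discovered from the data;
// otherwise one reducer is used per segment granularity interval.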
if (config.getInputIntervals().isEmpty()) {
groupByJob.setNumReduceTasks(1);
} else {
groupByJob.setNumReduceTasks(Iterators.size(config.getSegmentGranularIntervals().iterator()));
}
JobHelper.setupClasspath(
JobHelper.distributedClassPath(config.getWorkingPath()),
JobHelper.distributedClassPath(config.makeIntermediatePath()),
groupByJob
);
config.addInputPaths(groupByJob);
config.intoConfiguration(groupByJob);
FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
groupByJob.submit();
log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
// Store the jobId in the file
if (groupByJob.getJobID() != null) {
JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), groupByJob.getJobID().toString());
}
try {
if (!groupByJob.waitForCompletion(true)) {
log.error("Job failed: %s", groupByJob.getJobID());
failureCause = Utils.getFailureMessage(groupByJob, HadoopDruidIndexerConfig.JSON_MAPPER);
return false;
}
}
catch (IOException ioe) {
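// The job may have succeeded even though polling its status threw; optionally fall back to the
// YARN ResourceManager to check the final application status.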
if (!Utils.checkAppSuccessForJobIOException(ioe, groupByJob, config.isUseYarnRMJobStatusFallback())) {
throw ioe;
}
}
/*
* Load partitions and intervals determined by the previous job.
*/
log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
FileSystem fileSystem = null;
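// If no input intervals were configured, the job wrote out the intervals it discovered;
// read them back and rebuild the granularity spec around them.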
if (config.getInputIntervals().isEmpty()) {
final Path intervalInfoPath = config.makeIntervalInfoPath();
fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
}
List<Interval> intervals = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(
Utils.openInputStream(groupByJob, intervalInfoPath),
new TypeReference<List<Interval>>()
{
}
);
config.setGranularitySpec(
new UniformGranularitySpec(
config.getGranularitySpec().getSegmentGranularity(),
config.getGranularitySpec().getQueryGranularity(),
config.getGranularitySpec().isRollup(),
intervals
)
);
log.info("Determined Intervals for Job [%s].", config.getSegmentGranularIntervals());
}
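// Maps each segment bucket's start time in millis to the shard specs created for that interval.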
Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>(DateTimeComparator.getInstance());
PartitionsSpec partitionsSpec = config.getPartitionsSpec();
if (!(partitionsSpec instanceof HashedPartitionsSpec)) {
throw new ISE(
"%s is expected, but got %s",
HashedPartitionsSpec.class.getName(),
partitionsSpec.getClass().getName()
);
}
HashPartitionFunction partitionFunction = ((HashedPartitionsSpec) partitionsSpec).getPartitionFunction();
int shardCount = 0;
for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
DateTime bucket = segmentGranularity.getStart();
final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
if (fileSystem == null) {
fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
}
if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
final Long numRows = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(
Utils.openInputStream(groupByJob, partitionInfoPath),
Long.class
);
log.info("Found approximately [%,d] rows in data.", numRows);
final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());
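// e.g. roughly 12,500,000 rows with a target partition size of 5,000,000 gives ceil(2.5) = 3 shards.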
log.info("Creating [%,d] shards", numberOfShards);
List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
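// Each shard becomes a HashBasedNumberedShardSpec; the null partitionDimensions argument leaves hashing
// on the full group key (timestamp plus all dimensions), per Druid's hashed-partitioning default.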
for (int i = 0; i < numberOfShards; ++i) {
actualSpecs.add(
new HadoopyShardSpec(
new HashBasedNumberedShardSpec(
i,
numberOfShards,
i,
numberOfShards,
null,
partitionFunction,
HadoopDruidIndexerConfig.JSON_MAPPER
),
shardCount++
)
);
log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
}
shardSpecs.put(bucket.getMillis(), actualSpecs);
} else {
log.info("Path[%s] didn't exist!?", partitionInfoPath);
}
}
config.setShardSpecs(shardSpecs);
log.info(
"DetermineHashedPartitionsJob took %d millis",
(System.currentTimeMillis() - startTime)
);
return true;
}
catch (Exception e) {
throw new RuntimeException(e);
}
}
@Override
public Map<String, Object> getStats()
{
if (groupByJob == null) {
return null;
}
try {
Counters jobCounters = groupByJob.getCounters();
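// Translate the Hadoop counters populated by the job into Druid's standard ingestion row metrics.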
Map<String, Object> metrics = TaskMetricsUtils.makeIngestionRowMetrics(
jobCounters.findCounter(HadoopDruidIndexerConfig.IndexJobCounters.ROWS_PROCESSED_COUNTER).getValue(),
jobCounters.findCounter(HadoopDruidIndexerConfig.IndexJobCounters.ROWS_PROCESSED_WITH_ERRORS_COUNTER)
.getValue(),
jobCounters.findCounter(HadoopDruidIndexerConfig.IndexJobCounters.ROWS_UNPARSEABLE_COUNTER).getValue(),
jobCounters.findCounter(HadoopDruidIndexerConfig.IndexJobCounters.ROWS_THROWN_AWAY_COUNTER).getValue()
);
return metrics;
}
catch (IllegalStateException ise) {
log.debug("Couldn't get counters due to job state");
return null;
}
catch (Exception e) {
log.debug(e, "Encountered exception in getStats().");
return null;
}
}
@Nullable
@Override
public String getErrorMessage()
{
return failureCause;
}
public static class DetermineCardinalityMapper extends HadoopDruidIndexerMapper<LongWritable, BytesWritable>
{
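// Group keys are hashed with murmur3_128 before being added to the HyperLogLog collectors.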
private static HashFunction hashFunction = Hashing.murmur3_128();
@Nullable
private Granularity rollupGranularity = null;
@Nullable
private Map<Interval, HyperLogLogCollector> hyperLogLogs;
@Nullable
private HadoopDruidIndexerConfig config;
private boolean determineIntervals;
@Override
protected void setup(Context context)
throws IOException, InterruptedException
{
super.setup(context);
rollupGranularity = getConfig().getGranularitySpec().getQueryGranularity();
config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
Iterable<Interval> intervals = config.getSegmentGranularIntervals();
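// When intervals are supplied up front, pre-build one collector per segment interval; otherwise
// collectors are created lazily as new intervals are discovered from the data.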
if (intervals.iterator().hasNext()) {
determineIntervals = false;
final ImmutableMap.Builder<Interval, HyperLogLogCollector> builder = ImmutableMap.builder();
for (final Interval bucketInterval : intervals) {
builder.put(bucketInterval, HyperLogLogCollector.makeLatestCollector());
}
hyperLogLogs = builder.build();
} else {
determineIntervals = true;
hyperLogLogs = new HashMap<>();
}
}
@Override
protected void innerMap(
InputRow inputRow,
Context context
) throws IOException
{
// The original listing is truncated here; the body below is a reconstruction from the mapper's fields and
// imports: group the row by (query-granularity timestamp, dimensions), then add the hashed group key to the
// HyperLogLog collector of the row's segment interval.
final List<Object> groupKey = Rows.toGroupKey(
rollupGranularity.bucketStart(inputRow.getTimestamp()).getMillis(),
inputRow
);
final Interval interval;
if (determineIntervals) {
// Intervals are being discovered from the data: derive the bucket and create its collector on demand.
interval = config.getGranularitySpec()
.getSegmentGranularity()
.bucket(DateTimes.utc(inputRow.getTimestampFromEpoch()));
if (!hyperLogLogs.containsKey(interval)) {
hyperLogLogs.put(interval, HyperLogLogCollector.makeLatestCollector());
}
} else {
final Optional<Interval> maybeInterval =
config.getGranularitySpec().bucketInterval(DateTimes.utc(inputRow.getTimestampFromEpoch()));
if (!maybeInterval.isPresent()) {
throw new ISE("No bucket found for timestamp: %s", inputRow.getTimestampFromEpoch());
}
interval = maybeInterval.get();
}
hyperLogLogs
.get(interval)
.add(hashFunction.hashBytes(HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsBytes(groupKey)).asBytes());
context.getCounter(HadoopDruidIndexerConfig.IndexJobCounters.ROWS_PROCESSED_COUNTER).increment(1);
}
// The mapper's run()/cleanup() (which emit one serialized collector per interval), the DetermineCardinalityReducer,
// and the DetermineHashedPartitionsPartitioner referenced in run() above are not included in this listing.
}
}