/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package io.druid.indexer.path;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import io.druid.indexer.HadoopDruidIndexerConfig;
import io.druid.indexer.hadoop.DatasourceIngestionSpec;
import io.druid.indexer.hadoop.DatasourceInputFormat;
import io.druid.indexer.hadoop.WindowedDataSegment;
import io.druid.java.util.common.ISE;
import io.druid.java.util.common.logger.Logger;
import io.druid.query.aggregation.AggregatorFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import java.io.IOException;
import java.util.List;
import java.util.Set;
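
/**
 * PathSpec implementation that feeds the segments of an existing Druid datasource into a Hadoop
 * indexing job (e.g. for re-indexing or delta ingestion). It resolves dimensions, metrics, query
 * granularity, and transformSpec against the job config, then serializes the resulting ingestion
 * spec and segment list into the Hadoop configuration read by DatasourceInputFormat.
 *
 * Illustrative JSON shape (property names per the @JsonProperty annotations below; the "type"
 * key and the ingestion spec contents are defined elsewhere and are only sketched here):
 * <pre>
 * {
 *   "type" : "...",
 *   "ingestionSpec" : { ... },
 *   "maxSplitSize" : 0
 * }
 * </pre>
 */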
public class DatasourcePathSpec implements PathSpec
{
private static final Logger logger = new Logger(DatasourcePathSpec.class);
private final ObjectMapper mapper;
private final DatasourceIngestionSpec ingestionSpec;
private final long maxSplitSize;
private final List<WindowedDataSegment> segments;
@JsonCreator
public DatasourcePathSpec(
@JacksonInject ObjectMapper mapper,
@JsonProperty("segments") List segments,
@JsonProperty("ingestionSpec") DatasourceIngestionSpec spec,
@JsonProperty("maxSplitSize") Long maxSplitSize
)
{
this.mapper = Preconditions.checkNotNull(mapper, "null mapper");
this.segments = segments;
this.ingestionSpec = Preconditions.checkNotNull(spec, "null ingestionSpec");
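// A missing maxSplitSize is stored as 0, i.e. no caller-specified limit on split size.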
if (maxSplitSize == null) {
this.maxSplitSize = 0;
} else {
this.maxSplitSize = maxSplitSize.longValue();
}
}
@JsonProperty
public List<WindowedDataSegment> getSegments()
{
return segments;
}
@JsonProperty
public DatasourceIngestionSpec getIngestionSpec()
{
return ingestionSpec;
}
@JsonProperty
public long getMaxSplitSize()
{
return maxSplitSize;
}
@Override
public Job addInputPaths(
HadoopDruidIndexerConfig config, Job job
) throws IOException
{
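// No usable segments: either skip this path spec or fail the job, depending on the
// ingestion spec's ignoreWhenNoSegments flag.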
if (segments == null || segments.isEmpty()) {
if (ingestionSpec.isIgnoreWhenNoSegments()) {
logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
return job;
} else {
throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
}
}
logger.info(
"Found total [%d] segments for [%s] in interval [%s]",
segments.size(),
ingestionSpec.getDataSource(),
ingestionSpec.getIntervals()
);
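// If the ingestion spec does not declare dimensions explicitly, derive them: prefer the
// parser's explicit dimension list; otherwise take the union of dimensions across all input
// segments, minus the parser's dimension exclusions.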
DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;
if (updatedIngestionSpec.getDimensions() == null) {
List<String> dims;
if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
} else {
Set<String> dimSet = Sets.newHashSet(
Iterables.concat(
Iterables.transform(
segments,
new Function<WindowedDataSegment, Iterable<String>>()
{
@Override
public Iterable<String> apply(WindowedDataSegment dataSegment)
{
return dataSegment.getSegment().getDimensions();
}
}
)
)
);
dims = Lists.newArrayList(
Sets.difference(
dimSet,
config.getParser()
.getParseSpec()
.getDimensionsSpec()
.getDimensionExclusions()
)
);
}
updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
}
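// Similarly, if metrics are not declared, use the names of the aggregators configured for this job.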
if (updatedIngestionSpec.getMetrics() == null) {
Set<String> metrics = Sets.newHashSet();
final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
if (cols != null) {
for (AggregatorFactory col : cols) {
metrics.add(col.getName());
}
}
updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
}
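// Propagate the query granularity from the overall job config, as with the transformSpec below.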
updatedIngestionSpec = updatedIngestionSpec.withQueryGranularity(config.getGranularitySpec().getQueryGranularity());
// propagate in the transformSpec from the overall job config
updatedIngestionSpec = updatedIngestionSpec.withTransformSpec(
config.getSchema().getDataSchema().getTransformSpec()
);
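// Serialize the resolved ingestion spec, segment list, and split size limit into the Hadoop
// configuration consumed by DatasourceInputFormat, and register a placeholder path so that
// MultipleInputs routes the job through that input format (the path itself is ignored).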
job.getConfiguration().set(DatasourceInputFormat.CONF_DRUID_SCHEMA, mapper.writeValueAsString(updatedIngestionSpec));
job.getConfiguration().set(DatasourceInputFormat.CONF_INPUT_SEGMENTS, mapper.writeValueAsString(segments));
job.getConfiguration().set(DatasourceInputFormat.CONF_MAX_SPLIT_SIZE, String.valueOf(maxSplitSize));
MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
return job;
}
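// Note: equality and hashing are based on ingestionSpec, maxSplitSize, and segments; the
// injected ObjectMapper is not compared.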
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
DatasourcePathSpec that = (DatasourcePathSpec) o;
if (maxSplitSize != that.maxSplitSize) {
return false;
}
if (!ingestionSpec.equals(that.ingestionSpec)) {
return false;
}
return segments != null ? segments.equals(that.segments) : that.segments == null;
}
@Override
public int hashCode()
{
int result = ingestionSpec.hashCode();
result = 31 * result + (int) (maxSplitSize ^ (maxSplitSize >>> 32));
result = 31 * result + (segments != null ? segments.hashCode() : 0);
return result;
}
}