
io.druid.indexing.common.task.HadoopIndexTask

/*
 * Druid - a distributed column store.
 * Copyright 2012 - 2015 Metamarkets Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.druid.indexing.common.task;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.metamx.common.logger.Logger;
import io.druid.common.utils.JodaUtils;
import io.druid.indexer.HadoopDruidDetermineConfigurationJob;
import io.druid.indexer.HadoopDruidIndexerConfig;
import io.druid.indexer.HadoopDruidIndexerJob;
import io.druid.indexer.HadoopIngestionSpec;
import io.druid.indexer.Jobby;
import io.druid.indexer.MetadataStorageUpdaterJobHandler;
import io.druid.indexing.common.TaskLock;
import io.druid.indexing.common.TaskStatus;
import io.druid.indexing.common.TaskToolbox;
import io.druid.indexing.common.actions.LockAcquireAction;
import io.druid.indexing.common.actions.LockTryAcquireAction;
import io.druid.indexing.common.actions.TaskActionClient;
import io.druid.indexing.hadoop.OverlordActionBasedUsedSegmentLister;
import io.druid.timeline.DataSegment;
import org.joda.time.DateTime;
import org.joda.time.Interval;

import java.util.List;
import java.util.Map;
import java.util.SortedSet;

public class HadoopIndexTask extends HadoopTask
{
  private static final Logger log = new Logger(HadoopIndexTask.class);

  private static String getTheDataSource(HadoopIngestionSpec spec)
  {
    return spec.getDataSchema().getDataSource();
  }

  @JsonIgnore
  private HadoopIngestionSpec spec;

  @JsonIgnore
  private final String classpathPrefix;

  @JsonIgnore
  private final ObjectMapper jsonMapper;

  /**
   * @param spec is used by the HadoopDruidIndexerJob to set up the appropriate parameters
   *             for creating Druid index segments. It may be modified.
   *             Here, we will ensure that the DbConnectorConfig field of the spec is set to null, such that the
   *             job does not push a list of published segments to the database. Instead, we will use the method
   *             IndexGeneratorJob.getPublishedSegments() to simply return a list of the published
   *             segments, and let the indexing service report these segments to the database.
   */
  @JsonCreator
  public HadoopIndexTask(
      @JsonProperty("id") String id,
      @JsonProperty("spec") HadoopIngestionSpec spec,
      @JsonProperty("hadoopCoordinates") String hadoopCoordinates,
      @JsonProperty("hadoopDependencyCoordinates") List<String> hadoopDependencyCoordinates,
      @JsonProperty("classpathPrefix") String classpathPrefix,
      @JacksonInject ObjectMapper jsonMapper,
      @JsonProperty("context") Map<String, Object> context
  )
  {
    super(
        id != null ? id : String.format("index_hadoop_%s_%s", getTheDataSource(spec), new DateTime()),
        getTheDataSource(spec),
        hadoopDependencyCoordinates == null
        ? (hadoopCoordinates == null ? null : ImmutableList.of(hadoopCoordinates))
        : hadoopDependencyCoordinates,
        context
    );

    this.spec = spec;

    // Some HadoopIngestionSpec settings don't make sense in the context of the indexing service
    Preconditions.checkArgument(
        this.spec.getIOConfig().getSegmentOutputPath() == null,
        "segmentOutputPath must be absent"
    );
    Preconditions.checkArgument(this.spec.getTuningConfig().getWorkingPath() == null, "workingPath must be absent");
    Preconditions.checkArgument(
        this.spec.getIOConfig().getMetadataUpdateSpec() == null,
        "metadataUpdateSpec must be absent"
    );

    this.classpathPrefix = classpathPrefix;
    this.jsonMapper = Preconditions.checkNotNull(jsonMapper, "null ObjectMapper");
  }

  @Override
  public String getType()
  {
    return "index_hadoop";
  }

  @Override
  public boolean isReady(TaskActionClient taskActionClient) throws Exception
  {
    Optional<SortedSet<Interval>> intervals = spec.getDataSchema().getGranularitySpec().bucketIntervals();
    if (intervals.isPresent()) {
      Interval interval = JodaUtils.umbrellaInterval(
          JodaUtils.condenseIntervals(
              intervals.get()
          )
      );
      return taskActionClient.submit(new LockTryAcquireAction(interval)).isPresent();
    } else {
      return true;
    }
  }

  @JsonProperty("spec")
  public HadoopIngestionSpec getSpec()
  {
    return spec;
  }

  @JsonProperty
  public List<String> getHadoopDependencyCoordinates()
  {
    return super.getHadoopDependencyCoordinates();
  }

  @JsonProperty
  @Override
  public String getClasspathPrefix()
  {
    return classpathPrefix;
  }

  @SuppressWarnings("unchecked")
  @Override
  public TaskStatus run(TaskToolbox toolbox) throws Exception
  {
    final ClassLoader loader = buildClassLoader(toolbox);
    boolean determineIntervals = !spec.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();

    spec = HadoopIngestionSpec.updateSegmentListIfDatasourcePathSpecIsUsed(
        spec,
        jsonMapper,
        new OverlordActionBasedUsedSegmentLister(toolbox)
    );

    // First phase: run the determine-configuration job inside the Hadoop classloader
    // and read back the updated ingestion spec.
    final String config = invokeForeignLoader(
        "io.druid.indexing.common.task.HadoopIndexTask$HadoopDetermineConfigInnerProcessing",
        new String[]{
            toolbox.getObjectMapper().writeValueAsString(spec),
            toolbox.getConfig().getHadoopWorkingPath(),
            toolbox.getSegmentPusher().getPathForHadoop(getDataSource())
        },
        loader
    );

    final HadoopIngestionSpec indexerSchema = toolbox
        .getObjectMapper()
        .readValue(config, HadoopIngestionSpec.class);

    // We should have a lock from before we started running only if an interval was specified
    final String version;
    if (determineIntervals) {
      Interval interval = JodaUtils.umbrellaInterval(
          JodaUtils.condenseIntervals(
              indexerSchema.getDataSchema().getGranularitySpec().bucketIntervals().get()
          )
      );
      TaskLock lock = toolbox.getTaskActionClient().submit(new LockAcquireAction(interval));
      version = lock.getVersion();
    } else {
      Iterable<TaskLock> locks = getTaskLocks(toolbox);
      final TaskLock myLock = Iterables.getOnlyElement(locks);
      version = myLock.getVersion();
    }

    log.info("Setting version to: %s", version);

    // Second phase: run the index-generator job with the resolved version and collect
    // the serialized list of published segments it returns.
    final String segments = invokeForeignLoader(
        "io.druid.indexing.common.task.HadoopIndexTask$HadoopIndexGeneratorInnerProcessing",
        new String[]{
            toolbox.getObjectMapper().writeValueAsString(indexerSchema),
            version
        },
        loader
    );

    if (segments != null) {
      List<DataSegment> publishedSegments = toolbox.getObjectMapper().readValue(
          segments,
          new TypeReference<List<DataSegment>>()
          {
          }
      );

      toolbox.pushSegments(publishedSegments);
      return TaskStatus.success(getId());
    } else {
      return TaskStatus.failure(getId());
    }
  }

  public static class HadoopIndexGeneratorInnerProcessing
  {
    public static String runTask(String[] args) throws Exception
    {
      final String schema = args[0];
      String version = args[1];

      final HadoopIngestionSpec theSchema = HadoopDruidIndexerConfig.jsonMapper
          .readValue(
              schema,
              HadoopIngestionSpec.class
          );
      final HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromSpec(
          theSchema
              .withTuningConfig(theSchema.getTuningConfig().withVersion(version))
      );

      // MetadataStorageUpdaterJobHandler is only needed when running standalone without the indexing service.
      // In that case, whatever runs the Hadoop Index Task must ensure MetadataStorageUpdaterJobHandler can be
      // injected based on the configuration given in config.getSchema().getIOConfig().getMetadataUpdateSpec().
      final MetadataStorageUpdaterJobHandler maybeHandler;
      if (config.isUpdaterJobSpecSet()) {
        maybeHandler = injector.getInstance(MetadataStorageUpdaterJobHandler.class);
      } else {
        maybeHandler = null;
      }
      HadoopDruidIndexerJob job = new HadoopDruidIndexerJob(config, maybeHandler);

      log.info("Starting a hadoop index generator job...");
      if (job.run()) {
        return HadoopDruidIndexerConfig.jsonMapper.writeValueAsString(job.getPublishedSegments());
      }

      return null;
    }
  }

  public static class HadoopDetermineConfigInnerProcessing
  {
    public static String runTask(String[] args) throws Exception
    {
      final String schema = args[0];
      final String workingPath = args[1];
      final String segmentOutputPath = args[2];

      final HadoopIngestionSpec theSchema = HadoopDruidIndexerConfig.jsonMapper
          .readValue(
              schema,
              HadoopIngestionSpec.class
          );
      final HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromSpec(
          theSchema
              .withIOConfig(theSchema.getIOConfig().withSegmentOutputPath(segmentOutputPath))
              .withTuningConfig(theSchema.getTuningConfig().withWorkingPath(workingPath))
      );

      Jobby job = new HadoopDruidDetermineConfigurationJob(config);

      log.info("Starting a hadoop determine configuration job...");
      if (job.run()) {
        return HadoopDruidIndexerConfig.jsonMapper.writeValueAsString(config.getSchema());
      }

      return null;
    }
  }
}
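
Note: run() never calls the HadoopDetermineConfigInnerProcessing and HadoopIndexGeneratorInnerProcessing classes directly; it passes their fully-qualified names and a String[] of serialized arguments to invokeForeignLoader so that the work executes inside the Hadoop-aware ClassLoader built by buildClassLoader(toolbox). invokeForeignLoader itself is defined in the HadoopTask base class and is not part of this file, so the snippet below is only a minimal sketch of the reflective dispatch such a call implies; the class and method names (ForeignLoaderSketch, invoke) are illustrative assumptions, not Druid API.

// Minimal sketch (assumption): how a static runTask(String[]) entry point could be
// invoked by name through an isolated ClassLoader. The real logic lives in
// HadoopTask.invokeForeignLoader, which is not shown in this file.
import java.lang.reflect.Method;

public class ForeignLoaderSketch
{
  public static String invoke(String className, String[] args, ClassLoader loader) throws Exception
  {
    final ClassLoader previous = Thread.currentThread().getContextClassLoader();
    try {
      // Run with the Hadoop-aware classloader as the thread's context classloader
      Thread.currentThread().setContextClassLoader(loader);
      final Class<?> clazz = Class.forName(className, true, loader);
      final Method runTask = clazz.getMethod("runTask", String[].class);
      // Static method, so the receiver is null; wrap args to avoid varargs expansion
      return (String) runTask.invoke(null, new Object[]{args});
    }
    finally {
      Thread.currentThread().setContextClassLoader(previous);
    }
  }
}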




