org.apache.hadoop.hive.ql.exec.tez.TezProcessor

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec.tez;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.text.NumberFormat;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.hadoop.hive.conf.Constants;
import org.apache.tez.runtime.api.TaskFailureType;
import org.apache.tez.runtime.api.events.CustomProcessorEvent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.util.StringUtils;
import org.apache.tez.common.TezUtils;
import org.apache.tez.mapreduce.processor.MRTaskReporter;
import org.apache.tez.runtime.api.AbstractLogicalIOProcessor;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.ExecutionContext;
import org.apache.tez.runtime.api.LogicalInput;
import org.apache.tez.runtime.api.LogicalOutput;
import org.apache.tez.runtime.api.ProcessorContext;
import org.apache.tez.runtime.library.api.KeyValueWriter;

import com.google.common.base.Throwables;

/**
 * Hive processor for Tez that forms the vertices in Tez and processes the data.
 * Does what ExecMapper and ExecReducer do for Hive in the MR framework.
 */
public class TezProcessor extends AbstractLogicalIOProcessor {

  /**
   * Provides the ability to pass things into TezProcessor, which is normally impossible
   * because of how the Tez APIs are structured; it piggybacks on ExecutionContext.
   */
  public static interface Hook {
    void initializeHook(TezProcessor source);
  }
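
  // Illustrative note (not part of the original source): initialize() below checks whether the
  // ExecutionContext supplied by Tez also implements Hook and, if so, calls initializeHook(this)
  // before setupMRLegacyConfigs() runs. A hypothetical wiring could look like:
  //
  //   class HookedExecutionContext extends SomeExecutionContextImpl implements TezProcessor.Hook {
  //     @Override
  //     public void initializeHook(TezProcessor source) {
  //       JobConf conf = source.getConf();  // inspect or tweak the processor's configuration
  //     }
  //   }
  //
  // where HookedExecutionContext and SomeExecutionContextImpl are hypothetical names.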

  private static final Logger LOG = LoggerFactory.getLogger(TezProcessor.class);
  protected boolean isMap = false;

  protected RecordProcessor rproc = null;
  private final AtomicBoolean aborted = new AtomicBoolean(false);

  protected JobConf jobConf;

  private static final String CLASS_NAME = TezProcessor.class.getName();
  private final PerfLogger perfLogger = SessionState.getPerfLogger();

  // TODO: Replace with direct call to ProgressHelper, when reliably available.
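  // The helper is looked up and driven via reflection so that this class still works when
  // org.apache.tez.common.ProgressHelper is not available on the classpath; in that case
  // progress reporting is simply skipped (see the warnings below).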
  private static class ReflectiveProgressHelper {

    Configuration conf;
    Class<?> progressHelperClass = null;
    Object progressHelper = null;

    ReflectiveProgressHelper(Configuration conf,
                             Map<String, LogicalInput> inputs,
                             ProcessorContext processorContext,
                             String processorName) {
      this.conf = conf;
      try {
        progressHelperClass = this.conf.getClassByName("org.apache.tez.common.ProgressHelper");
        progressHelper = progressHelperClass.getDeclaredConstructor(Map.class, ProcessorContext.class, String.class)
                            .newInstance(inputs, processorContext, processorName);
        LOG.debug("ProgressHelper initialized!");
      }
      catch(Exception ex) {
        LOG.warn("Could not find ProgressHelper. " + ex);
      }
    }

    private boolean isValid() {
      return progressHelperClass != null && progressHelper != null;
    }

    void scheduleProgressTaskService(long delay, long period) {
      if (!isValid()) {
        LOG.warn("ProgressHelper uninitialized. Bailing on scheduleProgressTaskService()");
        return;
      }
      try {
        progressHelperClass.getDeclaredMethod("scheduleProgressTaskService", long.class, long.class)
            .invoke(progressHelper, delay, period);
        LOG.debug("scheduleProgressTaskService() called!");
      } catch (Exception exception) {
        LOG.warn("Could not scheduleProgressTaskService.", exception);
      }
    }

    void shutDownProgressTaskService() {
      if (!isValid()) {
        LOG.warn("ProgressHelper uninitialized. Bailing on scheduleProgressTaskService()");
        return;
      }
      try {
        progressHelperClass.getDeclaredMethod("shutDownProgressTaskService").invoke(progressHelper);
        LOG.debug("shutDownProgressTaskService() called!");
      }
      catch (Exception exception) {
        LOG.warn("Could not shutDownProgressTaskService.", exception);
      }
    }
  }

  protected ProcessorContext processorContext;
  private ReflectiveProgressHelper progressHelper;

  protected static final NumberFormat taskIdFormat = NumberFormat.getInstance();
  protected static final NumberFormat jobIdFormat = NumberFormat.getInstance();
  static {
    taskIdFormat.setGroupingUsed(false);
    taskIdFormat.setMinimumIntegerDigits(6);
    jobIdFormat.setGroupingUsed(false);
    jobIdFormat.setMinimumIntegerDigits(4);
  }

  public TezProcessor(ProcessorContext context) {
    super(context);
    ObjectCache.setupObjectRegistry(context.getObjectRegistry());
  }

  @Override
  public void close() throws IOException {
    // we have to close in the processor's run method, because tez closes inputs
    // before calling close (TEZ-955) and we might need to read inputs
    // when we flush the pipeline.
    if (progressHelper != null) {
      progressHelper.shutDownProgressTaskService();
    }
  }

  @Override
  public void handleEvents(List<Event> arg0) {
    // As of now this is only used for bucket map join; the list contains at most one event.
    assert arg0.size() <= 1;
    for (Event event : arg0) {
      CustomProcessorEvent cpEvent = (CustomProcessorEvent) event;
      ByteBuffer buffer = cpEvent.getPayload();
      // Get int view of the buffer
      IntBuffer intBuffer = buffer.asIntBuffer();
      jobConf.setInt(Constants.LLAP_NUM_BUCKETS, intBuffer.get(0));
      jobConf.setInt(Constants.LLAP_BUCKET_ID, intBuffer.get(1));
    }
  }
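
  // Illustrative note (not part of the original source): the CustomProcessorEvent payload read
  // above carries two ints, the bucket count followed by the bucket id. A sender would build
  // such a payload roughly like this (sketch only; the event-creation API itself is not shown):
  //
  //   ByteBuffer payload = ByteBuffer.allocate(8);
  //   payload.putInt(numBuckets).putInt(bucketId);
  //   payload.flip();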

  @Override
  public void initialize() throws IOException {
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_INITIALIZE_PROCESSOR);
    Configuration conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
    this.jobConf = new JobConf(conf);
    this.processorContext = getContext();
    ExecutionContext execCtx = processorContext.getExecutionContext();
    if (execCtx instanceof Hook) {
      ((Hook)execCtx).initializeHook(this);
    }
    setupMRLegacyConfigs(processorContext);
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INITIALIZE_PROCESSOR);
  }

  private void setupMRLegacyConfigs(ProcessorContext processorContext) {
    // Hive "insert overwrite local directory" uses task id as dir name
    // Setting the id in jobconf helps to have the similar dir name as MR
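    // With the formats set up in the static block above, the resulting id looks like
    // (illustrative values) "attempt_1423550890007_0004_m_000002_0": cluster timestamp,
    // 4-digit app id, "m" or "r", 6-digit task index, and the attempt number.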
    StringBuilder taskAttemptIdBuilder = new StringBuilder("attempt_");
    taskAttemptIdBuilder.append(processorContext.getApplicationId().getClusterTimestamp())
        .append("_")
        .append(jobIdFormat.format(processorContext.getApplicationId().getId()))
        .append("_");
    if (isMap) {
      taskAttemptIdBuilder.append("m_");
    } else {
      taskAttemptIdBuilder.append("r_");
    }
    taskAttemptIdBuilder.append(taskIdFormat.format(processorContext.getTaskIndex()))
      .append("_")
      .append(processorContext.getTaskAttemptNumber());

    // In MR, mapreduce.task.attempt.id is the same as mapred.task.id. Go figure.
    String taskAttemptIdStr = taskAttemptIdBuilder.toString();
    this.jobConf.set("mapred.task.id", taskAttemptIdStr);
    this.jobConf.set("mapreduce.task.attempt.id", taskAttemptIdStr);
    this.jobConf.setInt("mapred.task.partition", processorContext.getTaskIndex());
  }

  @Override
  public void run(Map<String, LogicalInput> inputs, Map<String, LogicalOutput> outputs)
      throws Exception {

    if (aborted.get()) {
      return;
    }

    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_RUN_PROCESSOR);
    // in case of broadcast-join read the broadcast edge inputs
    // (possibly asynchronously)

    if (LOG.isDebugEnabled()) {
      LOG.debug("Running task: " + getContext().getUniqueIdentifier());
    }

    synchronized (this) {
      // This check isn't absolutely mandatory, given the aborted check outside of the
      // Processor creation.
      if (aborted.get()) {
        return;
      }

      // leverage TEZ-3437: Improve synchronization and the progress report behavior.
      progressHelper = new ReflectiveProgressHelper(jobConf, inputs, getContext(), this.getClass().getSimpleName());


      // There should be no blocking operation in RecordProcessor creation,
      // otherwise the abort operation will not register since they are synchronized on the same
      // lock.
      if (isMap) {
        rproc = new MapRecordProcessor(jobConf, getContext());
      } else {
        rproc = new ReduceRecordProcessor(jobConf, getContext());
      }
    }

    progressHelper.scheduleProgressTaskService(0, 100);
    if (!aborted.get()) {
      initializeAndRunProcessor(inputs, outputs);
    }
    // TODO HIVE-14042. In case of an abort request, throw an InterruptedException
  }

  protected void initializeAndRunProcessor(Map<String, LogicalInput> inputs,
      Map<String, LogicalOutput> outputs)
      throws Exception {
    Throwable originalThrowable = null;
    try {

      MRTaskReporter mrReporter = new MRTaskReporter(getContext());
      // Init and run are both potentially long, and blocking operations. Synchronization
      // with the 'abort' operation will not work since if they end up blocking on a monitor
      // which does not belong to the lock, the abort will end up getting blocked.
      // Both of these method invocations need to handle the abort call on their own.
      rproc.init(mrReporter, inputs, outputs);
      rproc.run();

      // Done - the output does not need to be committed, as Hive does not use an OutputCommitter.
      perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_RUN_PROCESSOR);
    } catch (Throwable t) {
      originalThrowable = t;
    } finally {
      if (originalThrowable != null && (originalThrowable instanceof Error ||
        Throwables.getRootCause(originalThrowable) instanceof Error)) {
        LOG.error("Cannot recover from this FATAL error", StringUtils.stringifyException(originalThrowable));
        getContext().reportFailure(TaskFailureType.FATAL, originalThrowable,
                      "Cannot recover from this error");
        throw new RuntimeException(originalThrowable);
      }

      try {
        if (rproc != null) {
          rproc.close();
        }
      } catch (Throwable t) {
        if (originalThrowable == null) {
          originalThrowable = t;
        }
      }
      if (originalThrowable != null) {
        LOG.error(StringUtils.stringifyException(originalThrowable));
        if (originalThrowable instanceof InterruptedException) {
          throw (InterruptedException) originalThrowable;
        } else {
          throw new RuntimeException(originalThrowable);
        }
      }
    }
  }

  @Override
  public void abort() {
    RecordProcessor rProcLocal;
    synchronized (this) {
      LOG.info("Received abort");
      aborted.set(true);
      rProcLocal = rproc;
    }
    if (rProcLocal != null) {
      LOG.info("Forwarding abort to RecordProcessor");
      rProcLocal.abort();
    } else {
      LOG.info("RecordProcessor not yet setup. Abort will be ignored");
    }
  }

  /**
   * TezKVOutputCollector: an OutputCollector that writes using a Tez KeyValueWriter.
   * Must be initialized before it is used.
   */
  @SuppressWarnings("rawtypes")
  static class TezKVOutputCollector implements OutputCollector {
    private KeyValueWriter writer;
    private final LogicalOutput output;

    TezKVOutputCollector(LogicalOutput logicalOutput) {
      this.output = logicalOutput;
    }

    void initialize() throws Exception {
      this.writer = (KeyValueWriter) output.getWriter();
    }

    @Override
    public void collect(Object key, Object value) throws IOException {
      writer.write(key, value);
    }
  }
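
  // Illustrative usage sketch (not part of the original source): a record processor would wrap a
  // LogicalOutput and initialize the collector before the first write, e.g.
  //
  //   TezKVOutputCollector collector = new TezKVOutputCollector(logicalOutput);
  //   collector.initialize();            // must be called before the first collect()
  //   collector.collect(key, value);     // forwards to the Tez KeyValueWriter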

  public JobConf getConf() {
    return jobConf;
  }
}



