/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.io;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.Map.Entry;

import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.common.StringInternUtils;
import org.apache.hadoop.hive.common.ValidTxnWriteIdList;
import org.apache.hadoop.hive.common.ValidWriteIdList;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hive.common.util.Ref;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil;
import org.apache.hadoop.hive.llap.io.api.LlapIo;
import org.apache.hadoop.hive.llap.io.api.LlapProxy;
import org.apache.hadoop.hive.ql.exec.spark.SparkDynamicPartitionPruner;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorMapOperatorReadType;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.apache.hive.common.util.ReflectionUtil;

/**
 * HiveInputFormat is a parameterized InputFormat which looks at the path name
 * and determines the correct InputFormat for that path name from
 * mapredPlan.pathToPartitionInfo(). It can be used to read files with different
 * input formats in the same map-reduce job.
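 *
 * <p>A minimal usage sketch (illustrative only; {@code conf}, {@code partitionPaths} and
 * {@code numSplits} are placeholders, and the MapWork plan is assumed to have already been
 * serialized into the job by Hive's compiler):
 * <pre>{@code
 * JobConf job = new JobConf(conf);
 * job.setInputFormat(HiveInputFormat.class);           // dispatching input format
 * FileInputFormat.setInputPaths(job, partitionPaths);  // each path maps to a PartitionDesc
 * InputSplit[] splits = new HiveInputFormat().getSplits(job, numSplits);
 * // each HiveInputSplit remembers the concrete InputFormat class used to read it
 * }</pre>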
 */
public class HiveInputFormat<K extends WritableComparable, V extends Writable>
    implements InputFormat<K, V>, JobConfigurable {
  private static final String CLASS_NAME = HiveInputFormat.class.getName();
  private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME);

  /**
   * A cache of InputFormat instances.
   */
  private static final Map<Class, InputFormat<WritableComparable, Writable>> inputFormats
    = new ConcurrentHashMap<Class, InputFormat<WritableComparable, Writable>>();

  private JobConf job;

  // both fields are accessed by subclasses
  protected Map<Path, PartitionDesc> pathToPartitionInfo;
  protected MapWork mrwork;

  public static final class HiveInputSplitComparator implements Comparator<HiveInputSplit> {
    @Override
    public int compare(HiveInputSplit o1, HiveInputSplit o2) {
      int pathCompare = comparePath(o1.getPath(), o2.getPath());
      if (pathCompare != 0) {
        return pathCompare;
      }
      return Long.compare(o1.getStart(), o2.getStart());
    }

    private int comparePath(Path p1, Path p2) {
      return p1.compareTo(p2);
    }
  }


  /**
   * HiveInputSplit encapsulates an InputSplit with its corresponding
   * inputFormatClass. The reason it derives from FileSplit is to make sure
   * "map.input.file" is set correctly in MapTask.
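   *
   * <p>A minimal serialization round-trip sketch (illustrative only; {@code path}, {@code len}
   * and {@code conf} are placeholders, and the wrapped split could be any FileSplit subclass):
   * <pre>{@code
   * HiveInputSplit split = new HiveInputSplit(
   *     new FileSplit(path, 0, len, (String[]) null), TextInputFormat.class.getName());
   * DataOutputBuffer out = new DataOutputBuffer();
   * split.write(out);            // writes: split class name, split body, input format class name
   *
   * HiveInputSplit copy = new HiveInputSplit();
   * copy.setConf(conf);          // the conf is needed to reflectively recreate the inner split
   * DataInputBuffer in = new DataInputBuffer();
   * in.reset(out.getData(), out.getLength());
   * copy.readFields(in);
   * }</pre>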
   */
  public static class HiveInputSplit extends FileSplit implements InputSplit,
      Configurable {


    InputSplit inputSplit;
    String inputFormatClassName;

    public HiveInputSplit() {
      // This is the only public constructor of FileSplit
      super((Path) null, 0, 0, (String[]) null);
    }

    public HiveInputSplit(InputSplit inputSplit, String inputFormatClassName) {
      // This is the only public constructor of FileSplit
      super((Path) null, 0, 0, (String[]) null);
      this.inputSplit = inputSplit;
      this.inputFormatClassName = inputFormatClassName;
    }

    public InputSplit getInputSplit() {
      return inputSplit;
    }

    public String inputFormatClassName() {
      return inputFormatClassName;
    }

    @Override
    public Path getPath() {
      if (inputSplit instanceof FileSplit) {
        return ((FileSplit) inputSplit).getPath();
      }
      throw new RuntimeException(inputSplit + " is not a FileSplit");
    }

    /** The position of the first byte in the file to process. */
    @Override
    public long getStart() {
      if (inputSplit instanceof FileSplit) {
        return ((FileSplit) inputSplit).getStart();
      }
      return 0;
    }

    @Override
    public String toString() {
      return inputFormatClassName + ":" + inputSplit.toString();
    }

    @Override
    public long getLength() {
      long r = 0;
      try {
        r = inputSplit.getLength();
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
      return r;
    }

    @Override
    public String[] getLocations() throws IOException {
      return inputSplit.getLocations();
    }

    @Override
    public void readFields(DataInput in) throws IOException {
      String inputSplitClassName = in.readUTF();
      try {
        inputSplit = (InputSplit) ReflectionUtil.newInstance(conf
            .getClassByName(inputSplitClassName), conf);
      } catch (Exception e) {
        throw new IOException(
            "Cannot create an instance of InputSplit class = "
            + inputSplitClassName + ":" + e.getMessage(), e);
      }
      inputSplit.readFields(in);
      inputFormatClassName = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
      out.writeUTF(inputSplit.getClass().getName());
      inputSplit.write(out);
      out.writeUTF(inputFormatClassName);
    }

    Configuration conf;

    @Override
    public Configuration getConf() {
      return conf;
    }

    @Override
    public void setConf(Configuration conf) {
      this.conf = conf;
    }
  }

  @Override
  public void configure(JobConf job) {
    this.job = job;
  }

  public static InputFormat<WritableComparable, Writable> wrapForLlap(
      InputFormat<WritableComparable, Writable> inputFormat, Configuration conf,
      PartitionDesc part) throws HiveException {
    if (!HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_ENABLED, LlapProxy.isDaemon())) {
      return inputFormat; // LLAP not enabled, no-op.
    }
    String ifName = inputFormat.getClass().getCanonicalName();
    boolean isSupported = inputFormat instanceof LlapWrappableInputFormatInterface;
    boolean isCacheOnly = inputFormat instanceof LlapCacheOnlyInputFormatInterface;
    boolean isVectorized = Utilities.getIsVectorized(conf);
    if (!isVectorized) {
      // Pretend it's vectorized if the non-vector wrapper is enabled.
      isVectorized = HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_NONVECTOR_WRAPPER_ENABLED)
          && (Utilities.getPlanPath(conf) != null);
    }
    boolean isSerdeBased = false;
    if (isVectorized && !isSupported
        && HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_ENCODE_ENABLED)) {
      // See if we can use re-encoding to read the format through the IO elevator.
      isSupported = isSerdeBased = checkInputFormatForLlapEncode(conf, ifName);
    }
    if ((!isSupported || !isVectorized) && !isCacheOnly) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Not using llap for " + ifName + ": supported = "
          + isSupported + ", vectorized = " + isVectorized + ", cache only = " + isCacheOnly);
      }
      return inputFormat;
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Processing " + ifName);
    }

    @SuppressWarnings("unchecked")
    LlapIo<VectorizedRowBatch> llapIo = LlapProxy.getIo();
    if (llapIo == null) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Not using LLAP IO because it is not initialized");
      }
      return inputFormat;
    }
    Deserializer serde = null;
    if (isSerdeBased) {
      if (part == null) {
        if (isCacheOnly) {
          LOG.info("Using cache only because there's no partition spec for SerDe-based IF");
          injectLlapCaches(inputFormat, llapIo);
        } else {
          LOG.info("Not using LLAP IO because there's no partition spec for SerDe-based IF");
        }
        return inputFormat;
      }
      try {
        serde = part.getDeserializer(conf);
      } catch (Exception e) {
        throw new HiveException("Error creating SerDe for LLAP IO", e);
      }
    }
    if (isSupported && isVectorized) {
      InputFormat wrappedIf = llapIo.getInputFormat(inputFormat, serde);
      // null means we cannot wrap; the cause is logged inside.
      if (wrappedIf != null) {
        return castInputFormat(wrappedIf);
      }
    }
    if (isCacheOnly) {
      injectLlapCaches(inputFormat, llapIo);
    }
    return inputFormat;
  }

  private static boolean checkInputFormatForLlapEncode(Configuration conf, String ifName) {
    String formatList = HiveConf.getVar(conf, ConfVars.LLAP_IO_ENCODE_FORMATS);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Checking " + ifName + " against " + formatList);
    }
    String[] formats = StringUtils.getStrings(formatList);
    if (formats != null) {
      for (String format : formats) {
        // TODO: should we check isAssignableFrom?
        if (ifName.equals(format)) {
          if (LOG.isInfoEnabled()) {
            LOG.info("Using SerDe-based LLAP reader for " + ifName);
          }
          return true;
        }
      }
    }
    return false;
  }

  public static void injectLlapCaches(InputFormat<WritableComparable, Writable> inputFormat,
      LlapIo<VectorizedRowBatch> llapIo) {
    LOG.info("Injecting LLAP caches into " + inputFormat.getClass().getCanonicalName());
    llapIo.initCacheOnlyInputFormat(inputFormat);
  }

  public static boolean canWrapForLlap(Class clazz, boolean checkVector) {
    return LlapWrappableInputFormatInterface.class.isAssignableFrom(clazz) &&
        (!checkVector || BatchToRowInputFormat.class.isAssignableFrom(clazz));
  }

  public static boolean canInjectCaches(Class clazz) {
    return LlapCacheOnlyInputFormatInterface.class.isAssignableFrom(clazz);
  }

  @SuppressWarnings("unchecked")
  private static <T, U, V, W> InputFormat<T, U> castInputFormat(InputFormat<V, W> from) {
    // This is ugly in two ways...
    // 1) We assume that LlapWrappableInputFormatInterface has NullWritable as first parameter.
    //    Since we are using Java and not, say, a programming language, there's no way to check.
    // 2) We ignore the fact that 2nd arg is completely incompatible (VRB -> Writable), because
    //    vectorization currently works by magic, getting VRB from IF with non-VRB value param.
    // So we just cast blindly and hope for the best (which is obviously what happens).
    return (InputFormat<T, U>) from;
  }
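
  /*
   * A minimal sketch of the intended call sequence (hypothetical caller code mirroring
   * getRecordReader below; inputFormatClass, job and part are placeholders, and wrapForLlap
   * throws HiveException which the caller must handle):
   *
   *   InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
   *   inputFormat = HiveInputFormat.wrapForLlap(inputFormat, job, part); // opt back into LLAP IO
   */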

  /** NOTE: this no longer wraps the IF for LLAP. Call wrapForLlap manually if needed. */
  public static InputFormat<WritableComparable, Writable> getInputFormatFromCache(
    Class inputFormatClass, JobConf job) throws IOException {
    InputFormat<WritableComparable, Writable> instance = inputFormats.get(inputFormatClass);
    if (instance == null) {
      try {
        instance = (InputFormat<WritableComparable, Writable>) ReflectionUtil
            .newInstance(inputFormatClass, job);
        // HBase input formats are not thread safe today. See HIVE-8808.
        String inputFormatName = inputFormatClass.getName().toLowerCase();
        if (!inputFormatName.contains("hbase")) {
          inputFormats.put(inputFormatClass, instance);
        }
      } catch (Exception e) {
        throw new IOException("Cannot create an instance of InputFormat class "
            + inputFormatClass.getName() + " as specified in mapredWork!", e);
      }
    }
    return instance;
  }

  @Override
  public RecordReader getRecordReader(InputSplit split, JobConf job,
      Reporter reporter) throws IOException {
    HiveInputSplit hsplit = (HiveInputSplit) split;
    InputSplit inputSplit = hsplit.getInputSplit();
    String inputFormatClassName = null;
    Class inputFormatClass = null;
    try {
      inputFormatClassName = hsplit.inputFormatClassName();
      inputFormatClass = job.getClassByName(inputFormatClassName);
    } catch (Exception e) {
      throw new IOException("cannot find class " + inputFormatClassName, e);
    }

    if (this.mrwork == null || pathToPartitionInfo == null) {
      init(job);
    }

    boolean nonNative = false;
    PartitionDesc part = HiveFileFormatUtils.getFromPathRecursively(
        pathToPartitionInfo, hsplit.getPath(), null);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Found spec for " + hsplit.getPath() + " " + part + " from " + pathToPartitionInfo);
    }

    try {
      if ((part != null) && (part.getTableDesc() != null)) {
        Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), job);
        nonNative = part.getTableDesc().isNonNative();
      }
    } catch (HiveException e) {
      throw new IOException(e);
    }

    Path splitPath = hsplit.getPath();
    pushProjectionsAndFilters(job, inputFormatClass, splitPath, nonNative);

    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    try {
      inputFormat = HiveInputFormat.wrapForLlap(inputFormat, job, part);
    } catch (HiveException e) {
      throw new IOException(e);
    }
    RecordReader innerReader = null;
    try {
      innerReader = inputFormat.getRecordReader(inputSplit, job, reporter);
    } catch (Exception e) {
      innerReader = HiveIOExceptionHandlerUtil
          .handleRecordReaderCreationException(e, job);
    }
    HiveRecordReader rr = new HiveRecordReader(innerReader, job);
    rr.initIOContext(hsplit, job, inputFormatClass, innerReader);
    return rr;
  }

  protected void init(JobConf job) {
    if (mrwork == null || pathToPartitionInfo == null) {
      if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
        mrwork = (MapWork) Utilities.getMergeWork(job);
        if (mrwork == null) {
          mrwork = Utilities.getMapWork(job);
        }
      } else {
        mrwork = Utilities.getMapWork(job);
      }

      // Prune partitions
      if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")
          && HiveConf.isSparkDPPAny(job)) {
        SparkDynamicPartitionPruner pruner = new SparkDynamicPartitionPruner();
        try {
          pruner.prune(mrwork, job);
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }

      pathToPartitionInfo = mrwork.getPathToPartitionInfo();
    }
  }

  /*
   * addSplitsForGroup collects separate calls to setInputPaths into one where possible.
   * The reason for this is that it is faster on some InputFormats. E.g.: ORC will start
   * a thread pool to do the work, and calling it multiple times would create a lot of
   * unnecessary thread pools.
   */
  private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf,
      InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits,
      TableDesc table, List<InputSplit> result)
          throws IOException {
    ValidWriteIdList validWriteIdList = AcidUtils.getTableValidWriteIdList(conf, table.getTableName());
    ValidWriteIdList validMmWriteIdList;
    if (AcidUtils.isInsertOnlyTable(table.getProperties())) {
      if (validWriteIdList == null) {
        throw new IOException("Insert-Only table: " + table.getTableName()
                + " is missing from the ValidWriteIdList config: "
                + conf.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY));
      }
      validMmWriteIdList = validWriteIdList;
    } else {
      validMmWriteIdList = null;  // for non-MM case
    }

    try {
      Utilities.copyTablePropertiesToConf(table, conf);
      if (tableScan != null) {
        AcidUtils.setAcidOperationalProperties(conf, tableScan.getConf().isTranscationalTable(),
            tableScan.getConf().getAcidOperationalProperties());

        if (tableScan.getConf().isTranscationalTable() && (validWriteIdList == null)) {
          throw new IOException("Acid table: " + table.getTableName()
                  + " is missing from the ValidWriteIdList config: "
                  + conf.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY));
        }
        if (validWriteIdList != null) {
          AcidUtils.setValidWriteIdList(conf, validWriteIdList);
        }
      }
    } catch (HiveException e) {
      throw new IOException(e);
    }

    if (tableScan != null) {
      pushFilters(conf, tableScan, this.mrwork);
    }

    Path[] finalDirs = processPathsForMmRead(dirs, conf, validMmWriteIdList);
    if (finalDirs == null) {
      // This is for transactional tables.
      if (!conf.getBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, false)) {
        LOG.warn("No valid inputs found in " + dirs);
        return; // No valid inputs.
      } else if (validMmWriteIdList != null) {
        // AcidUtils.getAcidState() is already called to verify there is no input split.
        // Thus for a GroupByOperator summary row, set finalDirs and add a Dummy split here.
        finalDirs = dirs.toArray(new Path[dirs.size()]);
        result.add(new HiveInputSplit(new NullRowsInputFormat.DummyInputSplit(finalDirs[0].toString()),
            ZeroRowsInputFormat.class.getName()));
      }
    } else {
      FileInputFormat.setInputPaths(conf, finalDirs);
      conf.setInputFormat(inputFormat.getClass());

      int headerCount = 0;
      int footerCount = 0;
      if (table != null) {
        headerCount = Utilities.getHeaderCount(table);
        footerCount = Utilities.getFooterCount(table, conf);
        if (headerCount != 0 || footerCount != 0) {
          // Input file has a header or footer, so it cannot be split.
          HiveConf.setLongVar(conf, ConfVars.MAPREDMINSPLITSIZE, Long.MAX_VALUE);
        }
      }

      InputSplit[] iss = inputFormat.getSplits(conf, splits);
      for (InputSplit is : iss) {
        result.add(new HiveInputSplit(is, inputFormatClass.getName()));
      }
      if (iss.length == 0 && finalDirs.length > 0 && conf.getBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, false)) {
        // If there are no inputs, the execution engine skips the operator tree.
        // To prevent that from happening, an opaque ZeroRows input is added here when needed.
        result.add(new HiveInputSplit(new NullRowsInputFormat.DummyInputSplit(finalDirs[0].toString()),
                ZeroRowsInputFormat.class.getName()));
      }
    }
  }

  public static Path[] processPathsForMmRead(List<Path> dirs, JobConf conf,
      ValidWriteIdList validWriteIdList) throws IOException {
    if (validWriteIdList == null) {
      return dirs.toArray(new Path[dirs.size()]);
    } else {
      List<Path> finalPaths = new ArrayList<>(dirs.size());
      for (Path dir : dirs) {
        processForWriteIds(dir, conf, validWriteIdList, finalPaths);
      }
      if (finalPaths.isEmpty()) {
        return null;
      }
      return finalPaths.toArray(new Path[finalPaths.size()]);
    }
  }

  private static void processForWriteIds(Path dir, JobConf conf,
      ValidWriteIdList validWriteIdList, List<Path> finalPaths) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
      Utilities.FILE_OP_LOGGER.trace("Checking " + dir + " (root) for inputs");
    }
    // Ignore nullscan-optimized paths.
    if (fs instanceof NullScanFileSystem) {
      finalPaths.add(dir);
      return;
    }

    // Tez requires the use of recursive input dirs for union processing, so we have to look into
    // the directory to find the actual inputs.
    LinkedList<Path> subdirs = new LinkedList<>();
    subdirs.add(dir); // add itself as a starting point
    while (!subdirs.isEmpty()) {
      Path currDir = subdirs.poll();
      FileStatus[] files = fs.listStatus(currDir);
      boolean hadAcidState = false;   // whether getAcidState has been called for currDir
      for (FileStatus file : files) {
        Path path = file.getPath();
        if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
          Utilities.FILE_OP_LOGGER.trace("Checking " + path + " for inputs");
        }
        if (!file.isDirectory()) {
          Utilities.FILE_OP_LOGGER.warn("Ignoring a file not in MM directory " + path);
        } else if (AcidUtils.extractWriteId(path) == null) {
          subdirs.add(path);
        } else if (!hadAcidState) {
          AcidUtils.Directory dirInfo
                  = AcidUtils.getAcidState(currDir, conf, validWriteIdList, Ref.from(false), true, null);
          hadAcidState = true;

          // Find the base, created for IOW.
          Path base = dirInfo.getBaseDirectory();
          if (base != null) {
            finalPaths.add(base);
          }

          // Find the parsed delta files.
          for (AcidUtils.ParsedDelta delta : dirInfo.getCurrentDirectories()) {
            Utilities.FILE_OP_LOGGER.debug("Adding input " + delta.getPath());
            finalPaths.add(delta.getPath());
          }
        }
      }
    }
  }

  Path[] getInputPaths(JobConf job) throws IOException {
    Path[] dirs;
    if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
      dirs = mrwork.getPathToPartitionInfo().keySet().toArray(new Path[]{});
    } else {
      dirs = FileInputFormat.getInputPaths(job);
      if (dirs.length == 0) {
        // on tez we avoid duplicating the file info in FileInputFormat.
        if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
          try {
            List<Path> paths = Utilities.getInputPathsTez(job, mrwork);
            dirs = paths.toArray(new Path[paths.size()]);
          } catch (Exception e) {
            throw new IOException("Could not create input files", e);
          }
        } else {
          throw new IOException("No input paths specified in job");
        }
      }
    }
    StringInternUtils.internUriStringsInPathArray(dirs);
    return dirs;
  }

  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
    init(job);
    Path[] dirs = getInputPaths(job);
    JobConf newjob = new JobConf(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    List<Path> currentDirs = new ArrayList<Path>();
    Class<? extends InputFormat> currentInputFormatClass = null;
    TableDesc currentTable = null;
    TableScanOperator currentTableScan = null;

    boolean pushDownProjection = false;
    // Buffers to hold filter pushdown information
    StringBuilder readColumnsBuffer = new StringBuilder(newjob.
      get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""));
    StringBuilder readColumnNamesBuffer = new StringBuilder(newjob.
      get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""));
    // for each dir, get the InputFormat, and do getSplits.
    for (Path dir : dirs) {
      PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
      Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
      TableDesc table = part.getTableDesc();
      TableScanOperator tableScan = null;

      List<String> aliases = mrwork.getPathToAliases().get(dir);

      // Make filter pushdown information available to getSplits.
      if ((aliases != null) && (aliases.size() == 1)) {
        Operator<? extends OperatorDesc> op = mrwork.getAliasToWork().get(aliases.get(0));
        if ((op != null) && (op instanceof TableScanOperator)) {
          tableScan = (TableScanOperator) op;
          // Reset buffers to store filter pushdown columns
          readColumnsBuffer.setLength(0);
          readColumnNamesBuffer.setLength(0);
          // push down projections.
          ColumnProjectionUtils.appendReadColumns(readColumnsBuffer, readColumnNamesBuffer,
            tableScan.getNeededColumnIDs(), tableScan.getNeededColumns());
          pushDownProjection = true;
          // push down filters
          pushFilters(newjob, tableScan, this.mrwork);
        }
      } else {
        if (LOG.isDebugEnabled()) {
          LOG.debug("aliases: {} pathToAliases: {} dir: {}", aliases, mrwork.getPathToAliases(), dir);
        }
      }

      if (!currentDirs.isEmpty() &&
          inputFormatClass.equals(currentInputFormatClass) &&
          table.equals(currentTable) &&
          tableScan == currentTableScan) {
        currentDirs.add(dir);
        continue;
      }

      if (!currentDirs.isEmpty()) {
        if (LOG.isInfoEnabled()) {
          LOG.info("Generating splits as currentDirs is not empty. currentDirs: {}", currentDirs);
        }

        // set columns to read in conf
        if (pushDownProjection) {
          pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
        }

        addSplitsForGroup(currentDirs, currentTableScan, newjob,
            getInputFormatFromCache(currentInputFormatClass, job),
            currentInputFormatClass, currentDirs.size()*(numSplits / dirs.length),
            currentTable, result);
      }

      currentDirs.clear();
      currentDirs.add(dir);
      currentTableScan = tableScan;
      currentTable = table;
      currentInputFormatClass = inputFormatClass;
    }

    // set columns to read in conf
    if (pushDownProjection) {
      pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
    }

    if (dirs.length != 0) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Generating splits for dirs: {}", dirs);
      }
      addSplitsForGroup(currentDirs, currentTableScan, newjob,
          getInputFormatFromCache(currentInputFormatClass, job),
          currentInputFormatClass, currentDirs.size()*(numSplits / dirs.length),
          currentTable, result);
    }

    Utilities.clearWorkMapForConf(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("number of splits " + result.size());
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
    return result.toArray(new HiveInputSplit[result.size()]);
  }

  private void pushProjection(final JobConf newjob, final StringBuilder readColumnsBuffer,
      final StringBuilder readColumnNamesBuffer) {
    String readColIds = readColumnsBuffer.toString();
    String readColNames = readColumnNamesBuffer.toString();
    newjob.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    newjob.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIds);
    newjob.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNames);

    if (LOG.isInfoEnabled()) {
      LOG.info("{} = {}", ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIds);
      LOG.info("{} = {}", ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNames);
    }
  }


  protected static PartitionDesc getPartitionDescFromPath(
      Map<Path, PartitionDesc> pathToPartitionInfo, Path dir)
      throws IOException {
    PartitionDesc partDesc = pathToPartitionInfo.get(dir);
    if (partDesc == null) {
      // Note: we could call HiveFileFormatUtils.getPartitionDescFromPathRecursively for MM tables.
      //       The recursive call is usually needed for non-MM tables, because the path management
      //       is not strict and the code does whatever. That should not happen for MM tables.
      //       Keep it like this for now; may need replacement if we find a valid use case.
      partDesc = pathToPartitionInfo.get(Path.getPathWithoutSchemeAndAuthority(dir));
    }
    if (partDesc == null) {
      throw new IOException("cannot find dir = " + dir.toString()
          + " in " + pathToPartitionInfo);
    }

    return partDesc;
  }

  public static void pushFilters(JobConf jobConf, TableScanOperator tableScan,
    final MapWork mrwork) {

    // ensure filters are not set from previous pushFilters
    jobConf.unset(TableScanDesc.FILTER_TEXT_CONF_STR);
    jobConf.unset(TableScanDesc.FILTER_EXPR_CONF_STR);

    Utilities.unsetSchemaEvolution(jobConf);

    TableScanDesc scanDesc = tableScan.getConf();
    if (scanDesc == null) {
      return;
    }

    Utilities.addTableSchemaToConf(jobConf, tableScan);

    // construct column name list and types for reference by filter push down
    Utilities.setColumnNameList(jobConf, tableScan);
    Utilities.setColumnTypeList(jobConf, tableScan);
    // push down filters
    ExprNodeGenericFuncDesc filterExpr = (ExprNodeGenericFuncDesc)scanDesc.getFilterExpr();
    if (filterExpr == null) {
      return;
    }

    // disable filter pushdown for mapreduce when there is more than one table alias,
    // since we don't clone jobConf per alias
    if (mrwork != null && mrwork.getAliases() != null && mrwork.getAliases().size() > 1 &&
      jobConf.get(ConfVars.HIVE_EXECUTION_ENGINE.varname).equals("mr")) {
      return;
    }

    String serializedFilterObj = scanDesc.getSerializedFilterObject();
    String serializedFilterExpr = scanDesc.getSerializedFilterExpr();
    boolean hasObj = serializedFilterObj != null, hasExpr = serializedFilterExpr != null;
    if (!hasObj) {
      Serializable filterObject = scanDesc.getFilterObject();
      if (filterObject != null) {
        serializedFilterObj = SerializationUtilities.serializeObject(filterObject);
      }
    }
    if (serializedFilterObj != null) {
      jobConf.set(TableScanDesc.FILTER_OBJECT_CONF_STR, serializedFilterObj);
    }
    if (!hasExpr) {
      serializedFilterExpr = SerializationUtilities.serializeExpression(filterExpr);
    }
    String filterText = filterExpr.getExprString();
    if (LOG.isDebugEnabled()) {
      LOG.debug("Pushdown initiated with filterText = " + filterText + ", filterExpr = "
          + filterExpr + ", serializedFilterExpr = " + serializedFilterExpr + " ("
          + (hasExpr ? "desc" : "new") + ")" + (serializedFilterObj == null ? "" :
            (", serializedFilterObj = " + serializedFilterObj + " (" + (hasObj ? "desc" : "new")
                + ")")));
    }
    jobConf.set(TableScanDesc.FILTER_TEXT_CONF_STR, filterText);
    jobConf.set(TableScanDesc.FILTER_EXPR_CONF_STR, serializedFilterExpr);
  }

  protected void pushProjectionsAndFilters(JobConf jobConf, Class inputFormatClass,
      Path splitPath) {
    pushProjectionsAndFilters(jobConf, inputFormatClass, splitPath, false);
  }

  protected void pushProjectionsAndFilters(JobConf jobConf, Class inputFormatClass,
      Path splitPath, boolean nonNative) {
    Path splitPathWithNoSchema = Path.getPathWithoutSchemeAndAuthority(splitPath);
    if (this.mrwork == null) {
      init(job);
    }

    if(this.mrwork.getPathToAliases() == null) {
      return;
    }

    ArrayList<String> aliases = new ArrayList<String>();
    Iterator<Entry<Path, ArrayList<String>>> iterator = this.mrwork
        .getPathToAliases().entrySet().iterator();

    Set<Path> splitParentPaths = null;
    int pathsSize = this.mrwork.getPathToAliases().entrySet().size();
    while (iterator.hasNext()) {
      Entry<Path, ArrayList<String>> entry = iterator.next();
      Path key = entry.getKey();
      boolean match;
      if (nonNative) {
        // For non-native tables, we need to do an exact match to avoid
        // HIVE-1903.  (The table location contains no files, and the string
        // representation of its path does not have a trailing slash.)
        match =
          splitPath.equals(key) || splitPathWithNoSchema.equals(key);
      } else {
        // But for native tables, we need to do a prefix match for
        // subdirectories.  (Unlike non-native tables, prefix mixups don't seem
        // to be a potential problem here since we are always dealing with the
        // path to something deeper than the table location.)
        if (pathsSize > 1) {
          // Comparing paths multiple times creates lots of objects &
          // creates GC pressure for tables having large number of partitions.
          // In such cases, use pre-computed paths for comparison
          if (splitParentPaths == null) {
            splitParentPaths = new HashSet<>();
            FileUtils.populateParentPaths(splitParentPaths, splitPath);
            FileUtils.populateParentPaths(splitParentPaths, splitPathWithNoSchema);
          }
          match = splitParentPaths.contains(key);
        } else {
          match = FileUtils.isPathWithinSubtree(splitPath, key)
              || FileUtils.isPathWithinSubtree(splitPathWithNoSchema, key);
        }
      }
      if (match) {
        ArrayList<String> list = entry.getValue();
        for (String val : list) {
          aliases.add(val);
        }
      }
    }

    for (String alias : aliases) {
      Operator<? extends OperatorDesc> op = this.mrwork.getAliasToWork().get(
        alias);
      if (op instanceof TableScanOperator) {
        TableScanOperator ts = (TableScanOperator) op;
        // push down projections.
        ColumnProjectionUtils.appendReadColumns(
            jobConf, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths());
        // push down filters
        pushFilters(jobConf, ts, this.mrwork);

        AcidUtils.setAcidOperationalProperties(job, ts.getConf().isTranscationalTable(),
            ts.getConf().getAcidOperationalProperties());
        AcidUtils.setValidWriteIdList(job, ts.getConf());
      }
    }
  }
}



