org.apache.hadoop.hive.ql.stats.BasicStatsNoJobTask
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.stats;

import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.HiveStatsUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.StatsTask;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec;
import org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.apache.hive.common.util.ReflectionUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimaps;

/**
 * BasicStatsNoJobTask is used when stats collection is the only task for the given query (there
 * is no parent MR or Tez job), e.g. ANALYZE ... NOSCAN on file formats that implement the
 * StatsProvidingRecordReader interface. ORC, which implements StatsProvidingRecordReader, stores
 * statistics for all columns in the file footer, so it is much faster to compute table/partition
 * statistics by reading the footer than by scanning all the rows. This task computes basic stats
 * such as numFiles, numRows, fileSize and rawDataSize from the ORC footer.
 */
public class BasicStatsNoJobTask implements IStatsProcessor {
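  // Illustrative trigger for this task (hypothetical table and partition names):
  //   ANALYZE TABLE web_logs PARTITION (ds = '2024-01-01') COMPUTE STATISTICS NOSCAN;
  // For an ORC table this statement is served by reading file footers; no MR/Tez job runs.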

  private static final Logger LOG = LoggerFactory.getLogger(BasicStatsNoJobTask.class);
  private HiveConf conf;

  private BasicStatsNoJobWork work;
  private LogHelper console;

  public BasicStatsNoJobTask(HiveConf conf, BasicStatsNoJobWork work) {
    this.conf = conf;
    this.work = work;
    console = new LogHelper(LOG);
  }


  @Override
  public void initialize(CompilationOpContext opContext) {

  }

  @Override
  public int process(Hive db, Table tbl) throws Exception {

    LOG.info("Executing stats (no job) task");

    ExecutorService threadPool = StatsTask.newThreadPool(conf);

    return aggregateStats(threadPool, db);
  }

  public StageType getType() {
    return StageType.STATS;
  }

  public String getName() {
    return "STATS-NO-JOB";
  }

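  // Simple holder pairing a Partish with its collected stats parameters and result object.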
  static class StatItem {
    Partish partish;
    Map<String, String> params;
    Object result;
  }

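  // Runnable that computes basic stats for one table or partition (a "Partish") by
  // instantiating its input format and reading per-file footer statistics.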
  static class FooterStatCollector implements Runnable {

    private Partish partish;
    private Object result;
    private JobConf jc;
    private Path dir;
    private FileSystem fs;
    private LogHelper console;

    public FooterStatCollector(JobConf jc, Partish partish) {
      this.jc = jc;
      this.partish = partish;
    }

    public static final Function<FooterStatCollector, String> SIMPLE_NAME_FUNCTION = new Function<FooterStatCollector, String>() {

      @Override
      public String apply(FooterStatCollector sc) {
        return String.format("%s#%s", sc.partish.getTable().getCompleteName(), sc.partish.getPartishType());
      }
    };

    private static final Function<FooterStatCollector, Partition> EXTRACT_RESULT_FUNCTION = new Function<FooterStatCollector, Partition>() {
      @Override
      public Partition apply(FooterStatCollector input) {
        return (Partition) input.result;
      }
    };

    private boolean isValid() {
      return result != null;
    }

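    // Called on the submitting thread, before this collector is scheduled on the pool.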
    public void init(HiveConf conf, LogHelper console) throws IOException {
      this.console = console;
      dir = new Path(partish.getPartSd().getLocation());
      fs = dir.getFileSystem(conf);
    }

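    // Walks every file under the target location, accumulating numRows and rawDataSize
    // from footer-provided reader stats and numFiles/totalSize from the listing, then
    // records the aggregates in the table/partition parameters.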
    @Override
    public void run() {

      Map<String, String> parameters = partish.getPartParameters();
      try {
        long numRows = 0;
        long rawDataSize = 0;
        long fileSize = 0;
        long numFiles = 0;
        // Transactional tables must only count files that are valid for the current
        // snapshot, so their file list comes from AcidUtils rather than a plain listing.
        Utilities.FILE_OP_LOGGER.debug("Aggregating stats for {}", dir);
        List<FileStatus> fileList = null;
        if (partish.getTable() != null
            && AcidUtils.isTransactionalTable(partish.getTable())) {
          fileList = AcidUtils.getAcidFilesForStats(partish.getTable(), dir, jc, fs);
        } else {
          fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
        }

        for (FileStatus file : fileList) {
          Utilities.FILE_OP_LOGGER.debug("Computing stats for {}", file);
          if (!file.isDirectory()) {
            InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(partish.getInputFormatClass(), jc);
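            // A zero-length dummy split is sufficient here: a footer-based reader exposes
            // its stats without iterating over any rows.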
            InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { partish.getLocation() });
            if (file.getLen() == 0) {
              numFiles += 1;
            } else {
              org.apache.hadoop.mapred.RecordReader<?, ?> recordReader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
              try {
                if (recordReader instanceof StatsProvidingRecordReader) {
                  StatsProvidingRecordReader statsRR;
                  statsRR = (StatsProvidingRecordReader) recordReader;
                  rawDataSize += statsRR.getStats().getRawDataSize();
                  numRows += statsRR.getStats().getRowCount();
                  fileSize += file.getLen();
                  numFiles += 1;
                } else {
                  throw new HiveException(String.format("Unexpected file found during reading footers for: %s ", file));
                }
              } finally {
                recordReader.close();
              }
            }
          }
        }

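        // Mark the basic stats as accurate before attaching the aggregated values.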
        StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);

        parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows));
        parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize));
        parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize));
        parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles));

        if (partish.getPartition() != null) {
          result = new Partition(partish.getTable(), partish.getPartition().getTPartition());
        } else {
          result = new Table(partish.getTable().getTTable());
        }

        String msg = partish.getSimpleName() + " stats: [" + toString(parameters) + ']';
        LOG.debug(msg);
        console.printInfo(msg);

      } catch (Exception e) {
        console.printInfo("[Warning] could not update stats for " + partish.getSimpleName() + ".", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
      }
    }

    private String toString(Map<String, String> parameters) {
      StringBuilder builder = new StringBuilder();
      for (String statType : StatsSetupConst.supportedStats) {
        String value = parameters.get(statType);
        if (value != null) {
          if (builder.length() > 0) {
            builder.append(", ");
          }
          builder.append(statType).append('=').append(value);
        }
      }
      return builder.toString();
    }

  }

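  // Builds one FooterStatCollector per partition (or one for the table itself when it is
  // unpartitioned), runs them on the shared thread pool, then persists the results.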
  private int aggregateStats(ExecutorService threadPool, Hive db) {
    int ret = 0;
    try {
      JobConf jc = new JobConf(conf);

      TableSpec tableSpecs = work.getTableSpecs();

      if (tableSpecs == null) {
        throw new RuntimeException("this is unexpected...needs some investigation");
      }

      Table table = tableSpecs.tableHandle;

      Collection<Partition> partitions = null;
      if (work.getPartitions() == null || work.getPartitions().isEmpty()) {
        if (table.isPartitioned()) {
          partitions = tableSpecs.partitions;
        }
      } else {
        partitions = work.getPartitions();
      }

      LinkedList<Partish> partishes = Lists.newLinkedList();
      if (partitions == null) {
        partishes.add(Partish.buildFor(table));
      } else {
        for (Partition part : partitions) {
          partishes.add(Partish.buildFor(table, part));
        }
      }

      List<FooterStatCollector> scs = Lists.newArrayList();
      for (Partish partish : partishes) {
        scs.add(new FooterStatCollector(jc, partish));
      }

      for (FooterStatCollector sc : scs) {
        sc.init(conf, console);
        threadPool.execute(sc);
      }

      LOG.debug("Stats collection waiting for threadpool to shutdown..");
      shutdownAndAwaitTermination(threadPool);
      LOG.debug("Stats collection threadpool shutdown successful.");

      ret = updatePartitions(db, scs, table);

    } catch (Exception e) {
      console.printError("Failed to collect footer statistics.", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
      // Fail the query if the stats are supposed to be reliable
      if (work.isStatsReliable()) {
        ret = -1;
      }
    }

    // The return value of 0 indicates success,
    // anything else indicates failure
    return ret;
  }

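  // Pushes the collected stats back to the metastore: one alterTable call for an
  // unpartitioned table, or a single bulk alterPartitions call for all partitions.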
  private int updatePartitions(Hive db, List<FooterStatCollector> scs, Table table) throws InvalidOperationException, HiveException {

    String tableFullName = table.getFullyQualifiedName();

    if (scs.isEmpty()) {
      return 0;
    }
    if (work.isStatsReliable()) {
      for (FooterStatCollector statsCollection : scs) {
        if (statsCollection.result == null) {
          LOG.debug("Stats requested to be reliable. Empty stats found: {}", statsCollection.partish.getSimpleName());
          return -1;
        }
      }
    }
    List<FooterStatCollector> validCollectors = Lists.newArrayList();
    for (FooterStatCollector statsCollection : scs) {
      if (statsCollection.isValid()) {
        validCollectors.add(statsCollection);
      }
    }

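    // Tell the metastore not to recompute stats during the alter calls below; the
    // parameters attached by the collectors are authoritative.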
    EnvironmentContext environmentContext = new EnvironmentContext();
    environmentContext.putToProperties(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE);

    ImmutableListMultimap<String, FooterStatCollector> collectorsByTable = Multimaps.index(validCollectors, FooterStatCollector.SIMPLE_NAME_FUNCTION);

    LOG.debug("Collectors.size(): {}", collectorsByTable.keySet());

    if (collectorsByTable.keySet().isEmpty()) {
      LOG.warn("No valid stats collectors found for {}", tableFullName);
    }

    // for now this should be true...
    assert (collectorsByTable.keySet().size() <= 1);

    LOG.debug("Updating stats for: {}", tableFullName);

    for (String partName : collectorsByTable.keySet()) {
      ImmutableList<FooterStatCollector> values = collectorsByTable.get(partName);

      if (values == null) {
        throw new RuntimeException("unexpected null collector group for " + partName);
      }

      if (values.get(0).result instanceof Table) {
        db.alterTable(tableFullName, (Table) values.get(0).result, environmentContext);
        LOG.debug("Updated stats for {}.", tableFullName);
      } else {
        if (values.get(0).result instanceof Partition) {
          List<Partition> results = Lists.transform(values, FooterStatCollector.EXTRACT_RESULT_FUNCTION);
          db.alterPartitions(tableFullName, results, environmentContext);
          LOG.debug("Bulk updated {} partitions of {}.", results.size(), tableFullName);
        } else {
          throw new RuntimeException("inconsistent");
        }
      }
    }
    LOG.debug("Updated stats for: {}", tableFullName);
    return 0;
  }

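  // Standard two-phase ExecutorService shutdown: stop accepting new work, wait for
  // running collectors to drain, then force-cancel anything still outstanding.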
  private void shutdownAndAwaitTermination(ExecutorService threadPool) {

    // Disable new tasks from being submitted
    threadPool.shutdown();
    try {

      // Wait for existing tasks to terminate; this intentionally loops with no overall
      // deadline, polling in 10 second increments until the pool drains.
      while (!threadPool.awaitTermination(10, TimeUnit.SECONDS)) {
        LOG.debug("Waiting for all stats tasks to finish...");
      }
      // Cancel currently executing tasks
      threadPool.shutdownNow();

      // Wait a while for tasks to respond to being cancelled
      if (!threadPool.awaitTermination(100, TimeUnit.SECONDS)) {
        LOG.debug("Stats collection thread pool did not terminate");
      }
    } catch (InterruptedException ie) {

      // Cancel again if current thread also interrupted
      threadPool.shutdownNow();

      // Preserve interrupt status
      Thread.currentThread().interrupt();
    }
  }

  @Override
  public void setDpPartSpecs(Collection dpPartSpecs) {
  }
}
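A minimal sketch of how this task might be driven, assuming BasicStatsNoJobWork exposes a TableSpec-based constructor and that an open Hive session and a resolved TableSpec are available (the driver code is illustrative, not part of this file):

    // Hypothetical driver: build the work unit and run the task directly.
    HiveConf conf = new HiveConf();
    BasicStatsNoJobWork work = new BasicStatsNoJobWork(tableSpec);
    BasicStatsNoJobTask task = new BasicStatsNoJobTask(conf, work);
    int rc = task.process(Hive.get(conf), tableSpec.tableHandle); // 0 indicates success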



