/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.stats;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.UpdateTransactionalStatsRequest;
import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec;
import org.apache.hadoop.hive.ql.plan.BasicStatsWork;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import static java.util.Collections.emptyList;
import static java.util.Collections.singletonList;
import static java.util.Collections.unmodifiableList;
import static org.apache.hadoop.hive.common.StatsSetupConst.DELETE_COUNT;
import static org.apache.hadoop.hive.common.StatsSetupConst.INSERT_COUNT;
import static org.apache.hadoop.hive.common.StatsSetupConst.UPDATE_COUNT;
/**
 * StatsTask implementation. StatsTask mainly deals with "collectable" stats. These are
 * stats that require data scanning and are collected during query execution (unless the user
 * explicitly requests data scanning just for the purpose of stats computation using the "ANALYZE"
 * command). All other stats are computed directly by the MetaStore. The rationale is that the
 * MetaStore layer covers all Thrift calls and provides better guarantees about the accuracy of
 * those stats.
 **/
public class BasicStatsTask implements Serializable, IStatsProcessor {
private static final long serialVersionUID = 1L;
private static transient final Logger LOG = LoggerFactory.getLogger(BasicStatsTask.class);
private Table table;
private Collection<Partition> dpPartSpecs;
public boolean followedColStats;
private BasicStatsWork work;
private HiveConf conf;
protected transient LogHelper console;
public BasicStatsTask(HiveConf conf, BasicStatsWork work) {
super();
dpPartSpecs = null;
this.conf = conf;
console = new LogHelper(LOG);
this.work = work;
}
@Override
public int process(Hive db, Table tbl) throws Exception {
LOG.info("Executing stats task");
table = tbl;
return aggregateStats(db, tbl);
}
@Override
public void initialize(CompilationOpContext opContext) {
}
public StageType getType() {
return StageType.STATS;
}
public String getName() {
return "STATS";
}
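/**
 * Computes basic statistics for a single table or partition (a Partish): quick stats derived
 * from file statuses (or provided by the storage handler for non-native tables), optionally
 * combined with counters published through a StatsAggregator, merged into the table/partition
 * parameters.
 */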
private static class BasicStatsProcessor {
private Partish partish;
private List<FileStatus> partfileStatus;
private boolean isMissingAcidState = false;
private BasicStatsWork work;
private boolean followedColStats1;
private Map<String, String> providedBasicStats;
public BasicStatsProcessor(Partish partish, BasicStatsWork work, HiveConf conf, boolean followedColStats2) {
this.partish = partish;
this.work = work;
followedColStats1 = followedColStats2;
Table table = partish.getTable();
if (table.isNonNative() && table.getStorageHandler().canProvideBasicStatistics()) {
providedBasicStats = table.getStorageHandler().getBasicStatistics(partish);
}
}
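/**
 * Merges the collected file statuses (or storage-handler-provided stats) and any aggregated
 * counters into the Partish's parameter map. Returns the updated table/partition object, or
 * null when no file statuses are available.
 */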
public Object process(StatsAggregator statsAggregator) throws HiveException, MetaException {
Partish p = partish;
Map<String, String> parameters = p.getPartParameters();
if (work.isTargetRewritten()) {
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
}
// work.getTableSpecs() == null means this is not an ANALYZE command;
// in that case, if the task is not followed by column stats, we should
// clear the column stats
// FIXME: move this to the ColStat-related part
if (!work.isExplicitAnalyze() && !followedColStats1) {
StatsSetupConst.clearColumnStatsState(parameters);
}
if (partfileStatus == null && providedBasicStats == null) {
// This may happen if ACID state is absent from config.
String spec = partish.getPartition() == null ? partish.getTable().getTableName()
: partish.getPartition().getSpec().toString();
LOG.warn("Partition/partfiles is null for: " + spec);
if (isMissingAcidState) {
MetaStoreServerUtils.clearQuickStats(parameters);
return p.getOutput();
}
return null;
}
// The collectable stats for the aggregator need to be cleared.
// For example, if a file is being loaded, the old number of rows is not valid
// XXX: makes no sense to me... possibly not needed anymore
if (work.isClearAggregatorStats()) {
// we choose to keep the invalid stats and only change the setting.
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
}
if (providedBasicStats == null) {
MetaStoreServerUtils.populateQuickStats(partfileStatus, parameters);
if (statsAggregator != null) {
// Update stats for transactional tables (MM, or full ACID with overwrite), even
// though we are marking stats as not being accurate.
if (StatsSetupConst.areBasicStatsUptoDate(parameters) || p.isTransactionalTable()) {
String prefix = getAggregationPrefix(p.getTable(), p.getPartition());
updateStats(statsAggregator, parameters, prefix);
}
}
} else {
parameters.putAll(providedBasicStats);
}
return p.getOutput();
}
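/**
 * Gathers the FileStatus list backing this Partish: directly from the warehouse for
 * non-transactional tables, or via AcidUtils for transactional tables. Skipped entirely when
 * the storage handler already supplied the basic statistics.
 */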
public void collectFileStatus(Warehouse wh, HiveConf conf) throws MetaException, IOException {
if (providedBasicStats == null) {
if (!partish.isTransactionalTable()) {
partfileStatus = wh.getFileStatusesForSD(partish.getPartSd());
} else {
Path path = new Path(partish.getPartSd().getLocation());
partfileStatus = AcidUtils.getAcidFilesForStats(partish.getTable(), path, conf, null);
isMissingAcidState = true;
}
}
}
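/**
 * Pulls each compute-required statistic (e.g. row count) from the aggregator under the given
 * key prefix and stores it in the parameter map, adding it to the existing value unless the
 * target was rewritten (e.g. INSERT OVERWRITE).
 */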
private void updateStats(StatsAggregator statsAggregator, Map<String, String> parameters,
String aggKey) throws HiveException {
for (String statType : StatsSetupConst.STATS_REQUIRE_COMPUTE) {
String value = statsAggregator.aggregateStats(aggKey, statType);
if (value != null && !value.isEmpty()) {
long longValue = Long.parseLong(value);
if (!work.isTargetRewritten()) {
String originalValue = parameters.get(statType);
if (originalValue != null) {
longValue += Long.parseLong(originalValue); // todo: invalid + valid = invalid
}
}
parameters.put(statType, String.valueOf(longValue));
}
}
}
}
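/**
 * Forwards the per-operation counters (insert/update/delete counts) of transactional tables
 * from the stats aggregator to the metastore.
 */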
private static class TransactionalStatsProcessor {
private final Hive db;
private final Partish partish;
private TransactionalStatsProcessor(Hive db, Partish partish) {
this.db = db;
this.partish = partish;
}
private long toLong(String value) {
if (value == null || value.isEmpty()) {
return 0;
}
return Long.parseLong(value);
}
public void process(StatsAggregator statsAggregator) throws HiveException, MetaException {
if (statsAggregator == null) {
return;
}
if (partish.isTransactionalTable()) {
String prefix = getAggregationPrefix(partish.getTable(), partish.getPartition());
long insertCount = toLong(statsAggregator.aggregateStats(prefix, INSERT_COUNT));
long updateCount = toLong(statsAggregator.aggregateStats(prefix, UPDATE_COUNT));
long deleteCount = toLong(statsAggregator.aggregateStats(prefix, DELETE_COUNT));
if (insertCount > 0 || updateCount > 0 || deleteCount > 0) {
UpdateTransactionalStatsRequest request = new UpdateTransactionalStatsRequest();
request.setTableId(partish.getTable().getTTable().getId());
request.setInsertCount(insertCount);
request.setUpdatedCount(updateCount);
request.setDeletedCount(deleteCount);
db.updateTransactionalStatistics(request);
}
}
}
}
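/**
 * Drives the stats collection: resolves the partitions to update, collects file statuses
 * (in parallel for partitioned tables), merges basic and transactional stats, and persists
 * them via alterTable/alterPartitions. Returns 0 on success; returns 1 on failure only when
 * stats are configured to be reliable.
 */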
private int aggregateStats(Hive db, Table tbl) {
StatsAggregator statsAggregator = null;
int ret = 0;
StatsCollectionContext scc = null;
EnvironmentContext environmentContext = null;
environmentContext = new EnvironmentContext();
environmentContext.putToProperties(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE);
try {
// Stats setup:
final Warehouse wh = new Warehouse(conf);
if (!getWork().getNoStatsAggregator() && !getWork().isNoScanAnalyzeCommand()) {
try {
scc = getContext();
statsAggregator = createStatsAggregator(scc, conf);
} catch (HiveException e) {
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
throw e;
}
console.printError(ErrorMsg.STATS_SKIPPING_BY_ERROR.getErrorCodedMsg(e.toString()));
}
}
List<Partition> partitions = getPartitionsList(db);
String tableFullName = table.getDbName() + "." + table.getTableName();
List<Partish> partishes = new ArrayList<>();
if (partitions == null) {
Partish p;
partishes.add(p = new Partish.PTable(table));
BasicStatsProcessor basicStatsProcessor = new BasicStatsProcessor(p, work, conf, followedColStats);
basicStatsProcessor.collectFileStatus(wh, conf);
Table res = (Table) basicStatsProcessor.process(statsAggregator);
if (res == null) {
return 0;
}
db.alterTable(tableFullName, res, environmentContext, true);
TransactionalStatsProcessor transactionalStatsProcessor = new TransactionalStatsProcessor(db, p);
transactionalStatsProcessor.process(statsAggregator);
if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) {
console.printInfo("Table " + tableFullName + " stats: [" + toString(p.getPartParameters()) + ']');
} else {
LOG.info("Table " + tableFullName + " stats: [" + toString(p.getPartParameters()) + ']');
}
// Update the passed-in table object to the latest version so that it can be
// used by ColStatsProcessor.
// This is only required for unpartitioned tables.
tbl.setTTable(res.getTTable());
} else {
// Partitioned table:
// Need to get the old stats of the partition
// and update the table stats based on the old and new stats.
List<Partition> updates = new ArrayList<>();
final ExecutorService pool = buildBasicStatsExecutor();
final List<Future<Void>> futures = Lists.newLinkedList();
List<BasicStatsProcessor> processors = Lists.newLinkedList();
List<TransactionalStatsProcessor> transactionalStatsProcessors = Lists.newLinkedList();
try {
for(final Partition partn : partitions) {
Partish p;
BasicStatsProcessor bsp = new BasicStatsProcessor(p = new Partish.PPart(table, partn), work, conf, followedColStats);
processors.add(bsp);
transactionalStatsProcessors.add(new TransactionalStatsProcessor(db, p));
futures.add(pool.submit(new Callable<Void>() {
@Override
public Void call() throws Exception {
bsp.collectFileStatus(wh, conf);
return null;
}
}));
}
pool.shutdown();
for (Future<Void> future : futures) {
future.get();
}
} catch (InterruptedException e) {
LOG.debug("Cancelling " + futures.size() + " file stats lookup tasks");
//cancel other futures
for (Future<Void> future : futures) {
future.cancel(true);
}
// Fail the query if the stats are supposed to be reliable
if (work.isStatsReliable()) {
ret = 1;
}
} finally {
if (pool != null) {
pool.shutdownNow();
}
LOG.debug("Finished getting file stats of all partitions!");
}
for (BasicStatsProcessor basicStatsProcessor : processors) {
Object res = basicStatsProcessor.process(statsAggregator);
if (res == null) {
LOG.info("Partition " + basicStatsProcessor.partish.getPartition().getSpec() + " stats: [0]");
continue;
}
updates.add((Partition) res);
if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) {
console.printInfo("Partition " + basicStatsProcessor.partish.getPartition().getSpec() + " stats: [" + toString(basicStatsProcessor.partish.getPartParameters()) + ']');
} else {
LOG.info("Partition " + basicStatsProcessor.partish.getPartition().getSpec() + " stats: [" + toString(basicStatsProcessor.partish.getPartParameters()) + ']');
}
}
if (!updates.isEmpty()) {
db.alterPartitions(tableFullName, updates, environmentContext, true);
}
for (TransactionalStatsProcessor transactionalStatsProcessor : transactionalStatsProcessors) {
transactionalStatsProcessor.process(statsAggregator);
}
if (work.isStatsReliable() && updates.size() != processors.size()) {
LOG.info("Stats should be reliadble...however seems like there were some issue.. => ret 1");
ret = 1;
}
}
} catch (Exception e) {
console.printInfo("[Warning] could not update stats.",
"Failed with exception " + e.getMessage() + "\n"
+ StringUtils.stringifyException(e));
// Fail the query if the stats are supposed to be reliable
if (work.isStatsReliable()) {
ret = 1;
}
} finally {
if (statsAggregator != null) {
statsAggregator.closeConnection(scc);
}
}
// The return value of 0 indicates success,
// anything else indicates failure
return ret;
}
private BasicStatsWork getWork() {
return work;
}
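/**
 * Creates a daemon thread pool, sized by ConfVars.HIVE_MOVE_FILES_THREAD_COUNT (minimum 1),
 * used to fetch file statuses for all partitions concurrently.
 */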
private ExecutorService buildBasicStatsExecutor() {
// Get the file status up-front for all partitions. Beneficial in case of blob storage systems.
int poolSize = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 1);
// In case thread count is set to 0, use single thread.
poolSize = Math.max(poolSize, 1);
final ExecutorService pool = Executors.newFixedThreadPool(poolSize, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("stats-updater-thread-%d").build());
LOG.debug("Getting file stats of all partitions. threadpool size:" + poolSize);
return pool;
}
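/**
 * Instantiates the configured stats implementation (HIVESTATSDBCLASS), initializes its
 * publisher, and returns a connected StatsAggregator; throws if any step fails.
 */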
private StatsAggregator createStatsAggregator(StatsCollectionContext scc, HiveConf conf) throws HiveException {
String statsImpl = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS);
StatsFactory factory = StatsFactory.newFactory(statsImpl, conf);
if (factory == null) {
throw new HiveException(ErrorMsg.STATSPUBLISHER_NOT_OBTAINED.getErrorCodedMsg());
}
// Initialize the stats publishing table for noscan, which has only the stats task;
// when the stats task follows MR tasks, those tasks initialize it in ExecDriver.java.
StatsPublisher statsPublisher = factory.getStatsPublisher();
if (!statsPublisher.init(scc)) { // creating stats table if not exists
throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
}
// manufacture a StatsAggregator
StatsAggregator statsAggregator = factory.getStatsAggregator();
if (!statsAggregator.connect(scc)) {
throw new HiveException(ErrorMsg.STATSAGGREGATOR_CONNECTION_ERROR.getErrorCodedMsg(statsImpl));
}
return statsAggregator;
}
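/**
 * Builds a StatsCollectionContext bound to the source task and the temporary stats directory;
 * fails if the work has no source task.
 */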
private StatsCollectionContext getContext() throws HiveException {
StatsCollectionContext scc = new StatsCollectionContext(conf);
Task sourceTask = getWork().getSourceTask();
if (sourceTask == null) {
throw new HiveException(ErrorMsg.STATSAGGREGATOR_SOURCETASK_NULL.getErrorCodedMsg());
}
scc.setTask(sourceTask);
scc.setStatsTmpDir(this.getWork().getStatsTmpDir());
return scc;
}
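/** Renders the supported basic stats from the parameter map as "stat=value" pairs for logging. */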
private String toString(Map<String, String> parameters) {
StringBuilder builder = new StringBuilder();
for (String statType : StatsSetupConst.SUPPORTED_STATS) {
String value = parameters.get(statType);
if (value != null) {
if (builder.length() > 0) {
builder.append(", ");
}
builder.append(statType).append('=').append(value);
}
}
return builder.toString();
}
/**
 * Get the list of partitions whose statistics need to be updated.
 * TODO: we should reuse the Partitions generated at compile time
 * since getting the list of partitions is quite expensive.
 *
 * @return a list of partitions whose statistics need to be updated.
 * @throws HiveException
 */
private List<Partition> getPartitionsList(Hive db) throws HiveException {
if (work.getLoadFileDesc() != null) {
return null; //we are in CTAS, so we know there are no partitions
}
if (work.getTableSpecs() != null) {
// ANALYZE command
TableSpec tblSpec = work.getTableSpecs();
table = tblSpec.tableHandle;
if (!table.isPartitioned()) {
return null;
}
// get all partitions that match with the partition spec
return tblSpec.partitions != null ? unmodifiableList(tblSpec.partitions) : emptyList();
} else if (work.getLoadTableDesc() != null) {
// INSERT OVERWRITE command
LoadTableDesc tbd = work.getLoadTableDesc();
table = db.getTable(tbd.getTable().getTableName());
if (!table.isPartitioned()) {
return null;
}
DynamicPartitionCtx dpCtx = tbd.getDPCtx();
if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions
// If no dynamic partitions are generated, dpPartSpecs may not be initialized
if (dpPartSpecs != null) {
// Reload partition metadata because another BasicStatsTask instance may have updated the stats.
List partNames = dpPartSpecs.stream().map(Partition::getName).collect(Collectors.toList());
return db.getPartitionsByNames(table, partNames);
}
} else { // static partition
return singletonList(db.getPartition(table, tbd.getPartitionSpec(), false));
}
}
return emptyList();
}
public Collection<Partition> getDpPartSpecs() {
return dpPartSpecs;
}
@Override
public void setDpPartSpecs(Collection<Partition> dpPartSpecs) {
this.dpPartSpecs = dpPartSpecs;
}
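/**
 * Builds the key prefix (lowercased, escaped "dbName.tableName", optionally followed by the
 * partition path) under which the stats publishers stored their counters, always terminated
 * with a path separator.
 */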
public static String getAggregationPrefix(Table table, Partition partition) throws MetaException {
String prefix = getAggregationPrefix0(table, partition);
String aggKey = prefix.endsWith(Path.SEPARATOR) ? prefix : prefix + Path.SEPARATOR;
return aggKey;
}
private static String getAggregationPrefix0(Table table, Partition partition) throws MetaException {
// prefix is of the form dbName.tblName
String prefix = FileUtils.escapePathName(table.getDbName()).toLowerCase() + "." +
FileUtils.escapePathName(table.getTableName()).toLowerCase();
// FIXME: this is a secret contract; reuse in getAggrKey() creates a closer relation to the StatsGatherer
// prefix = work.getAggKey();
if (partition != null) {
return Utilities.join(prefix, Warehouse.makePartPath(partition.getSpec()));
}
return prefix;
}
}
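// Illustrative usage sketch (not part of the original source; the surrounding object names are
// assumptions): a BasicStatsTask is constructed with the session configuration and a
// BasicStatsWork describing the load/analyze target, then driven through the IStatsProcessor
// contract:
//
//   BasicStatsTask statsTask = new BasicStatsTask(conf, basicStatsWork);
//   statsTask.setDpPartSpecs(dynamicPartitions); // optional, for dynamic-partition loads
//   int rc = statsTask.process(db, targetTable); // 0 on success, non-zero on failure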