/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.stats;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.HiveStatsUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.StatsTask;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec;
import org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.apache.hive.common.util.ReflectionUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimaps;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* BasicStatsNoJobTask is used in cases where stats collection is the only task for the given query (no
* parent MR or Tez job). It is used for ANALYZE with noscan on file formats that implement the
* StatsProvidingRecordReader interface: the ORC format (which implements
* StatsProvidingRecordReader) stores column statistics for all columns in the file footer. It is much
* faster to compute the table/partition statistics by reading the footer than by scanning all the
* rows. This task can be used to compute basic stats such as numFiles, numRows, fileSize and
* rawDataSize from the ORC footer.
* However, this cannot be used for full ACID tables, since some files may contain updates
* and deletes to existing rows, so summing up the per-file row counts is invalid.
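* For example, {@code ANALYZE TABLE src COMPUTE STATISTICS NOSCAN} on an ORC table can be handled entirely
* by this task, without launching a scan job ({@code src} is just an illustrative table name).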
**/
public class BasicStatsNoJobTask implements IStatsProcessor {
private static final Logger LOG = LoggerFactory.getLogger(BasicStatsNoJobTask.class);
private HiveConf conf;
private BasicStatsNoJobWork work;
private LogHelper console;
public BasicStatsNoJobTask(HiveConf conf, BasicStatsNoJobWork work) {
this.conf = conf;
this.work = work;
console = new LogHelper(LOG);
}
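// Basic stats can be gathered without a job when the file format exposes stats in its footers
// (ORC/Parquet, excluding full ACID tables) or when a non-native storage handler can provide them.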
public static boolean canUseBasicStats(
Table table, Class<? extends InputFormat> inputFormat) {
return canUseFooterScan(table, inputFormat) || useBasicStatsFromStorageHandler(table);
}
public static boolean canUseFooterScan(Table table, Class<? extends InputFormat> inputFormat) {
return (OrcInputFormat.class.isAssignableFrom(inputFormat) && !AcidUtils.isFullAcidTable(table))
|| MapredParquetInputFormat.class.isAssignableFrom(inputFormat);
}
private static boolean useBasicStatsFromStorageHandler(Table table) {
return table.isNonNative() && table.getStorageHandler().canProvideBasicStatistics();
}
@Override
public void initialize(CompilationOpContext opContext) {
}
@Override
public int process(Hive db, Table tbl) throws Exception {
LOG.info("Executing stats (no job) task");
ExecutorService threadPool = StatsTask.newThreadPool(conf);
return aggregateStats(threadPool, db);
}
public StageType getType() {
return StageType.STATS;
}
public String getName() {
return "STATS-NO-JOB";
}
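/**
* Collects basic stats for a single table or partition (a Partish). On success, {@code result} holds the
* updated Table or Partition object that is later persisted to the metastore.
*/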
abstract static class StatCollector implements Runnable {
protected Partish partish;
protected Object result;
protected LogHelper console;
public static Function<StatCollector, String> SIMPLE_NAME_FUNCTION =
sc -> String.format("%s#%s", sc.partish.getTable().getCompleteName(), sc.partish.getPartishType());
public static Function<StatCollector, Partition> EXTRACT_RESULT_FUNCTION = sc -> (Partition) sc.result;
protected void init(HiveConf conf, LogHelper console) throws IOException {
this.console = console;
}
protected final boolean isValid() {
return result != null;
}
protected final String toString(Map<String, String> parameters) {
return StatsSetupConst.SUPPORTED_STATS.stream().map(st -> st + "=" + parameters.get(st))
.collect(Collectors.joining(", "));
}
}
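/**
* StatCollector for non-native tables: obtains the basic statistics directly from the table's storage
* handler instead of reading file footers.
*/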
static class HiveStorageHandlerStatCollector extends StatCollector {
public HiveStorageHandlerStatCollector(Partish partish) {
this.partish = partish;
}
@Override
public void run() {
try {
Table table = partish.getTable();
Map<String, String> parameters;
Map<String, String> basicStatistics = table.getStorageHandler().getBasicStatistics(partish);
if (partish.getPartition() != null) {
parameters = partish.getPartParameters();
result = new Partition(table, partish.getPartition().getTPartition());
} else {
parameters = table.getParameters();
result = new Table(table.getTTable());
}
parameters.putAll(basicStatistics);
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
String msg = partish.getSimpleName() + " stats: [" + toString(parameters) + ']';
LOG.debug(msg);
console.printInfo(msg);
} catch (Exception e) {
console.printInfo("[Warning] could not update stats for " + partish.getSimpleName() + ".",
"Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
}
}
}
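/**
* StatCollector that computes basic stats by reading file footers (via StatsProvidingRecordReader) under
* the table or partition location, optionally fanning out across a thread pool when there are many files.
*/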
static class FooterStatCollector extends StatCollector {
private JobConf jc;
private Path dir;
private FileSystem fs;
private LogHelper console;
public FooterStatCollector(JobConf jc, Partish partish) {
this.jc = jc;
this.partish = partish;
}
@Override
public void init(HiveConf conf, LogHelper console) throws IOException {
this.console = console;
dir = new Path(partish.getPartSd().getLocation());
fs = dir.getFileSystem(conf);
}
@Override
public void run() {
Map<String, String> parameters = partish.getPartParameters();
try {
long numRows = 0;
long rawDataSize = 0;
long fileSize = 0;
long numFiles = 0;
long numErasureCodedFiles = 0;
// Note: this code would be invalid for transactional tables of any kind.
Utilities.FILE_OP_LOGGER.debug("Aggregating stats for {}", dir);
List<FileStatus> fileList = null;
if (partish.getTable() != null
&& AcidUtils.isTransactionalTable(partish.getTable())) {
fileList = AcidUtils.getAcidFilesForStats(partish.getTable(), dir, jc, fs);
} else {
fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
}
ThreadPoolExecutor tpE = null;
List<Future<FileStats>> futures = null;
int numThreadsFactor = HiveConf.getIntVar(jc, HiveConf.ConfVars.BASICSTATSTASKSMAXTHREADSFACTOR);
if (fileList.size() > 1 && numThreadsFactor > 0) {
int numThreads = Math.min(fileList.size(), numThreadsFactor * Runtime.getRuntime().availableProcessors());
ThreadFactory threadFactory = new ThreadFactoryBuilder().setNameFormat("Basic-Stats-Thread-%d").build();
tpE = new ThreadPoolExecutor(numThreads, numThreads, 0, TimeUnit.SECONDS, new LinkedBlockingQueue<>(),
threadFactory);
tpE.allowsCoreThreadTimeOut();
futures = new ArrayList<>();
LOG.info("Processing Stats for {} file using {} threads", fileList.size(), numThreads);
}
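// Compute per-file stats: submit to the thread pool when one was created above, otherwise run inline.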
for (FileStatus file : fileList) {
Utilities.FILE_OP_LOGGER.debug("Computing stats for {}", file);
if (!file.isDirectory()) {
InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(partish.getInputFormatClass(), jc);
InputSplit dummySplit = new FileSplit(file.getPath(), 0, -1, new String[] { partish.getLocation() });
if (file.getLen() == 0) {
numFiles += 1;
} else {
FileStatProcessor fsp = new FileStatProcessor(file, inputFormat, dummySplit, jc);
if (tpE != null) {
futures.add(tpE.submit(fsp));
} else {
// No parallel processing, just call the method normally & update the stats.
FileStats fileStat = fsp.call();
rawDataSize += fileStat.getRawDataSize();
numRows += fileStat.getNumRows();
fileSize += fileStat.getFileSize();
numFiles += 1;
numErasureCodedFiles += fileStat.getNumErasureCodedFiles();
}
}
}
}
if (tpE != null) {
try {
for (Future<FileStats> future : futures) {
FileStats fileStat = future.get();
rawDataSize += fileStat.getRawDataSize();
numRows += fileStat.getNumRows();
fileSize += fileStat.getFileSize();
numFiles += 1;
numErasureCodedFiles += fileStat.getNumErasureCodedFiles();
}
} catch (Exception e) {
LOG.error("Encountered exception while collecting stats for file lists as {}", fileList, e);
// Cancel all the futures in the list & throw the caught exception post that.
futures.forEach(x -> x.cancel(true));
throw e;
} finally {
tpE.shutdown();
}
}
StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows));
parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize));
parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize));
parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles));
parameters.put(StatsSetupConst.NUM_ERASURE_CODED_FILES, String.valueOf(numErasureCodedFiles));
if (partish.getPartition() != null) {
result = new Partition(partish.getTable(), partish.getPartition().getTPartition());
} else {
result = new Table(partish.getTable().getTTable());
}
String msg = partish.getSimpleName() + " stats: [" + toString(parameters) + ']';
LOG.debug(msg);
console.printInfo(msg);
} catch (Exception e) {
console.printInfo("[Warning] could not update stats for " + partish.getSimpleName() + ".", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
}
}
}
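// Returns the partitions supplied in the work, or all partitions of a partitioned table when none were
// supplied; null means the whole (unpartitioned) table is processed.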
private Collection<Partition> getPartitions(Table table) {
Collection<Partition> partitions = null;
if (work.getPartitions() == null || work.getPartitions().isEmpty()) {
if (table.isPartitioned()) {
partitions = table.getTableSpec().partitions;
}
} else {
partitions = work.getPartitions();
}
return partitions;
}
private List<Partish> getPartishes(Table table) {
Collection<Partition> partitions = getPartitions(table);
List<Partish> partishes = Lists.newLinkedList();
if (partitions == null) {
partishes.add(Partish.buildFor(table));
} else {
for (Partition part : partitions) {
partishes.add(Partish.buildFor(table, part));
}
}
return partishes;
}
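// Entry point for stats collection: builds one StatCollector per table/partition, runs them on the
// shared thread pool, and then persists the collected results to the metastore via updatePartitions().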
private int aggregateStats(ExecutorService threadPool, Hive db) {
int ret = 0;
try {
JobConf jc = new JobConf(conf);
TableSpec tableSpecs = work.getTableSpecs();
if (tableSpecs == null) {
throw new RuntimeException("this is unexpected...needs some investigation");
}
Table table = tableSpecs.tableHandle;
List<Partish> partishes = getPartishes(table);
List<StatCollector> scs = new ArrayList<>();
for (Partish partish : partishes) {
if (useBasicStatsFromStorageHandler(table)) {
scs.add(new HiveStorageHandlerStatCollector(partish));
} else {
scs.add(new FooterStatCollector(jc, partish));
}
}
for (StatCollector sc : scs) {
sc.init(conf, console);
threadPool.execute(sc);
}
LOG.debug("Stats collection waiting for threadpool to shutdown..");
shutdownAndAwaitTermination(threadPool);
LOG.debug("Stats collection threadpool shutdown successful.");
ret = updatePartitions(db, scs, table);
} catch (Exception e) {
console.printError("Failed to collect footer statistics.", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
// Fail the query if the stats are supposed to be reliable
if (work.isStatsReliable()) {
ret = -1;
}
}
// The return value of 0 indicates success,
// anything else indicates failure
return ret;
}
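// Persists the collected stats to the metastore. DO_NOT_UPDATE_STATS tells the metastore to keep the
// basic stats exactly as supplied here instead of recomputing them during the alter call.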
private int updatePartitions(Hive db, List<StatCollector> scs, Table table) throws InvalidOperationException, HiveException {
String tableFullName = table.getFullyQualifiedName();
if (scs.isEmpty()) {
return 0;
}
if (work.isStatsReliable()) {
for (StatCollector statsCollection : scs) {
if (!statsCollection.isValid()) {
LOG.debug("Stats requested to be reliable. Empty stats found: {}", statsCollection.partish.getSimpleName());
return -1;
}
}
}
List<StatCollector> validCollectors = Lists.newArrayList();
for (StatCollector statsCollection : scs) {
if (statsCollection.isValid()) {
validCollectors.add(statsCollection);
}
}
EnvironmentContext environmentContext = new EnvironmentContext();
environmentContext.putToProperties(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE);
ImmutableListMultimap<String, StatCollector> collectorsByTable = Multimaps.index(validCollectors, FooterStatCollector.SIMPLE_NAME_FUNCTION);
LOG.debug("Collector keys: {}", collectorsByTable.keySet());
if (collectorsByTable.keySet().size() < 1) {
LOG.warn("Collectors are empty! ; {}", tableFullName);
}
// for now this should be true...
assert (collectorsByTable.keySet().size() <= 1);
LOG.debug("Updating stats for: {}", tableFullName);
for (String partName : collectorsByTable.keySet()) {
ImmutableList<StatCollector> values = collectorsByTable.get(partName);
if (values == null) {
throw new RuntimeException("very intresting");
}
if (values.get(0).result instanceof Table) {
db.alterTable(tableFullName, (Table) values.get(0).result, environmentContext, true);
LOG.debug("Updated stats for {}.", tableFullName);
} else {
if (values.get(0).result instanceof Partition) {
List<Partition> results = Lists.transform(values, StatCollector.EXTRACT_RESULT_FUNCTION);
db.alterPartitions(tableFullName, results, environmentContext, true);
LOG.debug("Bulk updated {} partitions of {}.", results.size(), tableFullName);
} else {
throw new RuntimeException("inconsistent");
}
}
}
LOG.debug("Updated stats for: {}", tableFullName);
return 0;
}
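// Orderly shutdown of the stats thread pool: stop accepting new tasks, wait for running collectors to
// finish, then force-cancel anything still outstanding.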
private void shutdownAndAwaitTermination(ExecutorService threadPool) {
// Disable new tasks from being submitted
threadPool.shutdown();
try {
// Wait a while for existing tasks to terminate
// XXX this will wait forever... :)
while (!threadPool.awaitTermination(10, TimeUnit.SECONDS)) {
LOG.debug("Waiting for all stats tasks to finish...");
}
// Cancel currently executing tasks
threadPool.shutdownNow();
// Wait a while for tasks to respond to being cancelled
if (!threadPool.awaitTermination(100, TimeUnit.SECONDS)) {
LOG.debug("Stats collection thread pool did not terminate");
}
} catch (InterruptedException ie) {
// Cancel again if current thread also interrupted
threadPool.shutdownNow();
// Preserve interrupt status
Thread.currentThread().interrupt();
}
}
@Override
public void setDpPartSpecs(Collection<Partition> dpPartSpecs) {
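// Dynamic partition specs are not needed by this stats processor, so this is intentionally a no-op.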
}
/**
* Utility class to process file level stats in parallel.
*/
private static class FileStatProcessor implements Callable<FileStats> {
private final InputSplit dummySplit;
private final InputFormat<?, ?> inputFormat;
private final JobConf jc;
private final FileStatus file;
FileStatProcessor(FileStatus file, InputFormat<?, ?> inputFormat, InputSplit dummySplit, JobConf jc) {
this.file = file;
this.dummySplit = dummySplit;
this.inputFormat = inputFormat;
this.jc = jc;
}
@Override
public FileStats call() throws Exception {
try (org.apache.hadoop.mapred.RecordReader<?, ?> recordReader = inputFormat
.getRecordReader(dummySplit, jc, Reporter.NULL)) {
if (recordReader instanceof StatsProvidingRecordReader) {
StatsProvidingRecordReader statsRR;
statsRR = (StatsProvidingRecordReader) recordReader;
final FileStats fileStats =
new FileStats(statsRR.getStats().getRawDataSize(), statsRR.getStats().getRowCount(), file.getLen(),
file.isErasureCoded());
return fileStats;
} else {
throw new HiveException(String.format("Unexpected file found during reading footers for: %s ", file));
}
}
}
}
/**
* Utility class for holding the file level statistics.
*/
private static class FileStats {
private long numRows = 0;
private long rawDataSize = 0;
private long fileSize = 0;
private long numErasureCodedFiles = 0;
public FileStats(long rawDataSize, long numRows, long fileSize, boolean isErasureCoded) {
this.rawDataSize = rawDataSize;
this.numRows = numRows;
this.fileSize = fileSize;
this.numErasureCodedFiles = isErasureCoded ? 1 : 0;
}
public long getNumRows() {
return numRows;
}
public long getRawDataSize() {
return rawDataSize;
}
public long getFileSize() {
return fileSize;
}
public long getNumErasureCodedFiles() {
return numErasureCodedFiles;
}
}
}