org.apache.hadoop.hive.common.HiveStatsUtils
Shaded version of Apache Hive for Trino
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.common;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.conf.HiveConf;
import io.trino.hive.$internal.org.slf4j.Logger;
import io.trino.hive.$internal.org.slf4j.LoggerFactory;
import io.trino.hive.$internal.com.google.common.collect.Lists;
/**
* HiveStatsUtils.
* A collection of utilities used for Hive statistics.
* Used by classes in both the metastore and ql packages.
*/
public class HiveStatsUtils {
private static final Logger LOG = LoggerFactory.getLogger(HiveStatsUtils.class);
/**
* Get all file statuses under a root path, descending the given number of directory levels.
*
* @param path
* the root path
* @param level
* the depth of directory to explore
* @param fs
* the file system
* @return list of FileStatus objects
* @throws IOException
*/
public static List<FileStatus> getFileStatusRecurse(Path path, int level, FileSystem fs)
throws IOException {
return getFileStatusRecurse(path, level, fs, FileUtils.HIDDEN_FILES_PATH_FILTER, false);
}
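/**
* Same as {@link #getFileStatusRecurse(Path, int, FileSystem)}, but with a
* caller-supplied {@link PathFilter} instead of the default hidden-files filter.
*/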
public static List<FileStatus> getFileStatusRecurse(
Path path, int level, FileSystem fs, PathFilter filter) throws IOException {
return getFileStatusRecurse(path, level, fs, filter, false);
}
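/**
* Same as above, with {@code allLevelsBelow} controlling whether the listing stops at
* {@code level} or also walks every directory below the matched depth.
*/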
public static List<FileStatus> getFileStatusRecurse(
Path path, int level, FileSystem fs, PathFilter filter, boolean allLevelsBelow)
throws IOException {
// if level is < 0, then return all files/directories under the specified path
if (level < 0) {
List<FileStatus> result = new ArrayList<>();
try {
FileStatus fileStatus = fs.getFileStatus(path);
FileUtils.listStatusRecursively(fs, fileStatus, filter, result);
} catch (IOException e) {
// The globStatus() API returns an empty FileStatus[] when the specified path
// does not exist, but getFileStatus() throws an IOException. To mimic that
// behavior we return an empty list on exception. For external tables, the
// path of the table will not exist yet during table creation.
return new ArrayList<>(0);
}
return result;
}
// construct a path pattern (e.g., /*/*) to find all dynamically generated paths
StringBuilder sb = new StringBuilder(path.toUri().getPath());
for (int i = 0; i < level; i++) {
sb.append(Path.SEPARATOR).append("*");
}
Path pathPattern = new Path(path, sb.toString());
if (!allLevelsBelow) {
return Lists.newArrayList(fs.globStatus(pathPattern, filter));
}
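// allLevelsBelow: seed a breadth-first traversal with everything that matches the
// pattern at the requested depth, then keep descending into child directories,
// applying the filter to each file and directory found along the way.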
LinkedList<FileStatus> queue = new LinkedList<>();
List<FileStatus> results = new ArrayList<>();
for (FileStatus status : fs.globStatus(pathPattern)) {
if (filter.accept(status.getPath())) {
results.add(status);
}
if (status.isDirectory()) {
queue.add(status);
}
}
while (!queue.isEmpty()) {
FileStatus status = queue.poll();
for (FileStatus child : fs.listStatus(status.getPath())) {
if (filter.accept(child.getPath())) {
results.add(child);
}
if (child.isDirectory()) {
queue.add(child);
}
}
}
return results;
}
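/**
* Map the requested NDV estimation error (hive.stats.ndv.error, as a percentage) to a
* bit-vector count for the distinct-value estimator. The ladder below reflects the usual
* sketch trade-off: achievable error shrinks roughly as 1/sqrt(numBitVectors), so each
* tighter error bound costs correspondingly more bit vectors.
*/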
public static int getNumBitVectorsForNDVEstimation(Configuration conf) throws Exception {
int numBitVectors;
float percentageError = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_NDV_ERROR);
if (percentageError < 0.0) {
throw new Exception("hive.stats.ndv.error can't be negative");
} else if (percentageError <= 2.4) {
numBitVectors = 1024;
LOG.info("Lowest error achievable is 2.4% but error requested is " + percentageError + "%");
LOG.info("Choosing 1024 bit vectors..");
} else if (percentageError <= 3.4) {
numBitVectors = 1024;
LOG.info("Error requested is " + percentageError + "%");
LOG.info("Choosing 1024 bit vectors..");
} else if (percentageError <= 4.8) {
numBitVectors = 512;
LOG.info("Error requested is " + percentageError + "%");
LOG.info("Choosing 512 bit vectors..");
} else if (percentageError <= 6.8) {
numBitVectors = 256;
LOG.info("Error requested is " + percentageError + "%");
LOG.info("Choosing 256 bit vectors..");
} else if (percentageError <= 9.7) {
numBitVectors = 128;
LOG.info("Error requested is " + percentageError + "%");
LOG.info("Choosing 128 bit vectors..");
} else if (percentageError <= 13.8) {
numBitVectors = 64;
LOG.info("Error requested is " + percentageError + "%");
LOG.info("Choosing 64 bit vectors..");
} else if (percentageError <= 19.6) {
numBitVectors = 32;
LOG.info("Error requested is " + percentageError + "%");
LOG.info("Choosing 32 bit vectors..");
} else if (percentageError <= 28.2) {
numBitVectors = 16;
LOG.info("Error requested is " + percentageError + "%");
LOG.info("Choosing 16 bit vectors..");
} else if (percentageError <= 40.9) {
numBitVectors = 8;
LOG.info("Error requested is " + percentageError + "%");
LOG.info("Choosing 8 bit vectors..");
} else if (percentageError <= 61.0) {
numBitVectors = 4;
LOG.info("Error requested is " + percentageError + "%");
LOG.info("Choosing 4 bit vectors..");
} else {
numBitVectors = 2;
LOG.info("Error requested is " + percentageError + "%");
LOG.info("Choosing 2 bit vectors..");
}
return numBitVectors;
}
}
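For readers wiring this class into their own code, a minimal caller sketch follows. The table root path, partition depth, and configuration are illustrative assumptions rather than values taken from this class; only the static helpers above and standard Hadoop FileSystem calls are used.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.HiveStatsUtils;

public class HiveStatsUtilsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical table root with two partition columns, e.g. .../ds=2024-01-01/country=us
        Path tableRoot = new Path("/warehouse/db/sales");
        FileSystem fs = tableRoot.getFileSystem(conf);

        // level = 2 expands to the glob /warehouse/db/sales/*/*, i.e. the leaf partition directories.
        List<FileStatus> partitions = HiveStatsUtils.getFileStatusRecurse(tableRoot, 2, fs);
        for (FileStatus status : partitions) {
            System.out.println(status.getPath());
        }

        // A negative level lists every file and directory under the root (empty if the root is missing).
        List<FileStatus> everything = HiveStatsUtils.getFileStatusRecurse(tableRoot, -1, fs);
        System.out.println("entries under root: " + everything.size());

        // Bit-vector count for the NDV estimator, driven by hive.stats.ndv.error.
        int numBitVectors = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf);
        System.out.println("bit vectors: " + numBitVectors);
    }
}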