org.apache.hadoop.hive.common.HiveStatsUtils Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.common;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
/**
 * HiveStatsUtils.
 * A collection of utilities used for hive statistics.
 * Used by classes in both metastore and ql package
 */
public class HiveStatsUtils {
  private static final Logger LOG = LoggerFactory.getLogger(HiveStatsUtils.class);

  /**
   * Requested errors at or below this value (in percent) all map to 1024 bit vectors;
   * the log message for this bucket states that 2.4% is the lowest achievable error.
   */
  private static final float LOWEST_ACHIEVABLE_NDV_ERROR = 2.4f;

  /**
   * Inclusive upper bounds (in percent) of the requested-error buckets, in ascending order.
   * They are deliberately {@code float} literals: the configured value is read as a
   * {@code float}, and comparing it against {@code double} literals would make an exactly
   * configured boundary such as 3.4 compare as greater than the bound (3.4f widens to
   * 3.4000000953...), pushing it into the next, less accurate bucket.
   */
  private static final float[] NDV_ERROR_UPPER_BOUNDS =
      {3.4f, 4.8f, 6.8f, 9.7f, 13.8f, 19.6f, 28.2f, 40.9f, 61.0f};

  /** Number of bit vectors chosen for the bucket at the same index of NDV_ERROR_UPPER_BOUNDS. */
  private static final int[] NDV_BIT_VECTORS = {1024, 512, 256, 128, 64, 32, 16, 8, 4};

  /** Number of bit vectors chosen when the requested error exceeds every bucket bound. */
  private static final int MIN_NDV_BIT_VECTORS = 2;

  /**
   * Get all file status from a root path and recursively go deep into certain levels.
   *
   * @param path
   *          the root path
   * @param level
   *          the depth of directory to explore; a negative value means "everything under
   *          the root path"
   * @param fs
   *          the file system
   * @return list of FileStatus; empty when nothing matches or, for a negative level, when
   *         the root path cannot be listed
   * @throws IOException
   *           if the glob lookup (non-negative level) fails
   */
  public static List<FileStatus> getFileStatusRecurse(Path path, int level, FileSystem fs)
      throws IOException {
    // if level is <0, then return all files/directories under the specified path
    if (level < 0) {
      List<FileStatus> result = new ArrayList<>();
      try {
        FileStatus fileStatus = fs.getFileStatus(path);
        FileUtils.listStatusRecursively(fs, fileStatus, result);
      } catch (IOException e) {
        // globStatus() API returns empty FileStatus[] when the specified path
        // does not exist. But getFileStatus() throws IOException. To mimic the
        // similar behavior we will return an empty list on exception. For external
        // tables, the path of the table will not exist during table creation.
        LOG.debug("Could not list {} recursively, returning no files", path, e);
        return Collections.emptyList();
      }
      return result;
    }

    // construct a path pattern (e.g., /*/*) to find all dynamically generated paths
    StringBuilder sb = new StringBuilder(path.toUri().getPath());
    for (int i = 0; i < level; i++) {
      sb.append(Path.SEPARATOR).append("*");
    }
    Path pathPattern = new Path(path, sb.toString());

    FileStatus[] matches = fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER);
    if (matches == null) {
      // Per the Hadoop FileSystem documentation globStatus() may return null (a pattern
      // without glob characters, e.g. level == 0, whose path does not exist). Treat that
      // as "no matches" instead of letting Lists.newArrayList fail on a null array.
      return new ArrayList<>();
    }
    return Lists.newArrayList(matches);
  }

  /**
   * Tells whether histogram statistics should be computed.
   *
   * @param conf
   *          the configuration to read from
   * @return the boolean value of {@code HIVE_STATS_KLL_ENABLE} as read from {@code conf},
   *         with that variable's own default passed as the fallback
   */
  public static boolean computeHistograms(Configuration conf) {
    return HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_KLL_ENABLE,
        HiveConf.ConfVars.HIVE_STATS_KLL_ENABLE.defaultBoolVal);
  }

  /**
   * Reads and validates the K parameter for the KLL sketch.
   *
   * @param conf
   *          the configuration to read from
   * @return the value of {@code HIVE_STATS_KLL_K_PARAM}
   * @throws IllegalArgumentException
   *           if the configured value is outside the range [8, 65535]
   */
  public static int getKParamForKllSketch(Configuration conf) {
    int k = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_KLL_K_PARAM);
    if (k < 8 || k > 65535) {
      throw new IllegalArgumentException(HiveConf.ConfVars.HIVE_STATS_KLL_K_PARAM.varname +
          " must be in the range [8, 65535], got " + k);
    }
    return k;
  }

  /**
   * Maps the requested NDV estimation error ({@code HIVE_STATS_NDV_ERROR}, in percent) to the
   * number of bit vectors to use: the smaller the requested error, the more bit vectors.
   * Bucket bounds are inclusive, so a requested error exactly equal to a bound gets the
   * bit-vector count of that bound's bucket.
   *
   * @param conf
   *          the configuration to read from
   * @return the number of bit vectors, a power of two between 2 and 1024
   * @throws Exception
   *           if the configured error is negative
   */
  public static int getNumBitVectorsForNDVEstimation(Configuration conf) throws Exception {
    float percentageError = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_NDV_ERROR);
    if (percentageError < 0.0) {
      throw new Exception("hive.stats.ndv.error can't be negative");
    }

    int numBitVectors;
    if (percentageError <= LOWEST_ACHIEVABLE_NDV_ERROR) {
      // Cannot do better than the largest supported number of bit vectors.
      numBitVectors = NDV_BIT_VECTORS[0];
      LOG.info("Lowest error achievable is 2.4% but error requested is {}%", percentageError);
    } else {
      // Falls through to the minimum when the error exceeds every bound (a NaN value also
      // ends up here because every comparison with it is false).
      numBitVectors = MIN_NDV_BIT_VECTORS;
      for (int i = 0; i < NDV_ERROR_UPPER_BOUNDS.length; i++) {
        if (percentageError <= NDV_ERROR_UPPER_BOUNDS[i]) {
          numBitVectors = NDV_BIT_VECTORS[i];
          break;
        }
      }
      LOG.info("Error requested is {}%", percentageError);
    }
    LOG.info("Choosing {} bit vectors..", numBitVectors);
    return numBitVectors;
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy