org.apache.hadoop.hive.ql.parse.ColumnStatsSemanticAnalyzer Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.parse;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
/**
* ColumnStatsSemanticAnalyzer.
* Handles semantic analysis and query rewriting for gathering column statistics, at either the
* partition level or the table level. Note that table statistics (as opposed to column
* statistics) are implemented in SemanticAnalyzer.
*
*/
public class ColumnStatsSemanticAnalyzer extends SemanticAnalyzer {
// Logger used by analyze() to report which tree is being analyzed.
private static final Log LOG = LogFactory
.getLog(ColumnStatsSemanticAnalyzer.class);
// AST handed to the two-argument constructor.
private ASTNode originalTree;
// AST parsed from rewrittenQuery; the constructor aliases it to the original tree when no
// rewrite takes place.
private ASTNode rewrittenTree;
// Text of the generated "select compute_stats(...)" query; null when nothing was rewritten.
private String rewrittenQuery;
// Context created by genRewrittenTree() for parsing and analyzing the rewritten query.
private Context ctx;
// Set to true by genRewrittenQuery(); analyze() uses it to choose which tree to analyze.
private boolean isRewritten;
// True when the statement carries no partition spec (see isPartitionLevelStats()).
private boolean isTableLevel;
// Unescaped name of the table named in the statement.
private String tableName;
// Column names listed in the statement, in the order they were written.
private List colNames;
// Column types matching colNames; filled by getTableColumnType()/getPartitionColumnType().
private List colType;
// Partition name built by constructPartitionName(); only assigned for partition-level stats.
private String partName;
private class PartitionList {
private final String[] partKeys;
private String[] partKeyTypes;
private final String[] partValues;
private int numPartitions;
private int numPartitionValues;
PartitionList(int numPartitions) {
this.numPartitions = numPartitions;
partKeys = new String[numPartitions];
partValues = new String[numPartitions];
}
public int getNumPartitions() {
return numPartitions;
}
public void setNumPartitions(int numPartitions) {
this.numPartitions = numPartitions;
}
public String[] getPartValues() {
return partValues;
}
public String[] getPartKeys() {
return partKeys;
}
public void addPartValue(String partValue, int index) {
partValues[index] = new String(partValue);
}
public void addPartKey(String partKey, int index) {
partKeys[index] = new String(partKey);
}
public int getNumPartValues() {
return numPartitionValues;
}
public void setNumPartValues(int numPartValues) {
numPartitionValues = numPartValues;
}
public String[] getPartKeyTypes() {
return partKeyTypes;
}
public void setPartKeyTypes(String[] partKeyTypes) {
this.partKeyTypes = partKeyTypes;
}
public void setPartKeyType(String partKeyType, int index) {
partKeyTypes[index] = partKeyType;
}
}
/**
 * Creates an analyzer without inspecting any statement: only the superclass constructor runs,
 * so none of this class's rewrite-related fields are assigned.
 *
 * @param conf configuration handed to the superclass constructor
 * @throws SemanticException declared on the signature; presumably propagated from the
 *         superclass constructor — confirm against SemanticAnalyzer
 */
public ColumnStatsSemanticAnalyzer(HiveConf conf) throws SemanticException {
super(conf);
}
/**
 * Decides whether the statement has the shape that gets rewritten: more than one child, a
 * first child of type TOK_TAB whose own first child is TOK_TABNAME, and a second child of
 * type TOK_TABCOLNAME.
 *
 * @param tree root of the statement AST
 * @return true only when every one of the checks above holds
 */
private boolean shouldRewrite(ASTNode tree) {
  if (tree.getChildCount() <= 1) {
    return false;
  }
  ASTNode tabNode = (ASTNode) tree.getChild(0);
  if (tabNode.getToken().getType() != HiveParser.TOK_TAB) {
    return false;
  }
  ASTNode tabNameNode = (ASTNode) tabNode.getChild(0);
  if (tabNameNode.getToken().getType() != HiveParser.TOK_TABNAME) {
    return false;
  }
  ASTNode colListNode = (ASTNode) tree.getChild(1);
  return colListNode.getToken().getType() == HiveParser.TOK_TABCOLNAME;
}
/**
 * Reports whether the statement names a partition: the first child of {@code tree} must have
 * a second child of type TOK_PARTSPEC.
 *
 * @param tree root of the statement AST
 * @return true when a partition spec is present, false otherwise
 */
private boolean isPartitionLevelStats(ASTNode tree) {
  ASTNode tabNode = (ASTNode) tree.getChild(0);
  if (tabNode.getChildCount() <= 1) {
    return false;
  }
  ASTNode candidate = (ASTNode) tabNode.getChild(1);
  return candidate.getToken().getType() == HiveParser.TOK_PARTSPEC;
}
/**
 * Extracts the table name from the first grandchild of {@code tree}.
 *
 * @param tree root of the statement AST
 * @return the result of getUnescapedName on that node
 */
private String getTableName(ASTNode tree) {
  ASTNode tableNameNode = (ASTNode) tree.getChild(0).getChild(0);
  return getUnescapedName(tableNameNode);
}
/**
 * Collects the partition keys, and the values where given, from the partition spec node
 * (second child of the first child of {@code tree}).
 *
 * Each child of the spec contributes one key; a child with a second node also contributes a
 * value, stored with every single quote removed. Keys without a value leave their value slot
 * null.
 *
 * @param tree root of the statement AST; expected to carry a partition spec
 * @return a PartitionList sized to the number of keys, with the count of supplied values set
 */
private PartitionList getPartKeyValuePairsFromAST(ASTNode tree) {
  ASTNode partSpec = (ASTNode) tree.getChild(0).getChild(1);
  int numParts = partSpec.getChildCount();
  PartitionList partList = new PartitionList(numParts);
  int numPartValues = 0;

  for (int i = 0; i < numParts; i++) {
    ASTNode partEntry = (ASTNode) partSpec.getChild(i);
    partList.addPartKey(getUnescapedName((ASTNode) partEntry.getChild(0)), i);

    if (partEntry.getChildCount() > 1) {
      // Literal replacement: no regular expression is needed to strip a quote character.
      String partValue = getUnescapedName((ASTNode) partEntry.getChild(1)).replace("'", "");
      partList.addPartValue(partValue, i);
      numPartValues++;
    }
  }

  partList.setNumPartValues(numPartValues);
  return partList;
}
/**
 * Reads the column names listed in the statement (the children of the second child of
 * {@code tree}), preserving their order.
 *
 * @param tree root of the statement AST
 * @return the unescaped column names, one per child of the column-list node
 */
private List<String> getColumnName(ASTNode tree) {
  ASTNode colList = (ASTNode) tree.getChild(1);
  int numCols = colList.getChildCount();
  // Append-only and read by index later, so a presized ArrayList fits better than LinkedList.
  List<String> colName = new ArrayList<String>(numCols);
  for (int i = 0; i < numCols; i++) {
    colName.add(getUnescapedName((ASTNode) colList.getChild(i)));
  }
  return colName;
}
/**
 * Counts the columns listed in the statement.
 *
 * @param tree root of the statement AST
 * @return number of children under the second child of {@code tree}
 */
private int getNumColumns(ASTNode tree) {
  final int columnCount = tree.getChild(1).getChildCount();
  return columnCount;
}
private void validatePartitionKeys(String tableName, PartitionList partList) throws
SemanticException {
Table tbl;
try {
tbl = db.getTable(tableName);
} catch (HiveException e) {
throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
}
List partKeys = tbl.getPartitionKeys();
String[] inputPartKeys = partList.getPartKeys();
if (inputPartKeys.length != partKeys.size()) {
throw new SemanticException(ErrorMsg.COLUMNSTATSCOLLECTOR_INCORRECT_NUM_PART_KEY.getMsg());
}
Map partKeysMap = new LinkedHashMap();
for (int i=0; i partKeys = tbl.getPartitionKeys();
String[] inputPartKeys = partList.getPartKeys();
String[] inputPartKeyTypes = new String[inputPartKeys.length];
for (int i=0; i < inputPartKeys.length; i++) {
for (FieldSchema partKey:partKeys) {
if (inputPartKeys[i].equalsIgnoreCase(partKey.getName())) {
inputPartKeyTypes[i] = new String(partKey.getType());
break;
}
}
}
return inputPartKeyTypes;
}
private String constructPartitionName(String tableName, PartitionList partList)
throws SemanticException {
Table tbl;
Partition part;
String[] partKeys = partList.getPartKeys();
String[] partValues = partList.getPartValues();
try {
tbl = db.getTable(tableName);
} catch (HiveException e) {
throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
}
Map partSpec = new LinkedHashMap();
for (int i=0; i getTableColumnType(String tableName, List colNames, int numCols)
throws SemanticException{
List colTypes = new LinkedList();
String colName;
Table tbl;
try {
tbl = db.getTable(tableName);
} catch (HiveException e) {
throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
}
List cols = tbl.getCols();
for (int i=0; i getPartitionColumnType(String tableName, String partName,
List colNames, int numCols) throws SemanticException {
List colTypes = new LinkedList();
String colName;
Table tbl;
try {
tbl = db.getTable(tableName);
} catch (HiveException e) {
throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName));
}
List partNames = new ArrayList();
partNames.add(partName);
List partitionList;
try {
partitionList = db.getPartitionsByNames(tbl, partNames);
} catch (HiveException e) {
throw new SemanticException(ErrorMsg.INVALID_PARTITION.getMsg(partName));
}
Partition part = partitionList.get(0);
List cols = part.getCols();
for (int i=0; i colNames, int numBitVectors, PartitionList partList,
boolean isPartitionStats) throws SemanticException{
StringBuilder rewrittenQueryBuilder = new StringBuilder("select ");
String rewrittenQuery;
for (int i = 0; i < colNames.size(); i++) {
if (i > 0) {
rewrittenQueryBuilder.append(" , ");
}
rewrittenQueryBuilder.append("compute_stats(");
rewrittenQueryBuilder.append(colNames.get(i));
rewrittenQueryBuilder.append(" , ");
rewrittenQueryBuilder.append(numBitVectors);
rewrittenQueryBuilder.append(" )");
}
rewrittenQueryBuilder.append(" from ");
rewrittenQueryBuilder.append(tableName);
isRewritten = true;
// If partition level statistics is requested, add predicate and group by as needed to rewritten
// query
if (isPartitionStats) {
rewrittenQueryBuilder.append(genPartitionClause(partList));
}
rewrittenQuery = rewrittenQueryBuilder.toString();
rewrittenQuery = new VariableSubstitution().substitute(conf, rewrittenQuery);
return rewrittenQuery;
}
/**
 * Parses the rewritten query text into an AST.
 *
 * As a side effect this replaces the {@code ctx} field with a fresh Context built from
 * {@code conf} and records the query text on it.
 *
 * @param rewrittenQuery text of the generated query
 * @return the result of ParseUtils.findRootNonNullToken on the parsed tree
 * @throws SemanticException if the Context cannot be created or the query does not parse; the
 *         underlying exception is attached as the cause
 */
private ASTNode genRewrittenTree(String rewrittenQuery) throws SemanticException {
  try {
    ctx = new Context(conf);
  } catch (IOException e) {
    // Keep the cause so the real I/O failure is visible in the stack trace.
    throw new SemanticException(ErrorMsg.COLUMNSTATSCOLLECTOR_IO_ERROR.getMsg(), e);
  }
  ctx.setCmd(rewrittenQuery);

  // Parse the rewritten query string
  ASTNode parsedTree;
  try {
    parsedTree = new ParseDriver().parse(rewrittenQuery, ctx);
  } catch (ParseException e) {
    throw new SemanticException(ErrorMsg.COLUMNSTATSCOLLECTOR_PARSE_ERROR.getMsg(), e);
  }
  return ParseUtils.findRootNonNullToken(parsedTree);
}
/**
 * Creates an analyzer for the given statement and, when the statement lists columns (see
 * shouldRewrite), prepares the rewritten "select compute_stats(...)" query and its AST.
 *
 * When the statement is not rewritten, the original tree is kept for analyze() and the
 * rewrite-related fields are reset.
 *
 * @param conf configuration handed to the superclass constructor
 * @param tree root of the statement AST
 * @throws SemanticException if any validation helper rejects the table, columns or partition
 *         clause, or if the rewritten query cannot be parsed
 */
public ColumnStatsSemanticAnalyzer(HiveConf conf, ASTNode tree) throws SemanticException {
super(conf);
// check if it is no scan. grammar prevents noscan and columns from coexisting
super.processNoScanCommand(tree);
// check if it is partial scan. grammar prevents partialscan and columns from coexisting
super.processPartialScanCommand(tree);
/* Rewrite only analyze table <> column <> compute statistics; Don't rewrite analyze table
* command - table stats are collected by the table scan operator and are not rewritten to
* an aggregation.
*/
if (shouldRewrite(tree)) {
tableName = new String(getTableName(tree));
colNames = getColumnName(tree);
int numCols = getNumColumns(tree);
// Save away the original AST
originalTree = tree;
boolean isPartitionStats = isPartitionLevelStats(tree);
PartitionList partList = null;
// Validate the column list before looking at the partition clause: partition columns are
// rejected first, then names that are not columns of the table.
checkForPartitionColumns(colNames, getPartitionKeys(tableName));
validateSpecifiedColumnNames(tableName, colNames);
if (isPartitionStats) {
// Partition-level stats: column types are read from the named partition.
isTableLevel = false;
partList = getPartKeyValuePairsFromAST(tree);
validatePartitionClause(tableName, partList);
partName = constructPartitionName(tableName, partList);
colType = getPartitionColumnType(tableName, partName, colNames, numCols);
} else {
// Table-level stats: column types are read from the table; partName stays unset.
isTableLevel = true;
colType = getTableColumnType(tableName, colNames, numCols);
}
int numBitVectors = getNumBitVectorsForNDVEstimation(conf);
// genRewrittenQuery also sets isRewritten to true.
rewrittenQuery = genRewrittenQuery(colNames, numBitVectors, partList, isPartitionStats);
rewrittenTree = genRewrittenTree(rewrittenQuery);
} else {
// Not an analyze table column compute statistics statement - don't do any rewrites
originalTree = rewrittenTree = tree;
rewrittenQuery = null;
isRewritten = false;
}
}
/**
 * Fails early if any column named in the statement is not a column of the table.
 *
 * Each specified name is lower-cased before it is looked up in the table's column names.
 *
 * @param tableName name of the table to look up
 * @param specifiedCols column names given in the statement
 * @throws SemanticException with INVALID_TABLE if the table lookup fails (cause attached), or
 *         with INVALID_COLUMN naming the first unknown column and listing the valid ones
 */
private void validateSpecifiedColumnNames(String tableName, List<String> specifiedCols)
    throws SemanticException {
  List<FieldSchema> fields;
  try {
    fields = db.getTable(tableName).getAllCols();
  } catch (HiveException e) {
    throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName), e);
  }
  List<String> tableCols = Utilities.getColumnNamesFromFieldSchema(fields);
  for (String sc : specifiedCols) {
    if (!tableCols.contains(sc.toLowerCase())) {
      String msg = "'" + sc + "' (possible columns are " + tableCols.toString() + ")";
      throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(msg));
    }
  }
}
/**
 * Looks up the table and returns the names of its partition keys.
 *
 * @param tableName name of the table to look up
 * @return partition key names as produced by Utilities.getColumnNamesFromFieldSchema
 * @throws SemanticException with INVALID_TABLE if the lookup fails; the HiveException is
 *         attached as the cause
 */
private List<String> getPartitionKeys(String tableName) throws SemanticException {
  List<FieldSchema> fields;
  try {
    fields = db.getTable(tableName).getPartitionKeys();
  } catch (HiveException e) {
    throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(tableName), e);
  }
  return Utilities.getColumnNamesFromFieldSchema(fields);
}
/**
 * Rejects the statement if any requested column is a partition column.
 *
 * Names are compared case-insensitively. Partition columns are scanned in the outer loop, so
 * the first offending column reported follows partition-key order.
 *
 * @param specifiedCols column names given in the statement
 * @param partCols partition key names of the table
 * @throws SemanticException with COLUMNSTATSCOLLECTOR_INVALID_COLUMN and a hint naming the
 *         offending column
 */
private void checkForPartitionColumns(List<String> specifiedCols, List<String> partCols)
    throws SemanticException {
  // Raise error if user has specified partition column for stats
  for (String pc : partCols) {
    for (String sc : specifiedCols) {
      if (pc.equalsIgnoreCase(sc)) {
        throw new SemanticException(ErrorMsg.COLUMNSTATSCOLLECTOR_INVALID_COLUMN.getMsg()
            + " [Try removing column '" + sc + "' from column list]");
      }
    }
  }
}
/**
 * Runs semantic analysis on whichever tree the constructor prepared.
 *
 * Without a rewrite, the original tree is analyzed under the caller's context. With a
 * rewrite, the query block is tagged with the table, column and (for partition-level stats)
 * partition details, and the rewritten tree is analyzed under this analyzer's own context.
 *
 * @param ast not read by this method; the trees stored by the constructor are used instead
 * @param origCtx context used only when the statement was not rewritten
 * @throws SemanticException propagated from the analysis steps
 */
@Override
public void analyze(ASTNode ast, Context origCtx) throws SemanticException {
  // initialize QB
  init();

  if (!isRewritten) {
    initCtx(origCtx);
    LOG.info("Invoking analyze on original query");
    analyzeInternal(originalTree);
    return;
  }

  // Setup the necessary metadata since we originate from an analyze rewrite
  QB rewriteQb = getQB();
  rewriteQb.setAnalyzeRewrite(true);

  QBParseInfo parseInfo = rewriteQb.getParseInfo();
  parseInfo.setTableName(tableName);
  parseInfo.setTblLvl(isTableLevel);
  if (!isTableLevel) {
    parseInfo.setPartName(partName);
  }
  parseInfo.setColName(colNames);
  parseInfo.setColType(colType);

  initCtx(ctx);
  LOG.info("Invoking analyze on rewritten query");
  analyzeInternal(rewrittenTree);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy