/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.parse;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.antlr.runtime.tree.Tree;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.net.URLCodec;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.StrictChecks;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.StorageFormatDescriptor;
import org.apache.hadoop.hive.ql.lockmgr.LockException;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.plan.BasicStatsWork;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc.LoadFileType;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.mapred.InputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_LOAD_DATA_USE_NATIVE_API;
/**
* LoadSemanticAnalyzer: semantic analysis for LOAD DATA statements. Depending
* on the source files and the target table, the load is planned either as a
* direct file move or rewritten into an INSERT query.
*/
public class LoadSemanticAnalyzer extends SemanticAnalyzer {
private static final Logger LOG = LoggerFactory.getLogger(LoadSemanticAnalyzer.class);
private boolean queryReWritten = false;
private final String tempTblNameSuffix = "__temp_table_for_load_data__";
// AST specific data
private Tree fromTree, tableTree;
private boolean isLocal = false, isOverWrite = false;
private String inputFormatClassName = null;
private String serDeClassName = null;
public LoadSemanticAnalyzer(QueryState queryState) throws SemanticException {
super(queryState);
}
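/**
* Expands the given path via globbing, skipping hidden files (names starting
* with "_" or ".") except the export metadata file. If the glob resolves to a
* single directory, its non-hidden children are returned instead.
*/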
public static FileStatus[] matchFilesOrDir(FileSystem fs, Path path)
throws IOException {
FileStatus[] srcs = fs.globStatus(path, new PathFilter() {
@Override
public boolean accept(Path p) {
String name = p.getName();
return name.equals(EximUtil.METADATA_NAME) || (!name.startsWith("_") && !name.startsWith("."));
}
});
if ((srcs != null) && srcs.length == 1) {
if (srcs[0].isDirectory()) {
srcs = fs.listStatus(srcs[0].getPath(), new PathFilter() {
@Override
public boolean accept(Path p) {
String name = p.getName();
return !name.startsWith("_") && !name.startsWith(".");
}
});
}
}
return (srcs);
}
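/**
* Resolves the load source path to a fully qualified URI. Relative paths are
* anchored at the client's working directory for LOCAL loads, or at the user's
* home directory otherwise; a missing scheme or authority is taken from the
* default file system.
*/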
private URI initializeFromURI(String fromPath, boolean isLocal)
throws IOException, URISyntaxException, SemanticException {
URI fromURI = new Path(fromPath).toUri();
String fromScheme = fromURI.getScheme();
String fromAuthority = fromURI.getAuthority();
String path = fromURI.getPath();
// generate absolute path relative to current directory or hdfs home
// directory
if (!path.startsWith("/")) {
if (isLocal) {
try {
path = new String(URLCodec.decodeUrl(
new Path(System.getProperty("user.dir"), fromPath).toUri().toString()
.getBytes("US-ASCII")), "US-ASCII");
} catch (DecoderException de) {
throw new SemanticException("URL Decode failed", de);
}
} else {
path = new Path(new Path("/user/" + System.getProperty("user.name")),
path).toString();
}
}
// set correct scheme and authority
if (StringUtils.isEmpty(fromScheme)) {
if (isLocal) {
// file for local
fromScheme = "file";
} else {
// use default values from fs.default.name
URI defaultURI = FileSystem.get(conf).getUri();
fromScheme = defaultURI.getScheme();
fromAuthority = defaultURI.getAuthority();
}
}
// if scheme is specified but not authority then use the default authority
if ((!fromScheme.equals("file")) && StringUtils.isEmpty(fromAuthority)) {
URI defaultURI = FileSystem.get(conf).getUri();
fromAuthority = defaultURI.getAuthority();
}
LOG.debug(fromScheme + "@" + fromAuthority + "@" + path);
return new URI(fromScheme, fromAuthority, path, null, null);
}
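/**
* Validates the source location against the target table and returns the files
* to load. If the source contains directories, or the file names do not line up
* with the table's bucketing, the statement is rewritten into an INSERT (see
* reparseAndSuperAnalyze) and null is returned.
*/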
private List<FileStatus> applyConstraintsAndGetFiles(URI fromURI, Table table) throws SemanticException {
FileStatus[] srcs = null;
// local mode implies that scheme should be "file"
// we can change this going forward
if (isLocal && !fromURI.getScheme().equals("file")) {
throw new SemanticException(ASTErrorUtils.getMsg(
ErrorMsg.ILLEGAL_PATH.getMsg(), fromTree,
"Source file system should be \"file\" if \"local\" is specified"));
}
try {
FileSystem fileSystem = FileSystem.get(fromURI, conf);
srcs = matchFilesOrDir(fileSystem, new Path(fromURI));
if (srcs == null || srcs.length == 0) {
throw new SemanticException(ASTErrorUtils.getMsg(
ErrorMsg.INVALID_PATH.getMsg(), fromTree,
"No files matching path " + fromURI));
}
for (FileStatus oneSrc : srcs) {
if (oneSrc.isDir()) {
reparseAndSuperAnalyze(table, fromURI);
return null;
}
}
AcidUtils.validateAcidFiles(table, srcs, fileSystem);
// Do another loop if table is bucketed
List<String> bucketCols = table.getBucketCols();
if (bucketCols != null && !bucketCols.isEmpty()) {
// Hive assumes that the user names the files to match the corresponding
// bucket, e.g. file names should follow the format 000000_0, 000000_1, etc.
// Here the 1st file belongs to bucket 0, the 2nd to bucket 1, and so on.
boolean[] bucketArray = new boolean[table.getNumBuckets()];
// initialize the array
Arrays.fill(bucketArray, false);
int numBuckets = table.getNumBuckets();
for (FileStatus oneSrc : srcs) {
String bucketName = oneSrc.getPath().getName();
//get the bucket id
String bucketIdStr =
Utilities.getBucketFileNameFromPathSubString(bucketName);
int bucketId = Utilities.getBucketIdFromFile(bucketIdStr);
LOG.debug("bucket ID for file " + oneSrc.getPath() + " = " + bucketId
+ " for table " + table.getFullyQualifiedName());
if (bucketId == -1 || bucketId >= numBuckets || bucketArray[bucketId]) {
reparseAndSuperAnalyze(table, fromURI);
return null;
}
bucketArray[bucketId] = true;
}
}
} catch (IOException e) {
// Has to use full name to make sure it does not conflict with
// org.apache.commons.lang.StringUtils
throw new SemanticException(ASTErrorUtils.getMsg(
ErrorMsg.INVALID_PATH.getMsg(), fromTree), e);
}
return Lists.newArrayList(srcs);
}
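// When the load has been rewritten into an INSERT, the temp table backing the
// source files is registered here so the superclass can resolve it by name.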
@Override
public void init(boolean clearPartsCache) {
Table tempTable = ctx.getTempTableForLoad();
if (tempTable != null) {
// tempTable is only set when load is rewritten.
super.init(clearPartsCache);
tabNameToTabObject.put(tempTable.getTableName().toLowerCase(), tempTable);
}
}
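// The second pass over a rewritten load is delegated to the regular
// SemanticAnalyzer; a plain LOAD is handled by analyzeLoad below.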
@Override
public void analyzeInternal(ASTNode ast) throws SemanticException {
if (ctx.getTempTableForLoad() != null) {
super.analyzeInternal(ast);
} else {
analyzeLoad(ast);
}
}
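// Analyzes a LOAD DATA statement of the (illustrative) form:
//   LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tab
//     [PARTITION (...)] [INPUTFORMAT 'inputformat' SERDE 'serde']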
private void analyzeLoad(ASTNode ast) throws SemanticException {
fromTree = ast.getChild(0);
tableTree = ast.getChild(1);
boolean inputInfo = false;
// Check the last node
ASTNode child = (ASTNode)ast.getChild(ast.getChildCount() - 1);
if (child.getToken().getType() == HiveParser.TOK_INPUTFORMAT) {
if (child.getChildCount() != 2) {
throw new SemanticException("FileFormat should contain both input format and Serde");
}
try {
inputFormatClassName = stripQuotes(child.getChild(0).getText());
serDeClassName = stripQuotes(child.getChild(1).getText());
inputInfo = true;
} catch (Exception e) {
throw new SemanticException("FileFormat inputFormatClassName or serDeClassName is incorrect");
}
}
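// Besides the mandatory path and table children, the AST may carry LOCAL
// and/or OVERWRITE tokens (plus the optional TOK_INPUTFORMAT handled above);
// the flags are inferred from the child count.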
if ((!inputInfo && ast.getChildCount() == 4) ||
(inputInfo && ast.getChildCount() == 5)) {
isLocal = true;
isOverWrite = true;
}
if ((!inputInfo && ast.getChildCount() == 3) ||
(inputInfo && ast.getChildCount() == 4)) {
if (ast.getChild(2).getText().toLowerCase().equals("local")) {
isLocal = true;
} else {
isOverWrite = true;
}
}
// initialize load path
URI fromURI;
try {
String fromPath = stripQuotes(fromTree.getText());
fromURI = initializeFromURI(fromPath, isLocal);
} catch (IOException | URISyntaxException e) {
throw new SemanticException(ASTErrorUtils.getMsg(
ErrorMsg.INVALID_PATH.getMsg(), fromTree, e.getMessage()), e);
}
// initialize destination table/partition
TableSpec ts = new TableSpec(db, conf, (ASTNode) tableTree);
if (ts.tableHandle.isView() || ts.tableHandle.isMaterializedView()) {
throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
}
if (ts.tableHandle.isNonNative()) {
HiveStorageHandler storageHandler = ts.tableHandle.getStorageHandler();
boolean isUseNativeApi = conf.getBoolVar(HIVE_LOAD_DATA_USE_NATIVE_API);
boolean supportAppend = isUseNativeApi && storageHandler.supportsAppendData(ts.tableHandle.getTTable(),
ts.getPartSpec() != null && !ts.getPartSpec().isEmpty());
if (supportAppend) {
LoadTableDesc loadTableWork =
new LoadTableDesc(new Path(fromURI), ts.tableHandle, isOverWrite, true, ts.getPartSpec());
Task<?> childTask =
TaskFactory.get(new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true, isLocal));
rootTasks.add(childTask);
return;
} else {
// launch a tez job
StorageFormatDescriptor ss = storageHandler.getStorageFormatDescriptor(ts.tableHandle.getTTable());
if (ss != null) {
inputFormatClassName = ss.getInputFormat();
serDeClassName = ss.getSerde();
reparseAndSuperAnalyze(ts.tableHandle, fromURI);
return;
}
throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
}
}
if (ts.tableHandle.isStoredAsSubDirectories()) {
throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
}
List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
if ((parts != null && parts.size() > 0)
&& (ts.partSpec == null || ts.partSpec.size() == 0)) {
// launch a tez job
reparseAndSuperAnalyze(ts.tableHandle, fromURI);
return;
}
List<String> bucketCols = ts.tableHandle.getBucketCols();
if (bucketCols != null && !bucketCols.isEmpty()) {
String error = StrictChecks.checkBucketing(conf);
if (error != null) {
// launch a tez job
reparseAndSuperAnalyze(ts.tableHandle, fromURI);
return;
}
}
// make sure the arguments make sense
List<FileStatus> files = applyConstraintsAndGetFiles(fromURI, ts.tableHandle);
if (queryReWritten) {
return;
}
// for managed tables, make sure the file formats match
if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType())
&& conf.getBoolVar(HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
ensureFileFormatsMatch(ts, files, fromURI);
}
inputs.add(toReadEntity(new Path(fromURI)));
// create final load/move work
boolean preservePartitionSpecs = false;
Map<String, String> partSpec = ts.getPartSpec();
if (partSpec == null) {
partSpec = new LinkedHashMap<>();
outputs.add(new WriteEntity(ts.tableHandle,
(isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE :
WriteEntity.WriteType.INSERT)));
} else {
try{
Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
if (part != null) {
if (isOverWrite){
outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT_OVERWRITE));
} else {
outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT));
// If partition already exists and we aren't overwriting it, then respect
// its current location info rather than picking it from the parent TableDesc
preservePartitionSpecs = true;
}
} else {
outputs.add(new WriteEntity(ts.tableHandle,
(isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE :
WriteEntity.WriteType.INSERT)));
}
} catch(HiveException e) {
throw new SemanticException(e);
}
}
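// Transactional (ACID) tables need a table write id and statement id
// allocated from the transaction manager before the move task runs.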
Long writeId = null;
int stmtId = -1;
boolean isTxnTable = AcidUtils.isTransactionalTable(ts.tableHandle);
if (isTxnTable) {
try {
writeId = getTxnMgr().getTableWriteId(ts.tableHandle.getDbName(),
ts.tableHandle.getTableName());
} catch (LockException ex) {
throw new SemanticException("Failed to allocate the write id", ex);
}
stmtId = getTxnMgr().getStmtIdAndIncrement();
}
// Note: this sets LoadFileType incorrectly for ACID; is that relevant for load?
// See setLoadFileType and setIsAcidIow calls elsewhere for an example.
LoadTableDesc loadTableWork = new LoadTableDesc(new Path(fromURI),
Utilities.getTableDesc(ts.tableHandle), partSpec, isOverWrite
? LoadFileType.REPLACE_ALL : LoadFileType.KEEP_EXISTING, writeId);
loadTableWork.setStmtId(stmtId);
loadTableWork.setInsertOverwrite(isOverWrite);
if (preservePartitionSpecs) {
// Note: preservePartitionSpecs=true implies inheritTableSpecs=false, but
// preservePartitionSpecs=false (the default) here is not sufficient
// info to set inheritTableSpecs=true.
loadTableWork.setInheritTableSpecs(false);
}
Task<?> childTask = TaskFactory.get(
new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true, isLocal)
);
rootTasks.add(childTask);
// If the user asked for stats to be collected:
// some stats, like number of rows, require a scan of the data,
// while others, like number of files, do not require a complete scan.
// Update only the stats that do not require a complete scan.
Task<?> statTask = null;
if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
BasicStatsWork basicStatsWork = new BasicStatsWork(loadTableWork);
basicStatsWork.setNoStatsAggregator(true);
basicStatsWork.setClearAggregatorStats(true);
StatsWork columnStatsWork = new StatsWork(ts.tableHandle, basicStatsWork, conf);
statTask = TaskFactory.get(columnStatsWork);
}
if (statTask != null) {
childTask.addDependentTask(statTask);
}
}
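/**
* Checks that the files being loaded are readable by the destination table's
* (or partition's) input format; invoked for managed tables when
* HiveConf.ConfVars.HIVECHECKFILEFORMAT is enabled.
*/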
private void ensureFileFormatsMatch(TableSpec ts, List<FileStatus> fileStatuses,
final URI fromURI)
throws SemanticException {
final Class<? extends InputFormat> destInputFormat;
try {
if (ts.getPartSpec() == null || ts.getPartSpec().isEmpty()) {
destInputFormat = ts.tableHandle.getInputFormatClass();
} else {
destInputFormat = ts.partHandle.getInputFormatClass();
}
} catch (HiveException e) {
throw new SemanticException(e);
}
try {
FileSystem fs = FileSystem.get(fromURI, conf);
boolean validFormat = HiveFileFormatUtils.checkInputFormat(fs, conf, destInputFormat,
fileStatuses);
if (!validFormat) {
throw new SemanticException(ErrorMsg.INVALID_FILE_FORMAT_IN_LOAD.getMsg());
}
} catch (Exception e) {
throw new SemanticException("Unable to load data to destination table." +
" Error: " + e.getMessage());
}
}
// Rewrite the load to launch an insert job.
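// The rewritten statement takes the (illustrative) form:
//   INSERT [OVERWRITE] TABLE <target> [PARTITION (...)]
//     SELECT * FROM <target>__temp_table_for_load_data__
// where the temp table is an in-memory Table object whose data location
// points at the source files.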
private void reparseAndSuperAnalyze(Table table, URI fromURI) throws SemanticException {
LOG.info("Load data triggered a Tez job instead of usual file operation");
// Step 1 : Create a temp table object
// Create a Table object
Table tempTableObj = new Table(new org.apache.hadoop.hive.metastore.api.Table(table.getTTable()));
// Construct a temp table name
String tempTblName = table.getTableName() + tempTblNameSuffix;
tempTableObj.setTableName(tempTblName);
// Reset table params
tempTableObj.setParameters(new HashMap<>());
// Set data location and input format, it must be text
tempTableObj.setDataLocation(new Path(fromURI));
if (inputFormatClassName != null && serDeClassName != null) {
try {
tempTableObj.setInputFormatClass(inputFormatClassName);
tempTableObj.setSerializationLib(serDeClassName);
} catch (HiveException e) {
throw new SemanticException("Load Data: Failed to set inputFormat or SerDe");
}
}
// Make the columns list for the temp table (input data file).
// Move all the partition columns at the end of table columns.
ArrayList<FieldSchema> colList = new ArrayList<>();
colList.addAll(table.getCols());
// inpPartSpec is a mapping from partition column name to its value.
Map<String, String> inpPartSpec = null;
// The partition spec was already validated by the caller when the TableSpec
// object was created, so there is no need to validate inpPartSpec here.
List<FieldSchema> parts = table.getPartCols();
if (tableTree.getChildCount() >= 2) {
ASTNode partSpecNode = (ASTNode) tableTree.getChild(1);
inpPartSpec = new HashMap<>(partSpecNode.getChildCount());
for (int i = 0; i < partSpecNode.getChildCount(); ++i) {
ASTNode partSpecValNode = (ASTNode) partSpecNode.getChild(i);
String partVal = null;
String partColName = unescapeIdentifier(partSpecValNode.getChild(0).getText().toLowerCase());
if (partSpecValNode.getChildCount() >= 2) { // in the form of T partition (ds="2010-03-03")
// Not stripping quotes here as we need to use it as it is while framing PARTITION clause
// in INSERT query.
partVal = partSpecValNode.getChild(1).getText();
}
inpPartSpec.put(partColName, partVal);
}
// Add only dynamic partition columns to the temp table (input data file).
// For static partitions, values would be obtained from partition(key=value...) clause.
for (FieldSchema fs : parts) {
String partKey = fs.getName();
// If a partition value is not there, then it is dynamic partition key.
if (inpPartSpec.get(partKey) == null) {
colList.add(fs);
}
}
} else {
// No static partitions specified and hence all are dynamic partition keys and need to be part
// of temp table (input data file).
colList.addAll(parts);
}
// Set columns list for temp table.
tempTableObj.setFields(colList);
// Wipe out partition columns
tempTableObj.setPartCols(new ArrayList<>());
// Step 2 : create the Insert query
StringBuilder rewrittenQueryStr = new StringBuilder();
if (isOverWrite) {
rewrittenQueryStr.append("insert overwrite table ");
} else {
rewrittenQueryStr.append("insert into table ");
}
rewrittenQueryStr.append(getFullTableNameForSQL((ASTNode)(tableTree.getChild(0))));
addPartitionColsToInsert(table.getPartCols(), inpPartSpec, rewrittenQueryStr);
rewrittenQueryStr.append(" select * from ");
rewrittenQueryStr.append(tempTblName);
// Step 3 : parse the query
// Set dynamic partitioning to nonstrict so that queries do not need any partition
// references.
HiveConf.setVar(conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict");
// Parse the rewritten query string
Context rewrittenCtx;
rewrittenCtx = new Context(conf);
// We keep track of all the contexts that are created by this query
// so we can clear them when we finish execution
ctx.addSubContext(rewrittenCtx);
rewrittenCtx.setExplainConfig(ctx.getExplainConfig());
rewrittenCtx.setExplainPlan(ctx.isExplainPlan());
rewrittenCtx.setCmd(rewrittenQueryStr.toString());
rewrittenCtx.setTempTableForLoad(tempTableObj);
ASTNode rewrittenTree;
try {
LOG.info("Going to reparse <" + ctx.getCmd() + "> as \n<" + rewrittenQueryStr.toString() + ">");
rewrittenTree = ParseUtils.parse(rewrittenQueryStr.toString(), rewrittenCtx);
} catch (ParseException e) {
throw new SemanticException(ErrorMsg.LOAD_DATA_LAUNCH_JOB_PARSE_ERROR.getMsg(), e);
}
// Step 4 : Reanalyze
super.analyze(rewrittenTree, rewrittenCtx);
queryReWritten = true;
}
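// Expose the write entities gathered during analysis of this statement.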
@Override
public Set<WriteEntity> getAllOutputs() {
return outputs;
}
}