/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.BlobStorageUtils;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.HiveStatsUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.ddl.DDLUtils;
import org.apache.hadoop.hive.ql.ddl.table.create.CreateTableDesc;
import org.apache.hadoop.hive.ql.ddl.view.create.CreateMaterializedViewDesc;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask;
import org.apache.hadoop.hive.ql.exec.repl.util.ReplUtils;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.DataContainer;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.merge.MergeFileTask;
import org.apache.hadoop.hive.ql.lockmgr.HiveLock;
import org.apache.hadoop.hive.ql.lockmgr.HiveLockManager;
import org.apache.hadoop.hive.ql.lockmgr.HiveLockMode;
import org.apache.hadoop.hive.ql.lockmgr.HiveLockObj;
import org.apache.hadoop.hive.ql.lockmgr.HiveLockObject;
import org.apache.hadoop.hive.ql.lockmgr.LockException;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol;
import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.SortCol;
import org.apache.hadoop.hive.ql.parse.ExplainConfiguration.AnalyzeState;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
import org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc.LoadFileType;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.util.DirectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Closeable;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Properties;
import static org.apache.hadoop.hive.ql.exec.Utilities.BLOB_MANIFEST_FILE;
/**
* MoveTask implementation.
**/
public class MoveTask extends Task<MoveWork> implements Serializable {
private static final long serialVersionUID = 1L;
private static transient final Logger LOG = LoggerFactory.getLogger(MoveTask.class);
public MoveTask() {
super();
}
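// For CTAS queries writing to blob storage, a _blob_files_kept manifest may list the files
// produced by the query. If the manifest exists, copy exactly those files in parallel from the
// source directory recorded in the manifest to the target directory and return true; otherwise
// return false so the caller falls back to the regular move logic.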
private boolean moveFilesUsingManifestFile(FileSystem fs, Path sourcePath, Path targetPath)
throws HiveException, IOException {
if (work.isCTAS() && BlobStorageUtils.isBlobStorageFileSystem(conf, fs)) {
if (fs.exists(new Path(sourcePath, BLOB_MANIFEST_FILE))) {
LOG.debug("Attempting to copy using the paths available in {}", new Path(sourcePath, BLOB_MANIFEST_FILE));
ArrayList<String> filesKept;
try (FSDataInputStream inStream = fs.open(new Path(sourcePath, BLOB_MANIFEST_FILE))) {
String paths = IOUtils.toString(inStream, Charset.defaultCharset());
filesKept = new ArrayList<>(Arrays.asList(paths.split(System.lineSeparator())));
}
// Remove the first entry from the list, it is the source path.
Path srcPath = new Path(filesKept.remove(0));
LOG.info("Copying files {} from {} to {}", filesKept, srcPath, targetPath);
// Do the move using the filesKept now directly to the target dir.
Utilities.moveSpecifiedFilesInParallel(conf, fs, srcPath, targetPath, new HashSet<>(filesKept));
return true;
}
// Fallback case: whenever the _blob_files_kept manifest was not created, fall back to the
// normal move logic. The manifest is also not created when the source table is empty.
}
return false;
}
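// Moves sourcePath to targetPath, preferring the manifest-based copy when available and
// otherwise delegating to the DFS or local move helpers depending on isDfsDir.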
private void moveFile(Path sourcePath, Path targetPath, boolean isDfsDir)
throws HiveException {
try {
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.perfLogBegin("MoveTask", PerfLogger.FILE_MOVES);
String mesg = "Moving data to " + (isDfsDir ? "" : "local ") + "directory "
+ targetPath.toString();
String mesg_detail = " from " + sourcePath.toString();
console.printInfo(mesg, mesg_detail);
FileSystem fs = sourcePath.getFileSystem(conf);
// if _blob_files_kept is present, use it to move the files. Else fall back to normal case.
if (moveFilesUsingManifestFile(fs, sourcePath, targetPath)) {
perfLogger.perfLogEnd("MoveTask", PerfLogger.FILE_MOVES);
return;
}
if (isDfsDir) {
moveFileInDfs (sourcePath, targetPath, conf);
} else {
// This is a local file
FileSystem dstFs = FileSystem.getLocal(conf);
moveFileFromDfsToLocal(sourcePath, targetPath, fs, dstFs);
}
perfLogger.perfLogEnd("MoveTask", PerfLogger.FILE_MOVES);
} catch (Exception e) {
throw new HiveException("Unable to move source " + sourcePath + " to destination "
+ targetPath, e);
}
}
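// Moves data between two DFS locations. If the source exists it is renamed to the target
// (creating missing parent directories when multilevel-dir support is enabled); if it does
// not exist, an empty target directory is created instead.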
private void moveFileInDfs (Path sourcePath, Path targetPath, HiveConf conf)
throws HiveException, IOException {
final FileSystem srcFs, tgtFs;
try {
tgtFs = targetPath.getFileSystem(conf);
} catch (IOException e) {
LOG.error("Failed to get dest fs", e);
throw new HiveException(e.getMessage(), e);
}
try {
srcFs = sourcePath.getFileSystem(conf);
} catch (IOException e) {
LOG.error("Failed to get src fs", e);
throw new HiveException(e.getMessage(), e);
}
// if source exists, rename. Otherwise, create an empty directory
if (srcFs.exists(sourcePath)) {
Path deletePath = null;
// If there are multiple levels of missing directories, fs.rename fails, so first
// create targetPath.getParent() if it does not exist.
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_INSERT_INTO_MULTILEVEL_DIRS)) {
deletePath = createTargetPath(targetPath, tgtFs);
}
//For acid table incremental replication, just copy the content of staging directory to destination.
//No need to clean it.
if (work.isNeedCleanTarget()) {
Hive.clearDestForSubDirSrc(conf, targetPath, sourcePath, false);
}
// Set isManaged to false as this is not load data operation for which it is needed.
if (!Hive.moveFile(conf, sourcePath, targetPath, true, false, false)) {
try {
if (deletePath != null) {
tgtFs.delete(deletePath, true);
}
} catch (IOException e) {
LOG.info("Unable to delete the path created for facilitating rename: {}",
deletePath);
}
throw new HiveException("Unable to rename: " + sourcePath
+ " to: " + targetPath);
}
} else if (!tgtFs.mkdirs(targetPath)) {
throw new HiveException("Unable to make directory: " + targetPath);
}
}
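// Copies the files under sourcePath from DFS to a local target directory, cleaning the
// destination's contents first (rather than deleting and recreating it, which would lose
// the original permissions on RawLocalFileSystem).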
private void moveFileFromDfsToLocal(Path sourcePath, Path targetPath, FileSystem fs,
FileSystem dstFs) throws HiveException, IOException {
// RawLocalFileSystem seems not able to get the right permissions for a local file, it
// always returns hdfs default permission (00666). So we can not overwrite a directory
// by deleting and recreating the directory and restoring its permissions. We should
// delete all its files and subdirectories instead.
if (dstFs.exists(targetPath)) {
if (dstFs.isDirectory(targetPath)) {
FileStatus[] destFiles = dstFs.listStatus(targetPath);
for (FileStatus destFile : destFiles) {
if (!dstFs.delete(destFile.getPath(), true)) {
throw new IOException("Unable to clean the destination directory: " + targetPath);
}
}
} else {
throw new HiveException("Target " + targetPath + " is not a local directory.");
}
} else {
if (!FileUtils.mkdir(dstFs, targetPath, conf)) {
throw new HiveException("Failed to create local target directory " + targetPath);
}
}
if (fs.exists(sourcePath)) {
FileStatus[] srcs = fs.listStatus(sourcePath, FileUtils.HIDDEN_FILES_PATH_FILTER);
for (FileStatus status : srcs) {
fs.copyToLocalFile(status.getPath(), targetPath);
}
}
}
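// Creates any missing ancestors of targetPath so that a later rename can succeed, and
// returns the topmost directory that was newly created (or null) so it can be deleted
// if the rename fails.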
private Path createTargetPath(Path targetPath, FileSystem fs) throws IOException {
Path deletePath = null;
Path mkDirPath = targetPath.getParent();
if (mkDirPath != null && !fs.exists(mkDirPath)) {
Path actualPath = mkDirPath;
// Example: targetPath is /x/y/z/1/2/3 and only /x/y/z exists in the file system.
// Create the structure up to /x/y/z/1/2 so that the rename works for the multilevel
// directory, and if the rename fails, delete the created path /x/y/z/1.
// If targetPath involves multiple multilevel directories, e.g. /x/y/z/1/2/3 and
// /x/y/z/1/2/4, the renames are not atomic; they are executed one by one.
while (actualPath != null && !fs.exists(actualPath)) {
deletePath = actualPath;
actualPath = actualPath.getParent();
}
fs.mkdirs(mkDirPath);
}
return deletePath;
}
// Release all the locks acquired for this object
// This becomes important for multi-table inserts when one branch may take much more
// time than the others. It is better to release the lock for this particular insert.
// The other option is to wait for all the branches to finish, or set
// hive.multi.insert.move.tasks.share.dependencies to true, which will mean that the
// first multi-insert results will be available when all of the branches of multi-table
// inserts are done.
private void releaseLocks(LoadTableDesc ltd) throws HiveException {
// nothing needs to be done
if (!conf.getBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY)) {
LOG.debug("No locks to release because Hive concurrency support is not enabled");
return;
}
if (context.getHiveTxnManager().supportsAcid()) {
//Acid LM doesn't maintain getOutputLockObjects(); this 'if' just makes logic more explicit
return;
}
HiveLockManager lockMgr = context.getHiveTxnManager().getLockManager();
WriteEntity output = context.getLoadTableOutputMap().get(ltd);
List<HiveLockObj> lockObjects = context.getOutputLockObjects().get(output);
if (CollectionUtils.isEmpty(lockObjects)) {
LOG.debug("No locks found to release");
return;
}
LOG.info("Releasing {} locks", lockObjects.size());
for (HiveLockObj lockObj : lockObjects) {
List<HiveLock> locks = lockMgr.getLocks(lockObj.getObj(), false, true);
for (HiveLock lock : locks) {
if (lock.getHiveLockMode() == lockObj.getMode()) {
if (context.getHiveLocks().remove(lock)) {
try {
lockMgr.unlock(lock);
} catch (LockException le) {
// should be OK since the lock is ephemeral and will eventually be deleted
// when the query finishes and zookeeper session is closed.
LOG.warn("Could not release lock {}", lock.getHiveLockObject().getName(), le);
}
}
}
}
}
}
// We check if there is only one immediate child task and that it is a stats task.
public boolean hasFollowingStatsTask() {
if (this.getNumChild() == 1) {
return this.getChildTasks().get(0) instanceof StatsTask;
}
return false;
}
// Whether statistics need to be reset as part of MoveTask execution.
private boolean resetStatisticsProps(Table table) {
if (hasFollowingStatsTask()) {
// If there's a follow-on stats task then the stats will be correct after load, so don't
// need to reset the statistics.
return false;
}
if (!work.getIsInReplicationScope()) {
// If the load is not happening during replication and there is no follow-on stats
// task, stats will be inaccurate after the load and so need to be reset.
return true;
}
// If we are loading a table during replication, the stats will also be replicated
// and hence accurate. No need to reset those.
return false;
}
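// Holder for the bucketing/sorting metadata inferred from the task that produced the data,
// used when updating a partition's bucket and sort columns after the load.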
private final static class TaskInformation {
public List<BucketCol> bucketCols = null;
public List<SortCol> sortCols = null;
public int numBuckets = -1;
public Task task;
public String path;
public TaskInformation(Task task, String path) {
this.task = task;
this.path = path;
}
}
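// Main entry point: moves the files described by the MoveWork (load-file, multi-file and
// load-table work) to their final locations, records lineage for the loaded data, and
// releases any locks held for the load.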
@Override
public int execute() {
try {
initializeFromDeferredContext();
} catch (HiveException he) {
return processHiveException(he);
}
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("Executing MoveWork " + System.identityHashCode(work)
+ " with " + work.getLoadFileWork() + "; " + work.getLoadTableWork() + "; "
+ work.getLoadMultiFilesWork());
}
if (context.getExplainAnalyze() == AnalyzeState.RUNNING) {
return 0;
}
try (LocalTableLock lock = acquireLockForFileMove(work.getLoadTableWork())) {
if (checkAndCommitNatively(work, conf)) {
return 0;
}
Hive db = getHive();
// Do any hive related operations like moving tables and files
// to appropriate locations
LoadFileDesc lfd = work.getLoadFileWork();
if (lfd != null) {
Path targetPath = lfd.getTargetDir();
Path sourcePath = lfd.getSourcePath();
if (targetPath.equals(sourcePath)) {
Utilities.FILE_OP_LOGGER.debug("MoveTask not moving " + sourcePath);
} else {
Utilities.FILE_OP_LOGGER.debug("MoveTask moving " + sourcePath + " to " + targetPath);
if(lfd.getWriteType() == AcidUtils.Operation.INSERT) {
//'targetPath' is table root of un-partitioned table or partition
//'sourcePath' result of 'select ...' part of CTAS statement
assert lfd.getIsDfsDir();
FileSystem srcFs = sourcePath.getFileSystem(conf);
FileStatus[] srcs = srcFs.globStatus(sourcePath);
if(srcs != null) {
Hive.moveAcidFiles(srcFs, srcs, targetPath, null, conf);
} else {
LOG.debug("No files found to move from " + sourcePath + " to " + targetPath);
}
}
else {
FileSystem targetFs = targetPath.getFileSystem(conf);
if (!targetFs.exists(targetPath.getParent())){
targetFs.mkdirs(targetPath.getParent());
}
moveFile(sourcePath, targetPath, lfd.getIsDfsDir());
}
}
}
// Multi-file load is for dynamic partitions when some partitions do not
// need to merge and they can simply be moved to the target directory.
// This is also used for MM table conversion.
LoadMultiFilesDesc lmfd = work.getLoadMultiFilesWork();
if (lmfd != null) {
boolean isDfsDir = lmfd.getIsDfsDir();
List<String> targetPrefixes = lmfd.getTargetPrefixes();
for (int i = 0; i < lmfd.getSourceDirs().size(); i++) {
  Path srcPath = lmfd.getSourceDirs().get(i);
  Path destPath = lmfd.getTargetDirs().get(i);
  String filePrefix = targetPrefixes == null ? null : targetPrefixes.get(i);
  FileSystem destFs = destPath.getFileSystem(conf);
  if (filePrefix == null) {
    if (!destFs.exists(destPath.getParent())) {
      destFs.mkdirs(destPath.getParent());
    }
    moveFile(srcPath, destPath, isDfsDir);
  } else {
    // Move each child file separately, prepending the target prefix to its name.
    if (!destFs.exists(destPath)) {
      destFs.mkdirs(destPath);
    }
    FileSystem srcFs = srcPath.getFileSystem(conf);
    FileStatus[] children = srcFs.listStatus(srcPath);
    if (children != null) {
      for (FileStatus child : children) {
        Path childSrc = child.getPath();
        moveFile(childSrc, new Path(destPath, filePrefix + childSrc.getName()), isDfsDir);
      }
    }
  }
}
}
// Next we do this for tables and partitions
LoadTableDesc tbd = work.getLoadTableWork();
if (tbd != null) {
  logMessage(tbd);
  Table table = db.getTable(tbd.getTable().getTableName());
  checkFileFormats(db, tbd, table);
  boolean isFullAcidOp = work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID
      && !tbd.isMmTable();
  // Create a data container
  DataContainer dc = null;
  if (tbd.getPartitionSpec().size() == 0) {
    // Unpartitioned table: load the data directly into the table.
    dc = new DataContainer(table.getTTable());
    // Assumed parameter list; mirrors the loadPartition call used for static partitions below.
    db.loadTable(tbd.getSourcePath(), tbd.getTable().getTableName(), tbd.getLoadFileType(),
        work.isSrcLocal(), isSkewedStoredAsDirs(tbd), isFullAcidOp, resetStatisticsProps(table),
        tbd.getWriteId(), tbd.getStmtId(), tbd.isInsertOverwrite(), tbd.isDirectInsert());
    if (work.getOutputs() != null) {
      DDLUtils.addIfAbsentByName(new WriteEntity(table,
          getWriteType(tbd, work.getLoadTableWork().getWriteType())), work.getOutputs());
    }
  } else {
    LOG.info("Partition is: {}", tbd.getPartitionSpec());
    // Check if the bucketing and/or sorting columns were inferred
    TaskInformation ti = new TaskInformation(this, tbd.getSourcePath().toUri().toString());
    inferTaskInformation(ti);
    // deal with dynamic partitions
    DynamicPartitionCtx dpCtx = tbd.getDPCtx();
    if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions
// if _blob_files_kept is present, use it to move the files to the target path
// before loading the partitions.
moveFilesUsingManifestFile(tbd.getSourcePath().getFileSystem(conf),
tbd.getSourcePath(), dpCtx.getRootPath());
dc = handleDynParts(db, table, tbd, ti, dpCtx);
} else { // static partitions
dc = handleStaticParts(db, table, tbd, ti);
}
}
if (dc != null) {
// If we are doing an update or a delete the number of columns in the table will not
// match the number of columns in the file sink. For update there will be one too many
// (because of the ROW__ID), and in the case of the delete there will be just the
// ROW__ID, which we don't need to worry about from a lineage perspective.
List<FieldSchema> tableCols = null;
switch (work.getLoadTableWork().getWriteType()) {
case DELETE:
case UPDATE:
// Pass an empty list as no columns will be written to the file.
// TODO I should be able to make this work for update
tableCols = new ArrayList<>();
break;
default:
tableCols = table.getCols();
break;
}
queryState.getLineageState().setLineage(tbd.getSourcePath(), dc, tableCols);
}
releaseLocks(tbd);
}
return 0;
} catch (HiveException he) {
return processHiveException(he);
} catch (Exception e) {
console.printError("Failed with exception " + e.getMessage(), "\n"
+ StringUtils.stringifyException(e));
setException(e);
LOG.error("MoveTask failed", e);
return ReplUtils.handleException(work.isReplication(), e, work.getDumpDirectory(), work.getMetricCollector(),
getName(), conf);
}
}
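// Prints details of a HiveException to the console, records it on the task, and determines
// the error code to return, delegating to ReplUtils for replication-aware handling.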
private int processHiveException(HiveException he) {
int errorCode = 1;
if (he.getCanonicalErrorMsg() != ErrorMsg.GENERIC_ERROR) {
errorCode = he.getCanonicalErrorMsg().getErrorCode();
if (he.getCanonicalErrorMsg() == ErrorMsg.UNRESOLVED_RT_EXCEPTION) {
console.printError("Failed with exception " + he.getMessage(), "\n"
+ StringUtils.stringifyException(he));
} else {
console.printError("Failed with exception " + he.getMessage()
+ "\nRemote Exception: " + he.getRemoteErrorMsg());
console.printInfo("\n", StringUtils.stringifyException(he),false);
}
}
setException(he);
errorCode = ReplUtils.handleException(work.isReplication(), he, work.getDumpDirectory(),
work.getMetricCollector(), getName(), conf);
return errorCode;
}
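// If this task was created with a deferred work context, let the MoveWork initialize itself
// from that context before execution.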
private void initializeFromDeferredContext() throws HiveException {
if (null != getDeferredWorkContext()) {
work.initializeFromDeferredContext(getDeferredWorkContext());
}
}
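// Prints a "Loading data to table ... partition (...)" message for the given load to the
// console and, when enabled, to the file-operation trace log.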
public void logMessage(LoadTableDesc tbd) {
StringBuilder mesg = new StringBuilder("Loading data to table ")
.append( tbd.getTable().getTableName());
if (tbd.getPartitionSpec().size() > 0) {
mesg.append(" partition (");
Map<String, String> partSpec = tbd.getPartitionSpec();
for (String key: partSpec.keySet()) {
mesg.append(key).append('=').append(partSpec.get(key)).append(", ");
}
mesg.setLength(mesg.length()-2);
mesg.append(')');
}
String mesg_detail = " from " + tbd.getSourcePath();
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace(mesg.toString() + " " + mesg_detail);
}
console.printInfo(mesg.toString(), mesg_detail);
}
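// Loads a single static partition: validates the partition values, calls Hive.loadPartition,
// updates inferred bucket/sort columns on the partition if needed, and registers the partition
// as a write entity for post-execution hooks.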
private DataContainer handleStaticParts(Hive db, Table table, LoadTableDesc tbd,
TaskInformation ti) throws HiveException, IOException, InvalidOperationException {
List<String> partVals = MetaStoreUtils.getPvals(table.getPartCols(), tbd.getPartitionSpec());
db.validatePartitionNameCharacters(partVals);
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("loadPartition called from " + tbd.getSourcePath()
+ " into " + tbd.getTable().getTableName());
}
db.loadPartition(tbd.getSourcePath(), db.getTable(tbd.getTable().getTableName()),
tbd.getPartitionSpec(), tbd.getLoadFileType(), tbd.getInheritTableSpecs(),
tbd.getInheritLocation(), isSkewedStoredAsDirs(tbd), work.isSrcLocal(),
work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID &&
!tbd.isMmTable(),
resetStatisticsProps(table), tbd.getWriteId(), tbd.getStmtId(),
tbd.isInsertOverwrite(), tbd.isDirectInsert());
Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false);
// See the comment inside updatePartitionBucketSortColumns.
if (!tbd.isMmTable() && (ti.bucketCols != null || ti.sortCols != null)) {
updatePartitionBucketSortColumns(db, table, partn, ti.bucketCols,
ti.numBuckets, ti.sortCols);
}
DataContainer dc = new DataContainer(table.getTTable(), partn.getTPartition());
// add this partition to post-execution hook
if (work.getOutputs() != null) {
DDLUtils.addIfAbsentByName(new WriteEntity(partn,
getWriteType(tbd, work.getLoadTableWork().getWriteType())), work.getOutputs());
}
return dc;
}
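// Loads dynamically generated partitions: determines the statement id and dynamic partition
// specs for direct-insert/MM writes, then loads the partitions produced under the source path.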
private DataContainer handleDynParts(Hive db, Table table, LoadTableDesc tbd,
TaskInformation ti, DynamicPartitionCtx dpCtx) throws HiveException,
IOException, InvalidOperationException {
DataContainer dc;
// In case of direct insert, we need to get the statementId in order to make a merge statement work properly.
// In case of a merge statement there will be multiple FSOs and multiple MoveTasks. One for the INSERT, one for
// the UPDATE and one for the DELETE part of the statement. If the direct insert is turned off, these are identified
// by the staging directory path they are using. Also the partition listing will happen within the staging directories,
// so all partitions will be listed only in one MoveTask. But in case of direct insert, there won't be any staging dir
// only the table dir. So all partitions and all deltas will be listed by all MoveTasks. If we have the statementId
// we could restrict the file listing to the directory the particular MoveTask is responsible for.
int statementId = tbd.getStmtId();
if (tbd.isDirectInsert() || tbd.isMmTable()) {
statementId = queryPlan.getStatementIdForAcidWriteType(work.getLoadTableWork().getWriteId(),
tbd.getMoveTaskId(), work.getLoadTableWork().getWriteType(), tbd.getSourcePath(), statementId);
LOG.debug("The statementId used when loading the dynamic partitions is " + statementId);
}
Map<String, List<Path>> dynamicPartitionSpecs = null;
if (tbd.isMmTable() || tbd.isDirectInsert()) {
dynamicPartitionSpecs = queryPlan.getDynamicPartitionSpecs(work.getLoadTableWork().getWriteId(), tbd.getMoveTaskId(),
work.getLoadTableWork().getWriteType(), tbd.getSourcePath());
}
Map dps = Utilities.getFullDPSpecs(conf, dpCtx, dynamicPartitionSpecs);
console.printInfo(System.getProperty("line.separator"));
long startTime = System.currentTimeMillis();
// load the list of DP partitions and return the list of partition specs
// TODO: In a follow-up to HIVE-1361, we should refactor loadDynamicPartitions
// to use Utilities.getFullDPSpecs() to get the list of full partSpecs.
// After that check the number of DPs created to not exceed the limit and
// iterate over it and call loadPartition() here.
// The reason we don't do inside HIVE-1361 is the latter is large and we
// want to isolate any potential issue it may introduce.
Map