// org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.plan;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.HiveStatsUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.ql.exec.Task;
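/**
 * Conditional task resolver for merging output files. It is invoked at run time to choose
 * which of the candidate tasks to run: the move task, the merge task, or the merge task
 * followed by a move. Developers can plug in their own resolvers.
 */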
public class ConditionalResolverMergeFiles implements ConditionalResolver,
Serializable {
private static final long serialVersionUID = 1L;
/**
 * Creates a resolver. The class declares no instance fields; everything the resolver needs
 * is passed in when the tasks are resolved.
 */
public ConditionalResolverMergeFiles() {
  // nothing to initialize
}
/**
* ConditionalResolverMergeFilesCtx.
*
*/
public static class ConditionalResolverMergeFilesCtx implements Serializable {
private static final long serialVersionUID = 1L;
List> listTasks;
private String dir;
private DynamicPartitionCtx dpCtx; // merge task could be after dynamic partition insert
private ListBucketingCtx lbCtx;
public ConditionalResolverMergeFilesCtx() {
}
/**
* @param dir
*/
public ConditionalResolverMergeFilesCtx(
List> listTasks, String dir) {
this.listTasks = listTasks;
this.dir = dir;
}
/**
* @return the dir
*/
public String getDir() {
return dir;
}
/**
* @param dir
* the dir to set
*/
public void setDir(String dir) {
this.dir = dir;
}
/**
* @return the listTasks
*/
public List> getListTasks() {
return listTasks;
}
/**
* @param listTasks
* the listTasks to set
*/
public void setListTasks(List> listTasks) {
this.listTasks = listTasks;
}
public DynamicPartitionCtx getDPCtx() {
return dpCtx;
}
public void setDPCtx(DynamicPartitionCtx dp) {
dpCtx = dp;
}
/**
* @return the lbCtx
*/
public ListBucketingCtx getLbCtx() {
return lbCtx;
}
/**
* @param lbCtx the lbCtx to set
*/
public void setLbCtx(ListBucketingCtx lbCtx) {
this.lbCtx = lbCtx;
}
}
public List> getTasks(HiveConf conf,
Object objCtx) {
ConditionalResolverMergeFilesCtx ctx = (ConditionalResolverMergeFilesCtx) objCtx;
String dirName = ctx.getDir();
List> resTsks = new ArrayList>();
// check if a map-reduce job is needed to merge the files
// If the current size is smaller than the target, merge
long trgtSize = conf.getLongVar(HiveConf.ConfVars.HIVEMERGEMAPFILESSIZE);
long avgConditionSize = conf
.getLongVar(HiveConf.ConfVars.HIVEMERGEMAPFILESAVGSIZE);
trgtSize = Math.max(trgtSize, avgConditionSize);
Task extends Serializable> mvTask = ctx.getListTasks().get(0);
Task extends Serializable> mrTask = ctx.getListTasks().get(1);
Task extends Serializable> mrAndMvTask = ctx.getListTasks().get(2);
try {
Path dirPath = new Path(dirName);
FileSystem inpFs = dirPath.getFileSystem(conf);
DynamicPartitionCtx dpCtx = ctx.getDPCtx();
if (inpFs.exists(dirPath)) {
// For each dynamic partition, check if it needs to be merged.
MapWork work;
if (mrTask.getWork() instanceof MapredWork) {
work = ((MapredWork) mrTask.getWork()).getMapWork();
} else if (mrTask.getWork() instanceof TezWork){
work = (MapWork) ((TezWork) mrTask.getWork()).getAllWork().get(0);
} else {
work = (MapWork) mrTask.getWork();
}
int lbLevel = (ctx.getLbCtx() == null) ? 0 : ctx.getLbCtx().calculateListBucketingLevel();
/**
* In order to make code easier to read, we write the following in the way:
* 1. the first if clause to differ dynamic partition and static partition
* 2. with static partition, we differ list bucketing from non-list bucketing.
* Another way to write it is to merge static partition w/ LB wit DP. In that way,
* we still need to further differ them, since one uses lbLevel and
* another lbLevel+numDPCols.
* The first one is selected mainly for easy to read.
*/
// Dynamic partition: replace input path (root to dp paths) with dynamic partition
// input paths.
if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
int numDPCols = dpCtx.getNumDPCols();
int dpLbLevel = numDPCols + lbLevel;
generateActualTasks(conf, resTsks, trgtSize, avgConditionSize, mvTask, mrTask,
mrAndMvTask, dirPath, inpFs, ctx, work, dpLbLevel);
} else { // no dynamic partitions
if(lbLevel == 0) {
// static partition without list bucketing
long totalSz = getMergeSize(inpFs, dirPath, avgConditionSize);
if (totalSz >= 0) { // add the merge job
setupMapRedWork(conf, work, trgtSize, totalSz);
resTsks.add(mrTask);
} else { // don't need to merge, add the move job
resTsks.add(mvTask);
}
} else {
// static partition and list bucketing
generateActualTasks(conf, resTsks, trgtSize, avgConditionSize, mvTask, mrTask,
mrAndMvTask, dirPath, inpFs, ctx, work, lbLevel);
}
}
} else {
resTsks.add(mvTask);
}
} catch (IOException e) {
e.printStackTrace();
}
// Only one of the tasks should ever be added to resTsks
assert(resTsks.size() == 1);
return resTsks;
}
/**
* This method generates actual task for conditional tasks. It could be
* 1. move task only
* 2. merge task only
* 3. merge task followed by a move task.
* It used to be true for dynamic partition only since static partition doesn't have #3.
* It changes w/ list bucketing. Static partition has #3 since it has sub-directories.
* For example, if a static partition is defined as skewed and stored-as-directores,
* instead of all files in one directory, it will create a sub-dir per skewed value plus
* default directory. So #3 is required for static partition.
* So, we move it to a method so that it can be used by both SP and DP.
* @param conf
* @param resTsks
* @param trgtSize
* @param avgConditionSize
* @param mvTask
* @param mrTask
* @param mrAndMvTask
* @param dirPath
* @param inpFs
* @param ctx
* @param work
* @param dpLbLevel
* @throws IOException
*/
private void generateActualTasks(HiveConf conf, List> resTsks,
long trgtSize, long avgConditionSize, Task extends Serializable> mvTask,
Task extends Serializable> mrTask, Task extends Serializable> mrAndMvTask, Path dirPath,
FileSystem inpFs, ConditionalResolverMergeFilesCtx ctx, MapWork work, int dpLbLevel)
throws IOException {
DynamicPartitionCtx dpCtx = ctx.getDPCtx();
// get list of dynamic partitions
FileStatus[] status = HiveStatsUtils.getFileStatusRecurse(dirPath, dpLbLevel, inpFs);
// cleanup pathToPartitionInfo
Map ptpi = work.getPathToPartitionInfo();
assert ptpi.size() == 1;
String path = ptpi.keySet().iterator().next();
PartitionDesc partDesc = ptpi.get(path);
TableDesc tblDesc = partDesc.getTableDesc();
ptpi.remove(path); // the root path is not useful anymore
// cleanup pathToAliases
Map> pta = work.getPathToAliases();
assert pta.size() == 1;
path = pta.keySet().iterator().next();
ArrayList aliases = pta.get(path);
pta.remove(path); // the root path is not useful anymore
// populate pathToPartitionInfo and pathToAliases w/ DP paths
long totalSz = 0;
boolean doMerge = false;
// list of paths that don't need to merge but need to move to the dest location
List toMove = new ArrayList();
for (int i = 0; i < status.length; ++i) {
long len = getMergeSize(inpFs, status[i].getPath(), avgConditionSize);
if (len >= 0) {
doMerge = true;
totalSz += len;
PartitionDesc pDesc = (dpCtx != null) ? generateDPFullPartSpec(dpCtx, status, tblDesc, i)
: partDesc;
work.resolveDynamicPartitionStoredAsSubDirsMerge(conf, status[i].getPath(), tblDesc,
aliases, pDesc);
} else {
toMove.add(status[i].getPath());
}
}
if (doMerge) {
// add the merge MR job
setupMapRedWork(conf, work, trgtSize, totalSz);
// add the move task for those partitions that do not need merging
if (toMove.size() > 0) {
// modify the existing move task as it is already in the candidate running tasks
// running the MoveTask and MR task in parallel may
// cause the mvTask write to /ds=1 and MR task write
// to /ds=1_1 for the same partition.
// make the MoveTask as the child of the MR Task
resTsks.add(mrAndMvTask);
MoveWork mvWork = (MoveWork) mvTask.getWork();
LoadFileDesc lfd = mvWork.getLoadFileWork();
Path targetDir = lfd.getTargetDir();
List targetDirs = new ArrayList(toMove.size());
for (int i = 0; i < toMove.size(); i++) {
String[] moveStrSplits = toMove.get(i).toUri().toString().split(Path.SEPARATOR);
int dpIndex = moveStrSplits.length - dpLbLevel;
Path target = targetDir;
while (dpIndex < moveStrSplits.length) {
target = new Path(target, moveStrSplits[dpIndex]);
dpIndex++;
}
targetDirs.add(target);
}
LoadMultiFilesDesc lmfd = new LoadMultiFilesDesc(toMove,
targetDirs, lfd.getIsDfsDir(), lfd.getColumns(), lfd.getColumnTypes());
mvWork.setLoadFileWork(null);
mvWork.setLoadTableWork(null);
mvWork.setMultiFilesDesc(lmfd);
} else {
resTsks.add(mrTask);
}
} else { // add the move task
resTsks.add(mvTask);
}
}
/**
 * Builds the partition descriptor for one listed directory: starts from a copy of the
 * dynamic partition context's partition spec and lets the warehouse fill it in from the
 * directory path.
 *
 * @param dpCtx dynamic partition context supplying the base partition spec
 * @param status the listed directories
 * @param tblDesc table descriptor for the new partition descriptor
 * @param i index into status of the directory to describe
 * @return a new partition descriptor for status[i]
 */
private PartitionDesc generateDPFullPartSpec(DynamicPartitionCtx dpCtx, FileStatus[] status,
    TableDesc tblDesc, int i) {
  // Copy first so the context's own spec is not modified; declaring the concrete type
  // avoids the raw cast when constructing the descriptor.
  LinkedHashMap<String, String> fullPartSpec =
      new LinkedHashMap<String, String>(dpCtx.getPartSpec());
  Warehouse.makeSpecFromName(fullPartSpec, status[i].getPath());
  return new PartitionDesc(tblDesc, fullPartSpec);
}
/**
 * Applies the target size to every split-size setting of the merge work.
 *
 * @param conf not used by this method
 * @param mWork the map work to configure
 * @param targetSize value applied as max split size, min split size, and min split size
 *          per node and per rack
 * @param totalSize not used by this method
 */
private void setupMapRedWork(HiveConf conf, MapWork mWork, long targetSize, long totalSize) {
  final long splitSize = targetSize;
  mWork.setMaxSplitSize(splitSize);
  mWork.setMinSplitSize(splitSize);
  mWork.setMinSplitSizePerNode(splitSize);
  mWork.setMinSplitSizePerRack(splitSize);
}
/**
 * Immutable pair of a total size and a file count.
 */
private static class AverageSize {
  private final long totalSize;
  private final int numFiles;

  /**
   * @param totalBytes the total size to hold
   * @param fileCount the number of files to hold
   */
  public AverageSize(long totalBytes, int fileCount) {
    totalSize = totalBytes;
    numFiles = fileCount;
  }

  /** @return the total size given at construction */
  public long getTotalSize() {
    return this.totalSize;
  }

  /** @return the file count given at construction */
  public int getNumFiles() {
    return this.numFiles;
  }
}
/**
 * Recursively sums the length and count of all files under a directory.
 *
 * @param inpFs file system to list
 * @param dirPath directory to walk
 * @return the total size and file count, or a (-1, -1) pair if listing this directory or
 *         any sub-directory fails with an IOException
 */
private AverageSize getAverageSize(FileSystem inpFs, Path dirPath) {
  final AverageSize failure = new AverageSize(-1, -1);
  long bytes = 0;
  int files = 0;
  try {
    for (FileStatus entry : inpFs.listStatus(dirPath)) {
      if (!entry.isDir()) {
        // plain file: count it directly
        bytes += entry.getLen();
        files++;
        continue;
      }
      // sub-directory: fold in its totals, giving up if it could not be read
      AverageSize nested = getAverageSize(inpFs, entry.getPath());
      if (nested.getTotalSize() < 0) {
        return failure;
      }
      bytes += nested.getTotalSize();
      files += nested.getNumFiles();
    }
  } catch (IOException e) {
    return failure;
  }
  return new AverageSize(bytes, files);
}
/**
 * Decides whether the files inside a directory should be merged, given the threshold of
 * the average file size.
 *
 * @param inpFs input file system.
 * @param dirPath input file directory.
 * @param avgSize threshold of average file size.
 * @return the total size of the files if they should be merged, i.e. there is more than
 *         one file and the average file size is below avgSize. Returns -1 if no merge is
 *         needed: the total size is zero or could not be computed, there is at most one
 *         file, or the average size is not below avgSize. A merge result is therefore
 *         always positive.
 */
private long getMergeSize(FileSystem inpFs, Path dirPath, long avgSize) {
  AverageSize stats = getAverageSize(inpFs, dirPath);
  long totalBytes = stats.getTotalSize();
  int fileCount = stats.getNumFiles();

  // Short-circuit keeps the division below from running with a zero file count.
  boolean nothingToMerge = totalBytes <= 0 || fileCount <= 1;
  if (nothingToMerge || totalBytes / fileCount >= avgSize) {
    return -1;
  }
  return totalBytes;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy