/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hadoop.hive;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.TablePathUtils;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.hadoop.HoodieParquetInputFormat;
import org.apache.hudi.hadoop.HoodieParquetInputFormatBase;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.hadoop.realtime.HoodieCombineRealtimeRecordReader;
import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.common.StringInternUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.CombineHiveRecordReader;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.IOContextMap;
import org.apache.hadoop.hive.ql.io.IOPrepareCache;
import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.parse.SplitSample;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim;
import org.apache.hadoop.hive.shims.HadoopShimsSecure;
import org.apache.hadoop.hive.shims.HadoopShimsSecure.InputSplitShim;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.CombineFileInputFormat;
import org.apache.hadoop.mapred.lib.CombineFileSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
/**
 * This is just a copy of the org.apache.hadoop.hive.ql.io.CombineHiveInputFormat from Hive 2.x. Search for **MOD** to
 * see the minor modifications made to support custom input formats in CombineHiveInputFormat. See
 * https://issues.apache.org/jira/browse/HIVE-9771
 *
 * CombineHiveInputFormat is a parameterized InputFormat which looks at the path name and determines the correct
 * InputFormat for that path name from mapredPlan.pathToPartitionInfo(). It can be used to read files with different
 * input formats in the same map-reduce job.
 *
 * NOTE: This class is implemented to work with Hive 2.x+.
 */
public class HoodieCombineHiveInputFormat<K extends WritableComparable, V extends Writable>
extends HiveInputFormat<K, V> {
private static final String CLASS_NAME = HoodieCombineHiveInputFormat.class.getName();
public static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME);
// max number of threads we can use to check non-combinable paths
private static final int MAX_CHECK_NONCOMBINABLE_THREAD_NUM = 50;
private static final int DEFAULT_NUM_PATH_PER_THREAD = 100;
public static final String INTERNAL_SCHEMA_CACHE_KEY_PREFIX = "hudi.hive.internal.schema.cache.key.prefix";
public static final String SCHEMA_CACHE_KEY_PREFIX = "hudi.hive.schema.cache.key.prefix";
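// Illustrative usage: Hive is typically pointed at this class through the session-level
// hive.input.format property, e.g.
//   set hive.input.format=org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat;
// The exact configuration required depends on the Hive and Hudi versions in use.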
protected String getParquetInputFormatClassName() {
return HoodieParquetInputFormat.class.getName();
}
protected String getParquetRealtimeInputFormatClassName() {
return HoodieParquetRealtimeInputFormat.class.getName();
}
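// The class-name getters above and the shim factory below are protected extension points: subclasses
// can override them to plug in alternate Hudi input formats without duplicating the split-combination
// logic in getCombineSplits().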
protected HoodieCombineFileInputFormatShim createInputFormatShim() {
return new HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim<>();
}
/**
* Create Hive splits based on CombineFileSplit.
*/
private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map<Path, PartitionDesc> pathToPartitionInfo)
throws IOException {
init(job);
Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
/* MOD - Initialize a custom combine input format shim that will call listStatus on the custom inputFormat **/
HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim combine = createInputFormatShim();
InputSplit[] splits;
if (combine.getInputPathsShim(job).length == 0) {
throw new IOException("No input paths specified in job");
}
List<InputSplit> result = new ArrayList<>();
// combine splits only from same tables and same partitions. Do not combine splits from multiple
// tables or multiple partitions.
Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
List<Path> inpDirs = new ArrayList<>();
List<Path> inpFiles = new ArrayList<>();
Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<>();
Set<Path> poolSet = new HashSet<>();
for (Path path : paths) {
PartitionDesc part = getPartitionFromPath(pathToPartitionInfo, path,
IOPrepareCache.get().allocatePartitionDescMap());
TableDesc tableDesc = part.getTableDesc();
if ((tableDesc != null) && tableDesc.isNonNative()) {
return super.getSplits(job, numSplits);
}
// Use HiveInputFormat if any of the paths is not splittable
Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
String inputFormatClassName = inputFormatClass.getName();
InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
LOG.info("Input Format => " + inputFormatClass.getName());
// **MOD** Set the hoodie filter in the combine
if (inputFormatClass.getName().equals(getParquetInputFormatClassName())) {
combine.setHoodieFilter(true);
} else if (inputFormatClass.getName().equals(getParquetRealtimeInputFormatClassName())) {
LOG.info("Setting hoodie filter and realtime input format");
combine.setHoodieFilter(true);
combine.setRealTime(true);
if (job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "").isEmpty()) {
List<String> partitions = new ArrayList<>(part.getPartSpec().keySet());
if (!partitions.isEmpty()) {
String partitionStr = String.join("/", partitions);
LOG.info("Setting Partitions in jobConf - Partition Keys for Path : " + path + " is :" + partitionStr);
job.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, partitionStr);
} else {
job.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "");
}
}
}
String deserializerClassName = null;
try {
deserializerClassName = part.getDeserializer(job).getClass().getName();
} catch (Exception e) {
// ignore
LOG.error("Getting deserializer class name error ", e);
}
// don't combine if inputformat is a SymlinkTextInputFormat
if (inputFormat instanceof SymlinkTextInputFormat) {
splits = super.getSplits(job, numSplits);
return splits;
}
Path filterPath = path;
// Does a pool exist for this path already
CombineFilter f;
List<Operator<? extends OperatorDesc>> opList;
if (!mrwork.isMapperCannotSpanPartns()) {
// if mapper can span partitions, make sure a split does not contain multiple
// opList + inputFormatClassName + deserializerClassName combinations.
// This is done using the Map of CombinePathInputFormat to PathFilter
opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
CombinePathInputFormat combinePathInputFormat =
new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
f = poolMap.get(combinePathInputFormat);
if (f == null) {
f = new CombineFilter(filterPath);
LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
combine.createPool(job, f);
poolMap.put(combinePathInputFormat, f);
} else {
LOG.info("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
f.addPath(filterPath);
}
} else {
// In the case of tablesample, the input paths are pointing to files rather than directories.
// We need to get the parent directory as the filtering path so that all files in the same
// parent directory will be grouped into one pool but not files from different parent
// directories. This guarantees that a split will combine all files in the same partition
// but won't cross multiple partitions if the user has asked so.
if (!path.getFileSystem(job).getFileStatus(path).isDirectory()) { // path is not directory
filterPath = path.getParent();
inpFiles.add(path);
poolSet.add(filterPath);
} else {
inpDirs.add(path);
}
}
}
// Processing directories
List<CombineFileSplit> iss = new ArrayList<>();
if (!mrwork.isMapperCannotSpanPartns()) {
// mapper can span partitions
// combine into as few as one split, subject to the PathFilters set
// using combine.createPool.
iss = Arrays.asList(combine.getSplits(job, 1));
} else {
for (Path path : inpDirs) {
processPaths(job, combine, iss, path);
}
if (inpFiles.size() > 0) {
// Processing files
for (Path filterPath : poolSet) {
combine.createPool(job, new CombineFilter(filterPath));
}
processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
}
}
if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
iss = sampleSplits(iss);
}
for (CombineFileSplit is : iss) {
final InputSplit csplit;
if (combine.isRealTime) {
if (is instanceof HoodieCombineRealtimeHiveSplit) {
csplit = is;
} else {
csplit = new HoodieCombineRealtimeHiveSplit(job, is, pathToPartitionInfo);
}
} else {
csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
}
result.add(csplit);
}
LOG.info("number of splits " + result.size());
return result.toArray(new CombineHiveInputSplit[result.size()]);
}
/**
* Gets all the path indices that should not be combined.
*/
public Set<Integer> getNonCombinablePathIndices(JobConf job, Path[] paths, int numThreads)
throws ExecutionException, InterruptedException {
LOG.info("Total number of paths: " + paths.length + ", launching " + numThreads
+ " threads to check non-combinable ones.");
int numPathPerThread = (int) Math.ceil((double) paths.length / numThreads);
ExecutorService executor = Executors.newFixedThreadPool(numThreads);
List<Future<Set<Integer>>> futureList = new ArrayList<>(numThreads);
try {
for (int i = 0; i < numThreads; i++) {
int start = i * numPathPerThread;
int length = i != numThreads - 1 ? numPathPerThread : paths.length - start;
futureList.add(executor.submit(new CheckNonCombinablePathCallable(paths, start, length, job)));
}
Set<Integer> nonCombinablePathIndices = new HashSet<>();
for (Future<Set<Integer>> future : futureList) {
nonCombinablePathIndices.addAll(future.get());
}
return nonCombinablePathIndices;
} finally {
executor.shutdownNow();
}
}
/**
* Create Hive splits based on CombineFileSplit.
*/
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
init(job);
List<InputSplit> result = new ArrayList<>();
Path[] paths = getInputPaths(job);
List<Path> nonCombinablePaths = new ArrayList<>(paths.length / 2);
List<Path> combinablePaths = new ArrayList<>(paths.length / 2);
int numThreads = Math.min(MAX_CHECK_NONCOMBINABLE_THREAD_NUM,
(int) Math.ceil((double) paths.length / DEFAULT_NUM_PATH_PER_THREAD));
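// Example sizing: with DEFAULT_NUM_PATH_PER_THREAD = 100, 250 input paths yield ceil(250 / 100) = 3
// checker threads, and very large path counts are capped at MAX_CHECK_NONCOMBINABLE_THREAD_NUM (50).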
// This check is necessary because for Spark branch, the result array from
// getInputPaths() above could be empty, and therefore numThreads could be 0.
// In that case, Executors.newFixedThreadPool will fail.
if (numThreads > 0) {
try {
Set<Integer> nonCombinablePathIndices = getNonCombinablePathIndices(job, paths, numThreads);
for (int i = 0; i < paths.length; i++) {
if (nonCombinablePathIndices.contains(i)) {
nonCombinablePaths.add(paths[i]);
} else {
combinablePaths.add(paths[i]);
}
}
} catch (Exception e) {
LOG.error("Error checking non-combinable path", e);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
throw new IOException(e);
}
}
// Store the previous value for the path specification
String oldPaths = job.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
if (LOG.isDebugEnabled()) {
LOG.debug("The received input paths are: [" + oldPaths + "] against the property "
+ org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR);
}
// Process the normal splits
if (nonCombinablePaths.size() > 0) {
FileInputFormat.setInputPaths(job, nonCombinablePaths.toArray(new Path[0]));
InputSplit[] splits = super.getSplits(job, numSplits);
Collections.addAll(result, splits);
}
// Process the combine splits
if (combinablePaths.size() > 0) {
FileInputFormat.setInputPaths(job, combinablePaths.toArray(new Path[0]));
Map<Path, PartitionDesc> pathToPartitionInfo = this.pathToPartitionInfo != null ? this.pathToPartitionInfo
: Utilities.getMapWork(job).getPathToPartitionInfo();
InputSplit[] splits = getCombineSplits(job, numSplits, pathToPartitionInfo);
Collections.addAll(result, splits);
}
// Restore the old path information back
// This is just to prevent incompatibilities with previous versions of Hive
// if some application depends on the original value being set.
if (oldPaths != null) {
job.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, oldPaths);
}
// Clear work from the ThreadLocal after splits are generated, in case the thread is reused in a pool.
Utilities.clearWorkMapForConf(job);
// build internal schema for the query
if (!result.isEmpty()) {
ArrayList<String> uniqTablePaths = new ArrayList<>();
Arrays.stream(paths).forEach(path -> {
final HoodieStorage storage;
try {
storage = new HoodieHadoopStorage(path.getFileSystem(job));
Option<StoragePath> tablePath = TablePathUtils.getTablePath(storage, HadoopFSUtils.convertToStoragePath(path));
if (tablePath.isPresent()) {
uniqTablePaths.add(tablePath.get().toUri().toString());
}
} catch (IOException e) {
throw new RuntimeException(e);
}
});
try {
for (String path : uniqTablePaths) {
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(path).setConf(new HadoopStorageConfiguration(job)).build();
TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient);
String avroSchema = schemaUtil.getTableAvroSchema().toString();
Option<InternalSchema> internalSchema = schemaUtil.getTableInternalSchemaFromCommitMetadata();
if (internalSchema.isPresent()) {
LOG.info("Set internal and avro schema cache with path: " + path);
job.set(SCHEMA_CACHE_KEY_PREFIX + "." + path, avroSchema);
job.set(INTERNAL_SCHEMA_CACHE_KEY_PREFIX + "." + path, SerDeHelper.toJson(internalSchema.get()));
} else {
// Always set up the cache so that we can distinguish this from the scenario where the cache was never set (e.g. in tests).
job.set(SCHEMA_CACHE_KEY_PREFIX + "." + path, "");
job.set(INTERNAL_SCHEMA_CACHE_KEY_PREFIX + "." + path, "");
}
}
} catch (Exception e) {
LOG.warn("Fail to set schema cache", e);
}
}
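// The schema entries cached above are keyed by table base path; a downstream reader could, for
// example, retrieve them with job.get(SCHEMA_CACHE_KEY_PREFIX + "." + tableBasePath), where
// tableBasePath is one of the values collected in uniqTablePaths (illustrative lookup only).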
LOG.info("Number of all splits " + result.size());
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
return result.toArray(new InputSplit[result.size()]);
}
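/**
 * Sets the given paths on a copy of the JobConf and adds the combine splits produced by the shim
 * to the provided list.
 */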
private void processPaths(JobConf job, CombineFileInputFormatShim combine, List<CombineFileSplit> iss, Path... path)
throws IOException {
JobConf currJob = new JobConf(job);
FileInputFormat.setInputPaths(currJob, path);
iss.addAll(Arrays.asList(combine.getSplits(currJob, 1)));
}
/**
* HiveFileFormatUtils.getPartitionDescFromPathRecursively is no longer available since Hive 3.
* This method keeps the lookup compatible with both Hive 2 and Hive 3.
*
* @param pathToPartitionInfo map from input path to its {@link PartitionDesc}
* @param dir input path to resolve
* @param cacheMap cache of previously resolved path-to-partition lookups
* @return the {@link PartitionDesc} matching the given path
* @throws IOException if the partition description cannot be resolved
*/
private static PartitionDesc getPartitionFromPath(Map<Path, PartitionDesc> pathToPartitionInfo, Path dir,
Map