/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hudi.hadoop;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodiePartitionMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.HoodieTimeline;
import org.apache.hudi.common.table.TableFileSystemView.BaseFileOnlyView;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/**
 * HoodieInputFormat which understands the Hoodie File Structure and filters files based on the Hoodie Mode. Paths
 * that do not correspond to a hoodie table are passed through as-is (as FileInputFormat.listStatus() would return
 * them). The JobConf may contain paths from multiple Hoodie and non-Hoodie tables.
 */
@UseFileSplitsFromInputFormat
public class HoodieParquetInputFormat extends MapredParquetInputFormat implements Configurable {

  private static final Logger LOG = LogManager.getLogger(HoodieParquetInputFormat.class);

  protected Configuration conf;
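
  // Note: whether a table is queried in snapshot or incremental mode is driven by job/session
  // properties read through HoodieHiveUtil. The property names below are assumptions based on
  // HoodieHiveUtil's patterns and should be verified against the Hudi version in use, e.g.:
  //   set hoodie.<table_name>.consume.mode=INCREMENTAL;
  //   set hoodie.<table_name>.consume.start.timestamp=<commit_ts>;
  //   set hoodie.<table_name>.consume.max.commits=<n>;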
  @Override
  public FileStatus[] listStatus(JobConf job) throws IOException {
    // Segregate inputPaths[] to incremental, snapshot and non hoodie paths
    List<String> incrementalTables = HoodieHiveUtil.getIncrementalTableNames(Job.getInstance(job));
    InputPathHandler inputPathHandler = new InputPathHandler(conf, getInputPaths(job), incrementalTables);
    List<FileStatus> returns = new ArrayList<>();

    Map<String, HoodieTableMetaClient> tableMetaClientMap = inputPathHandler.getTableMetaClientMap();
    // process incremental pulls first
    for (String table : incrementalTables) {
      HoodieTableMetaClient metaClient = tableMetaClientMap.get(table);
      if (metaClient == null) {
        /* This can happen when the INCREMENTAL mode is set for a table but there were no InputPaths
         * in the jobConf
         */
        continue;
      }
      List<Path> inputPaths = inputPathHandler.getGroupedIncrementalPaths().get(metaClient);
      List<FileStatus> result = listStatusForIncrementalMode(job, metaClient, inputPaths);
      if (result != null) {
        returns.addAll(result);
      }
    }

    // process non hoodie Paths next.
    List<Path> nonHoodiePaths = inputPathHandler.getNonHoodieInputPaths();
    if (nonHoodiePaths.size() > 0) {
      setInputPaths(job, nonHoodiePaths.toArray(new Path[nonHoodiePaths.size()]));
      FileStatus[] fileStatuses = super.listStatus(job);
      returns.addAll(Arrays.asList(fileStatuses));
    }

    // process snapshot queries next.
    List<Path> snapshotPaths = inputPathHandler.getSnapshotPaths();
    if (snapshotPaths.size() > 0) {
      setInputPaths(job, snapshotPaths.toArray(new Path[snapshotPaths.size()]));
      FileStatus[] fileStatuses = super.listStatus(job);
      Map<HoodieTableMetaClient, List<FileStatus>> groupedFileStatus =
          groupFileStatusForSnapshotPaths(fileStatuses, tableMetaClientMap.values());
      LOG.info("Found a total of " + groupedFileStatus.size() + " groups");
      for (Map.Entry<HoodieTableMetaClient, List<FileStatus>> entry : groupedFileStatus.entrySet()) {
        List<FileStatus> result = filterFileStatusForSnapshotMode(entry.getKey(), entry.getValue());
        if (result != null) {
          returns.addAll(result);
        }
      }
    }
    return returns.toArray(new FileStatus[returns.size()]);
  }

  /**
   * Achieves listStatus functionality for an incrementally queried table. Instead of listing all
   * partitions and then filtering based on the commits of interest, this logic first extracts the
   * partitions touched by the desired commits and then lists only those partitions.
   */
  private List<FileStatus> listStatusForIncrementalMode(
      JobConf job, HoodieTableMetaClient tableMetaClient, List<Path> inputPaths) throws IOException {
    String tableName = tableMetaClient.getTableConfig().getTableName();
    HoodieTimeline timeline = tableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    String lastIncrementalTs = HoodieHiveUtil.readStartCommitTime(Job.getInstance(job), tableName);
    // Total number of commits to return in this batch. Set this to -1 to get all the commits.
    Integer maxCommits = HoodieHiveUtil.readMaxCommits(Job.getInstance(job), tableName);
    LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs);
    List<HoodieInstant> commitsToCheck = timeline.findInstantsAfter(lastIncrementalTs, maxCommits)
        .getInstants().collect(Collectors.toList());
    // Extract partitions touched by the commitsToCheck
    Set<String> partitionsToList = new HashSet<>();
    for (HoodieInstant commit : commitsToCheck) {
      HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(),
          HoodieCommitMetadata.class);
      partitionsToList.addAll(commitMetadata.getPartitionToWriteStats().keySet());
    }
    if (partitionsToList.isEmpty()) {
      return null;
    }
    String incrementalInputPaths = partitionsToList.stream()
        .map(s -> tableMetaClient.getBasePath() + Path.SEPARATOR + s)
        .filter(s -> {
          /*
           * Ensure to return only results from the original input paths that have incremental changes.
           * This check is needed for the following corner case - When the caller invokes
           * HoodieInputFormat.listStatus multiple times (with small batches of Hive partitions each
           * time, e.g. a Hive fetch task calls listStatus for every partition once), we do not want to
           * accidentally return all incremental changes for the entire table in every listStatus()
           * call. This would create redundant splits. Instead we only want to return the incremental
           * changes (if any) in that batch of input paths.
           *
           * NOTE on Hive queries that are executed using Fetch task:
           * Since Fetch tasks invoke InputFormat.listStatus() per partition, Hoodie metadata can be
           * listed in every such listStatus() call. In order to avoid this, it might be useful to
           * disable fetch tasks using the hive session property for incremental queries:
           * `set hive.fetch.task.conversion=none;`
           * This would ensure Map Reduce execution is chosen for a Hive query, which combines
           * partitions (comma separated) and calls InputFormat.listStatus() only once with all
           * those partitions.
           */
          for (Path path : inputPaths) {
            if (path.toString().contains(s)) {
              return true;
            }
          }
          return false;
        })
        .collect(Collectors.joining(","));
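    // Example of a resulting value (hypothetical paths): incrementalInputPaths =
    //   "hdfs://nn/warehouse/db/tbl/2020/04/01,hdfs://nn/warehouse/db/tbl/2020/04/02"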
    if (StringUtils.isNullOrEmpty(incrementalInputPaths)) {
      return null;
    }
    // Mutate the JobConf to set the input paths to only partitions touched by incremental pull.
    setInputPaths(job, incrementalInputPaths);
    FileStatus[] fileStatuses = super.listStatus(job);
    BaseFileOnlyView roView = new HoodieTableFileSystemView(tableMetaClient, timeline, fileStatuses);
    List<String> commitsList = commitsToCheck.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
    List<HoodieBaseFile> filteredFiles = roView.getLatestBaseFilesInRange(commitsList).collect(Collectors.toList());
    List<FileStatus> returns = new ArrayList<>();
    for (HoodieBaseFile filteredFile : filteredFiles) {
      LOG.debug("Processing incremental hoodie file - " + filteredFile.getPath());
      filteredFile = checkFileStatus(filteredFile);
      returns.add(filteredFile.getFileStatus());
    }
    LOG.info("Total paths to process after hoodie incremental filter " + filteredFiles.size());
    return returns;
  }

  /**
   * Takes in a list of file statuses and a collection of table meta clients, and groups the file
   * statuses by the table whose base path contains them.
   *
   * @param fileStatuses file statuses returned from listing the snapshot input paths
   * @param metaClientList meta clients of all Hoodie tables present in the input paths
   * @return file statuses grouped by the table (meta client) they belong to
   */
  private Map<HoodieTableMetaClient, List<FileStatus>> groupFileStatusForSnapshotPaths(
      FileStatus[] fileStatuses, Collection<HoodieTableMetaClient> metaClientList) {
    // This assumes the paths for different tables are grouped together
    Map<HoodieTableMetaClient, List<FileStatus>> grouped = new HashMap<>();
    HoodieTableMetaClient metadata = null;
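    // 'metadata' caches the meta client matched for the previous status; since statuses for the
    // same table are assumed to arrive contiguously, this avoids re-scanning metaClientList for
    // every file.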
    for (FileStatus status : fileStatuses) {
      Path inputPath = status.getPath();
      if (!inputPath.getName().endsWith(".parquet")) {
        // FIXME(vc): skip non parquet files for now. This won't be needed once log file names start
        // with "."
        continue;
      }
      if ((metadata == null) || (!inputPath.toString().contains(metadata.getBasePath()))) {
        for (HoodieTableMetaClient metaClient : metaClientList) {
          if (inputPath.toString().contains(metaClient.getBasePath())) {
            metadata = metaClient;
            if (!grouped.containsKey(metadata)) {
              grouped.put(metadata, new ArrayList<>());
            }
            break;
          }
        }
      }
      grouped.get(metadata).add(status);
    }
    return grouped;
  }

  /**
   * Filters data files for a snapshot queried table.
   */
  private List<FileStatus> filterFileStatusForSnapshotMode(
      HoodieTableMetaClient metadata, List<FileStatus> fileStatuses) {
    FileStatus[] statuses = fileStatuses.toArray(new FileStatus[0]);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Hoodie Metadata initialized with completed commit Ts as :" + metadata);
    }
    // Get all commits, delta commits, compactions, as all of them produce a base parquet file today
    HoodieTimeline timeline = metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    BaseFileOnlyView roView = new HoodieTableFileSystemView(metadata, timeline, statuses);
    // filter files on the latest commit found
    List<HoodieBaseFile> filteredFiles = roView.getLatestBaseFiles().collect(Collectors.toList());
    LOG.info("Total paths to process after hoodie filter " + filteredFiles.size());
    List<FileStatus> returns = new ArrayList<>();
    for (HoodieBaseFile filteredFile : filteredFiles) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Processing latest hoodie file - " + filteredFile.getPath());
      }
      filteredFile = checkFileStatus(filteredFile);
      returns.add(filteredFile.getFileStatus());
    }
    return returns;
  }

  /**
   * Checks the file status for a race condition which can set the file size to 0.
   * 1. HiveInputFormat does super.listStatus() and gets back a FileStatus[]
   * 2. Then it creates the HoodieTableMetaClient for the paths listed.
   * 3. Generation of splits looks at FileStatus size to create splits, which skips this file
   */
  private HoodieBaseFile checkFileStatus(HoodieBaseFile dataFile) {
    Path dataPath = dataFile.getFileStatus().getPath();
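    // A zero-length FileStatus indicates the listing raced with the write described above;
    // re-resolve the status from the FileSystem to pick up the final file size.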
    try {
      if (dataFile.getFileSize() == 0) {
        FileSystem fs = dataPath.getFileSystem(conf);
        LOG.info("Refreshing file status " + dataFile.getPath());
        return new HoodieBaseFile(fs.getFileStatus(dataPath));
      }
      return dataFile;
    } catch (IOException e) {
      throw new HoodieIOException("Could not get FileStatus on path " + dataPath, e);
    }
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf job,
      final Reporter reporter) throws IOException {
    // TODO enable automatic predicate pushdown after fixing issues
    // FileSplit fileSplit = (FileSplit) split;
    // HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent());
    // String tableName = metadata.getTableName();
    // String mode = HoodieHiveUtil.readMode(job, tableName);
    // if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) {
    //   FilterPredicate predicate = constructHoodiePredicate(job, tableName, split);
    //   LOG.info("Setting parquet predicate push down as " + predicate);
    //   ParquetInputFormat.setFilterPredicate(job, predicate);
    //   clearOutExistingPredicate(job);
    // }
    return super.getRecordReader(split, job, reporter);
  }

  /**
   * Read the table metadata from a data path. This assumes a certain hierarchy of files, which should be changed
   * once a better way is figured out to pass in the hoodie meta directory.
   */
  protected static HoodieTableMetaClient getTableMetaClient(FileSystem fs, Path dataPath) throws IOException {
    int levels = HoodieHiveUtil.DEFAULT_LEVELS_TO_BASEPATH;
    if (HoodiePartitionMetadata.hasPartitionMetadata(fs, dataPath)) {
      HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, dataPath);
      metadata.readFromFS();
      levels = metadata.getPartitionDepth();
    }
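    // Illustrative layout (hypothetical, not mandated by Hudi): for dataPath = <base>/2020/04/01
    // with a partition depth of 3, getNthParent walks three levels up and resolves baseDir to <base>.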
    Path baseDir = HoodieHiveUtil.getNthParent(dataPath, levels);
    LOG.info("Reading hoodie metadata from path " + baseDir.toString());
    return new HoodieTableMetaClient(fs.getConf(), baseDir.toString());
  }
}