/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hadoop;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieTableQueryType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.FileSystemViewManager;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.hadoop.utils.HoodieHiveUtils;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nonnull;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
import static org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE;
/**
* Base implementation of the Hive's {@link FileInputFormat} allowing for reading of Hudi's
* Copy-on-Write (COW) tables in various configurations:
*
* <ul>
*   <li>Snapshot mode: reading the table's state as of a particular timestamp (or instant, in Hudi's terms)</li>
*   <li>Incremental mode: reading only the records written between two timestamps (instants)</li>
*   <li>External mode: reading non-Hudi partitions</li>
* </ul>
*
* NOTE: This class is invariant of the underlying file-format of the files being read
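*
* <p>A minimal usage sketch (illustrative only; the job wiring and example path below are
* assumptions, not something prescribed by this class):
* <pre>{@code
*   JobConf jobConf = new JobConf();
*   // point the job at one or more table/partition paths
*   FileInputFormat.setInputPaths(jobConf, new Path("/tmp/hudi/my_table/2024-01-01"));
*
*   HoodieCopyOnWriteTableInputFormat inputFormat = new HoodieCopyOnWriteTableInputFormat();
*   inputFormat.setConf(jobConf); // assumes the Configurable wiring of the base input format
*   FileStatus[] latestBaseFiles = inputFormat.listStatus(jobConf);
* }</pre>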
*/
public class HoodieCopyOnWriteTableInputFormat extends HoodieTableInputFormat {
private static final Logger LOG = LoggerFactory.getLogger(HoodieCopyOnWriteTableInputFormat.class);
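/**
* Bootstrap skeleton files are never split: a skeleton file and its external bootstrap source file
* are read side by side, so both have to be consumed in full to keep their rows positionally aligned.
*/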
@Override
protected boolean isSplitable(FileSystem fs, Path filename) {
return !(filename instanceof PathWithBootstrapFileStatus);
}
@Override
protected FileSplit makeSplit(Path file, long start, long length,
String[] hosts) {
FileSplit split = new FileSplit(file, start, length, hosts);
if (file instanceof PathWithBootstrapFileStatus) {
return makeExternalFileSplit((PathWithBootstrapFileStatus)file, split);
}
return split;
}
@Override
protected FileSplit makeSplit(Path file, long start, long length,
String[] hosts, String[] inMemoryHosts) {
FileSplit split = new FileSplit(file, start, length, hosts, inMemoryHosts);
if (file instanceof PathWithBootstrapFileStatus) {
return makeExternalFileSplit((PathWithBootstrapFileStatus)file, split);
}
return split;
}
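/**
* Splits the job's input paths into incremental, snapshot and non-Hudi groups and lists each group
* with the corresponding strategy.
*
* <p>Illustrative only: incremental consumption of a table is requested through per-table job
* properties (names resolved via {@link HoodieHiveUtils}; the values below are made-up examples), e.g.
* <pre>{@code
*   jobConf.set("hoodie.my_table.consume.mode", "INCREMENTAL");
*   jobConf.set("hoodie.my_table.consume.start.timestamp", "20240101000000000");
*   jobConf.set("hoodie.my_table.consume.max.commits", "10");
* }</pre>
*/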
@Override
public FileStatus[] listStatus(JobConf job) throws IOException {
// Segregate inputPaths[] into incremental, snapshot and non-Hudi paths
List<String> incrementalTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(job));
InputPathHandler inputPathHandler = new InputPathHandler(conf, getInputPaths(job), incrementalTables);
List<FileStatus> returns = new ArrayList<>();
Map<String, HoodieTableMetaClient> tableMetaClientMap = inputPathHandler.getTableMetaClientMap();
// process incremental pulls first
for (String table : incrementalTables) {
HoodieTableMetaClient metaClient = tableMetaClientMap.get(table);
if (metaClient == null) {
/* This can happen when the INCREMENTAL mode is set for a table but there were no InputPaths
* in the jobConf
*/
continue;
}
List<Path> inputPaths = inputPathHandler.getGroupedIncrementalPaths().get(metaClient);
List<FileStatus> result = listStatusForIncrementalMode(job, metaClient, inputPaths, table);
if (result != null) {
returns.addAll(result);
}
}
// process non hoodie Paths next.
List<Path> nonHoodiePaths = inputPathHandler.getNonHoodieInputPaths();
if (nonHoodiePaths.size() > 0) {
setInputPaths(job, nonHoodiePaths.toArray(new Path[nonHoodiePaths.size()]));
FileStatus[] fileStatuses = listStatusForNonHoodiePaths(job);
returns.addAll(Arrays.asList(fileStatuses));
}
// process snapshot queries next.
List<Path> snapshotPaths = inputPathHandler.getSnapshotPaths();
if (snapshotPaths.size() > 0) {
returns.addAll(listStatusForSnapshotMode(job, tableMetaClientMap, snapshotPaths));
}
return returns.toArray(new FileStatus[0]);
}
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
throw new UnsupportedOperationException("not implemented");
}
/**
* Abstracts and exposes {@link FileInputFormat#listStatus(JobConf)} operation to subclasses that
* lists files (returning an array of {@link FileStatus}) corresponding to the input paths specified
* as part of provided {@link JobConf}
*/
protected final FileStatus[] doListStatus(JobConf job) throws IOException {
return super.listStatus(job);
}
/**
* Lists files for input paths that do not belong to any Hudi table.
*
* @param job the job configuration holding the non-Hudi input paths
* @return the file statuses produced by the default {@link FileInputFormat} listing
* @throws IOException if the underlying file listing fails
*/
public FileStatus[] listStatusForNonHoodiePaths(JobConf job) throws IOException {
return doListStatus(job);
}
/**
* Achieves listStatus functionality for an incrementally queried table. Instead of listing all
* partitions and then filtering based on the commits of interest, this logic first extracts the
* partitions touched by the desired commits and then lists only those partitions.
*/
protected List<FileStatus> listStatusForIncrementalMode(JobConf job,
HoodieTableMetaClient tableMetaClient,
List<Path> inputPaths,
String incrementalTable) throws IOException {
Job jobContext = Job.getInstance(job);
Option<HoodieTimeline> timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient);
if (!timeline.isPresent()) {
return null;
}
Option<List<HoodieInstant>> commitsToCheck = HoodieInputFormatUtils.getCommitsForIncrementalQuery(jobContext, incrementalTable, timeline.get());
if (!commitsToCheck.isPresent()) {
return null;
}
Option<String> incrementalInputPaths = HoodieInputFormatUtils.getAffectedPartitions(commitsToCheck.get(), tableMetaClient, timeline.get(), inputPaths);
// Mutate the JobConf to set the input paths to only partitions touched by incremental pull.
if (!incrementalInputPaths.isPresent()) {
return null;
}
setInputPaths(job, incrementalInputPaths.get());
FileStatus[] fileStatuses = doListStatus(job);
return HoodieInputFormatUtils.filterIncrementalFileStatus(jobContext, tableMetaClient, timeline.get(), fileStatuses, commitsToCheck.get());
}
protected FileStatus createFileStatusUnchecked(FileSlice fileSlice, Option<HoodieInstant> latestCompletedInstantOpt,
String tableBasePath, HoodieTableMetaClient metaClient) {
Option<HoodieBaseFile> baseFileOpt = fileSlice.getBaseFile();
if (baseFileOpt.isPresent()) {
return getFileStatusUnchecked(baseFileOpt.get());
} else {
throw new IllegalStateException("Invalid state: base-file has to be present");
}
}
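// Pairs the skeleton-file split with a split spanning the entire bootstrap source (external) file,
// so a single record reader can stitch the Hudi metadata columns and the original data columns back together.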
private BootstrapBaseFileSplit makeExternalFileSplit(PathWithBootstrapFileStatus file, FileSplit split) {
try {
LOG.info("Making external data split for {}", file);
FileStatus externalFileStatus = file.getBootstrapFileStatus();
FileSplit externalFileSplit = makeSplit(externalFileStatus.getPath(), 0, externalFileStatus.getLen(),
new String[0], new String[0]);
return new BootstrapBaseFileSplit(split, externalFileSplit);
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
}
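/**
* Lists the latest base files visible to a snapshot query. When the metadata table's files partition
* is available (or {@code hoodie.metadata.enable} is set), listing goes through
* {@link HiveHoodieTableFileIndex}; otherwise it falls back to an in-memory
* {@link HoodieTableFileSystemView} built over the table's active timeline.
*/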
@Nonnull
private List<FileStatus> listStatusForSnapshotMode(JobConf job,
Map<String, HoodieTableMetaClient> tableMetaClientMap,
List<Path> snapshotPaths) {
HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(HadoopFSUtils.getStorageConf(job));
List<FileStatus> targetFiles = new ArrayList<>();
TypedProperties props = new TypedProperties(new Properties());
Map<HoodieTableMetaClient, List<Path>> groupedPaths =
HoodieInputFormatUtils.groupSnapshotPathsByMetaClient(tableMetaClientMap.values(), snapshotPaths);
for (Map.Entry<HoodieTableMetaClient, List<Path>> entry : groupedPaths.entrySet()) {
HoodieTableMetaClient tableMetaClient = entry.getKey();
List<Path> partitionPaths = entry.getValue();
// Hive job might specify a max commit instant up to which table's state
// should be examined. We simply pass it as query's instant to the file-index
Option<String> queryCommitInstant =
HoodieHiveUtils.getMaxCommit(job, tableMetaClient.getTableConfig().getTableName());
boolean shouldIncludePendingCommits =
HoodieHiveUtils.shouldIncludePendingCommits(job, tableMetaClient.getTableConfig().getTableName());
if (HoodieTableMetadataUtil.isFilesPartitionAvailable(tableMetaClient) || conf.getBoolean(ENABLE.key(), ENABLE.defaultValue())) {
HiveHoodieTableFileIndex fileIndex =
new HiveHoodieTableFileIndex(
engineContext,
tableMetaClient,
props,
HoodieTableQueryType.SNAPSHOT,
partitionPaths.stream().map(HadoopFSUtils::convertToStoragePath).collect(Collectors.toList()),
queryCommitInstant,
shouldIncludePendingCommits);
Map<String, List<FileSlice>> partitionedFileSlices = fileIndex.listFileSlices();
targetFiles.addAll(
partitionedFileSlices.values()
.stream()
.flatMap(Collection::stream)
.filter(fileSlice -> checkIfValidFileSlice(fileSlice))
.map(fileSlice -> createFileStatusUnchecked(fileSlice, fileIndex.getLatestCompletedInstant(),
fileIndex.getBasePath().toString(), tableMetaClient))
.collect(Collectors.toList())
);
} else {
// If hoodie.metadata.enable is set to false and the table doesn't have a metadata table,
// read the table through the fs view cache instead of the file index,
// because there is no file index for a table without the metadata table.
String basePath = tableMetaClient.getBasePath().toString();
Map<HoodieTableMetaClient, HoodieTableFileSystemView> fsViewCache = new HashMap<>();
HoodieTimeline timeline = getActiveTimeline(tableMetaClient, shouldIncludePendingCommits);
Option<String> queryInstant = queryCommitInstant.or(() -> timeline.lastInstant().map(HoodieInstant::getTimestamp));
validateInstant(timeline, queryInstant);
try {
HoodieTableFileSystemView fsView = fsViewCache.computeIfAbsent(tableMetaClient, hoodieTableMetaClient ->
FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, hoodieTableMetaClient,
HoodieInputFormatUtils.buildMetadataConfig(job), timeline));
List<FileSlice> filteredFileSlices = new ArrayList<>();
for (Path p : entry.getValue()) {
String relativePartitionPath = HadoopFSUtils.getRelativePartitionPath(new Path(basePath), p);
List<FileSlice> fileSlices = queryInstant.map(
instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relativePartitionPath, instant))
.orElseGet(() -> fsView.getLatestFileSlices(relativePartitionPath))
.collect(Collectors.toList());
filteredFileSlices.addAll(fileSlices);
}
targetFiles.addAll(
filteredFileSlices.stream()
.filter(fileSlice -> checkIfValidFileSlice(fileSlice))
.map(fileSlice -> createFileStatusUnchecked(fileSlice, timeline.filterCompletedInstants().lastInstant(),
basePath, tableMetaClient))
.collect(Collectors.toList()));
} finally {
fsViewCache.forEach(((metaClient, fsView) -> fsView.close()));
}
}
}
return targetFiles;
}
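/**
* Returns the commits-and-compaction timeline, narrowed to completed (plus pending compaction)
* instants unless the query explicitly asks to also see pending commits.
*/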
private static HoodieTimeline getActiveTimeline(HoodieTableMetaClient metaClient, boolean shouldIncludePendingCommits) {
HoodieTimeline timeline = metaClient.getCommitsAndCompactionTimeline();
if (shouldIncludePendingCommits) {
return timeline;
} else {
return timeline.filterCompletedAndCompactionInstants();
}
}
private static void validateInstant(HoodieTimeline activeTimeline, Option<String> queryInstant) {
if (queryInstant.isPresent() && !activeTimeline.containsInstant(queryInstant.get())) {
throw new HoodieIOException(String.format("Query instant (%s) not found in the timeline", queryInstant.get()));
}
}
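/**
* A file slice is readable by this COW input format only if it carries a base file. A slice with
* only log files (possible when a read-optimized query is run against an MOR table) is skipped,
* while a slice with neither base nor log files is treated as an invalid state.
*/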
protected boolean checkIfValidFileSlice(FileSlice fileSlice) {
Option<HoodieBaseFile> baseFileOpt = fileSlice.getBaseFile();
Option<HoodieLogFile> latestLogFileOpt = fileSlice.getLatestLogFile();
if (baseFileOpt.isPresent()) {
return true;
} else if (latestLogFileOpt.isPresent()) {
// This happens when a read-optimized query is run against an MOR table and the slice only has log files.
return false;
} else {
throw new IllegalStateException("Invalid state: base-file has to be present for " + fileSlice.getFileId());
}
}
@Nonnull
protected static FileStatus getFileStatusUnchecked(HoodieBaseFile baseFile) {
try {
return HoodieInputFormatUtils.getFileStatus(baseFile);
} catch (IOException ioe) {
throw new HoodieIOException("Failed to get file-status", ioe);
}
}
}