
org.apache.hudi.hadoop.HoodieCopyOnWriteTableInputFormat

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.hadoop;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieTableQueryType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.FileSystemViewManager;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.hadoop.utils.HoodieHiveUtils;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.metadata.HoodieTableMetadataUtil;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;

import static org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE;

/**
 * Base implementation of Hive's {@link FileInputFormat} allowing for reading of Hudi's
 * Copy-on-Write (COW) tables in various configurations:
 *
 * <ul>
 *   <li>Snapshot mode: reading the table's state as of a particular timestamp (or instant, in Hudi's terms)</li>
 *   <li>Incremental mode: reading only the files affected by commits after a particular timestamp (or instant, in Hudi's terms)</li>
 *   <li>External mode: reading non-Hudi partitions</li>
 * </ul>
 *
 * NOTE: This class is invariant of the underlying file-format of the files being read
 */
public class HoodieCopyOnWriteTableInputFormat extends HoodieTableInputFormat {

  private static final Logger LOG = LoggerFactory.getLogger(HoodieCopyOnWriteTableInputFormat.class);

  @Override
  protected boolean isSplitable(FileSystem fs, Path filename) {
    return !(filename instanceof PathWithBootstrapFileStatus);
  }

  @Override
  protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
    FileSplit split = new FileSplit(file, start, length, hosts);
    if (file instanceof PathWithBootstrapFileStatus) {
      return makeExternalFileSplit((PathWithBootstrapFileStatus) file, split);
    }
    return split;
  }

  @Override
  protected FileSplit makeSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) {
    FileSplit split = new FileSplit(file, start, length, hosts, inMemoryHosts);
    if (file instanceof PathWithBootstrapFileStatus) {
      return makeExternalFileSplit((PathWithBootstrapFileStatus) file, split);
    }
    return split;
  }

  @Override
  public FileStatus[] listStatus(JobConf job) throws IOException {
    // Segregate inputPaths[] into incremental, snapshot and non-Hudi paths
    List<String> incrementalTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(job));
    InputPathHandler inputPathHandler = new InputPathHandler(conf, getInputPaths(job), incrementalTables);
    List<FileStatus> returns = new ArrayList<>();

    Map<String, HoodieTableMetaClient> tableMetaClientMap = inputPathHandler.getTableMetaClientMap();
    // Process incremental pulls first
    for (String table : incrementalTables) {
      HoodieTableMetaClient metaClient = tableMetaClientMap.get(table);
      if (metaClient == null) {
        /* This can happen when the INCREMENTAL mode is set for a table but there were no InputPaths
         * in the jobConf */
        continue;
      }
      List<Path> inputPaths = inputPathHandler.getGroupedIncrementalPaths().get(metaClient);
      List<FileStatus> result = listStatusForIncrementalMode(job, metaClient, inputPaths, table);
      if (result != null) {
        returns.addAll(result);
      }
    }

    // Process non-Hudi paths next
    List<Path> nonHoodiePaths = inputPathHandler.getNonHoodieInputPaths();
    if (nonHoodiePaths.size() > 0) {
      setInputPaths(job, nonHoodiePaths.toArray(new Path[nonHoodiePaths.size()]));
      FileStatus[] fileStatuses = listStatusForNonHoodiePaths(job);
      returns.addAll(Arrays.asList(fileStatuses));
    }

    // Process snapshot queries next
    List<Path> snapshotPaths = inputPathHandler.getSnapshotPaths();
    if (snapshotPaths.size() > 0) {
      returns.addAll(listStatusForSnapshotMode(job, tableMetaClientMap, snapshotPaths));
    }
    return returns.toArray(new FileStatus[0]);
  }

  @Override
  public RecordReader<NullWritable, ArrayWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    throw new UnsupportedEncodingException("not implemented");
  }

  /**
   * Abstracts and exposes the {@link FileInputFormat#listStatus(JobConf)} operation to subclasses,
   * listing files (returning an array of {@link FileStatus}) corresponding to the input paths
   * specified in the provided {@link JobConf}.
   */
  protected final FileStatus[] doListStatus(JobConf job) throws IOException {
    return super.listStatus(job);
  }

  /**
   * Lists file statuses for the non-Hudi input paths.
   *
   * @param job the job configuration holding the input paths
   * @return file statuses for the non-Hudi paths
   * @throws IOException if the listing fails
   */
  public FileStatus[] listStatusForNonHoodiePaths(JobConf job) throws IOException {
    return doListStatus(job);
  }

  /**
   * Achieves listStatus functionality for an incrementally queried table. Instead of listing all
   * partitions and then filtering based on the commits of interest, this logic first extracts the
   * partitions touched by the desired commits and then lists only those partitions.
   */
  protected List<FileStatus> listStatusForIncrementalMode(JobConf job,
                                                          HoodieTableMetaClient tableMetaClient,
                                                          List<Path> inputPaths,
                                                          String incrementalTable) throws IOException {
    Job jobContext = Job.getInstance(job);
    Option<HoodieTimeline> timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient);
    if (!timeline.isPresent()) {
      return null;
    }
    Option<List<HoodieInstant>> commitsToCheck = HoodieInputFormatUtils.getCommitsForIncrementalQuery(jobContext, incrementalTable, timeline.get());
    if (!commitsToCheck.isPresent()) {
      return null;
    }
    Option<String> incrementalInputPaths = HoodieInputFormatUtils.getAffectedPartitions(commitsToCheck.get(), tableMetaClient, timeline.get(), inputPaths);
    // Mutate the JobConf to set the input paths to only partitions touched by incremental pull.
    if (!incrementalInputPaths.isPresent()) {
      return null;
    }
    setInputPaths(job, incrementalInputPaths.get());
    FileStatus[] fileStatuses = doListStatus(job);
    return HoodieInputFormatUtils.filterIncrementalFileStatus(jobContext, tableMetaClient, timeline.get(), fileStatuses, commitsToCheck.get());
  }

  protected FileStatus createFileStatusUnchecked(FileSlice fileSlice,
                                                 Option<HoodieInstant> latestCompletedInstantOpt,
                                                 String tableBasePath,
                                                 HoodieTableMetaClient metaClient) {
    Option<HoodieBaseFile> baseFileOpt = fileSlice.getBaseFile();
    if (baseFileOpt.isPresent()) {
      return getFileStatusUnchecked(baseFileOpt.get());
    } else {
      throw new IllegalStateException("Invalid state: base-file has to be present");
    }
  }

  private BootstrapBaseFileSplit makeExternalFileSplit(PathWithBootstrapFileStatus file, FileSplit split) {
    try {
      LOG.info("Making external data split for " + file);
      FileStatus externalFileStatus = file.getBootstrapFileStatus();
      FileSplit externalFileSplit = makeSplit(externalFileStatus.getPath(), 0, externalFileStatus.getLen(),
          new String[0], new String[0]);
      return new BootstrapBaseFileSplit(split, externalFileSplit);
    } catch (IOException e) {
      throw new HoodieIOException(e.getMessage(), e);
    }
  }

  @Nonnull
  private List<FileStatus> listStatusForSnapshotMode(JobConf job,
                                                     Map<String, HoodieTableMetaClient> tableMetaClientMap,
                                                     List<Path> snapshotPaths) {
    HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(HadoopFSUtils.getStorageConf(job));
    List<FileStatus> targetFiles = new ArrayList<>();

    TypedProperties props = new TypedProperties(new Properties());

    Map<HoodieTableMetaClient, List<Path>> groupedPaths =
        HoodieInputFormatUtils.groupSnapshotPathsByMetaClient(tableMetaClientMap.values(), snapshotPaths);

    for (Map.Entry<HoodieTableMetaClient, List<Path>> entry : groupedPaths.entrySet()) {
      HoodieTableMetaClient tableMetaClient = entry.getKey();
      List<Path> partitionPaths = entry.getValue();

      // The Hive job might specify a max commit instant up to which the table's state
      // should be examined. We simply pass it as the query's instant to the file-index.
      Option<String> queryCommitInstant =
          HoodieHiveUtils.getMaxCommit(job, tableMetaClient.getTableConfig().getTableName());

      boolean shouldIncludePendingCommits =
          HoodieHiveUtils.shouldIncludePendingCommits(job, tableMetaClient.getTableConfig().getTableName());

      if (HoodieTableMetadataUtil.isFilesPartitionAvailable(tableMetaClient)
          || conf.getBoolean(ENABLE.key(), ENABLE.defaultValue())) {
        HiveHoodieTableFileIndex fileIndex =
            new HiveHoodieTableFileIndex(
                engineContext,
                tableMetaClient,
                props,
                HoodieTableQueryType.SNAPSHOT,
                partitionPaths.stream().map(HadoopFSUtils::convertToStoragePath).collect(Collectors.toList()),
                queryCommitInstant,
                shouldIncludePendingCommits);

        Map<String, List<FileSlice>> partitionedFileSlices = fileIndex.listFileSlices();

        targetFiles.addAll(
            partitionedFileSlices.values()
                .stream()
                .flatMap(Collection::stream)
                .filter(fileSlice -> checkIfValidFileSlice(fileSlice))
                .map(fileSlice -> createFileStatusUnchecked(fileSlice, fileIndex.getLatestCompletedInstant(),
                    fileIndex.getBasePath().toString(), tableMetaClient))
                .collect(Collectors.toList())
        );
      } else {
        // If hoodie.metadata.enabled is set to false and the table has no metadata table,
        // read the table through the fs view cache instead of the file index,
        // since there is no file index for a non-metadata table.
        String basePath = tableMetaClient.getBasePath().toString();
        Map<HoodieTableMetaClient, HoodieTableFileSystemView> fsViewCache = new HashMap<>();
        HoodieTimeline timeline = getActiveTimeline(tableMetaClient, shouldIncludePendingCommits);
        Option<String> queryInstant = queryCommitInstant.or(() -> timeline.lastInstant().map(HoodieInstant::getTimestamp));
        validateInstant(timeline, queryInstant);

        try {
          HoodieTableFileSystemView fsView = fsViewCache.computeIfAbsent(tableMetaClient, hoodieTableMetaClient ->
              FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, hoodieTableMetaClient,
                  HoodieInputFormatUtils.buildMetadataConfig(job), timeline));

          List<FileSlice> filteredFileSlices = new ArrayList<>();

          for (Path p : entry.getValue()) {
            String relativePartitionPath = HadoopFSUtils.getRelativePartitionPath(new Path(basePath), p);

            List<FileSlice> fileSlices = queryInstant.map(
                instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relativePartitionPath, instant))
                .orElseGet(() -> fsView.getLatestFileSlices(relativePartitionPath))
                .collect(Collectors.toList());

            filteredFileSlices.addAll(fileSlices);
          }

          targetFiles.addAll(
              filteredFileSlices.stream()
                  .filter(fileSlice -> checkIfValidFileSlice(fileSlice))
                  .map(fileSlice -> createFileStatusUnchecked(fileSlice, timeline.filterCompletedInstants().lastInstant(),
                      basePath, tableMetaClient))
                  .collect(Collectors.toList()));
        } finally {
          fsViewCache.forEach(((metaClient, fsView) -> fsView.close()));
        }
      }
    }

    return targetFiles;
  }

  private static HoodieTimeline getActiveTimeline(HoodieTableMetaClient metaClient, boolean shouldIncludePendingCommits) {
    HoodieTimeline timeline = metaClient.getCommitsAndCompactionTimeline();
    if (shouldIncludePendingCommits) {
      return timeline;
    } else {
      return timeline.filterCompletedAndCompactionInstants();
    }
  }

  private static void validateInstant(HoodieTimeline activeTimeline, Option<String> queryInstant) {
    if (queryInstant.isPresent() && !activeTimeline.containsInstant(queryInstant.get())) {
      throw new HoodieIOException(String.format("Query instant (%s) not found in the timeline", queryInstant.get()));
    }
  }

  protected boolean checkIfValidFileSlice(FileSlice fileSlice) {
    Option<HoodieBaseFile> baseFileOpt = fileSlice.getBaseFile();
    Option<HoodieLogFile> latestLogFileOpt = fileSlice.getLatestLogFile();

    if (baseFileOpt.isPresent()) {
      return true;
    } else if (latestLogFileOpt.isPresent()) {
      // This happens when running a read-optimized query against a MOR table.
      return false;
    } else {
      throw new IllegalStateException("Invalid state: base-file has to be present for " + fileSlice.getFileId());
    }
  }

  @Nonnull
  protected static FileStatus getFileStatusUnchecked(HoodieBaseFile baseFile) {
    try {
      return HoodieInputFormatUtils.getFileStatus(baseFile);
    } catch (IOException ioe) {
      throw new HoodieIOException("Failed to get file-status", ioe);
    }
  }
}
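
Below is a minimal usage sketch (not part of the Hudi sources) showing how this input format can be driven from a plain MapReduce JobConf. The hoodie.<table>.consume.* property keys follow the convention read by HoodieHiveUtils but are an assumption here and may vary across Hudi versions; the table name, paths and instant values are placeholders, and the sketch assumes the format picks up its Hadoop Configuration via setConf (inherited from HoodieTableInputFormat).

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hudi.hadoop.HoodieCopyOnWriteTableInputFormat;

public class CowListingSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();

    // Snapshot read: point the job at the table's partition path(s) (placeholder path).
    FileInputFormat.setInputPaths(job, "/tmp/hudi/my_table/2024/01/01");

    // Incremental read (assumed property keys, per HoodieHiveUtils conventions):
    // switch the table to INCREMENTAL mode and bound the commits of interest.
    job.set("hoodie.my_table.consume.mode", "INCREMENTAL");
    job.set("hoodie.my_table.consume.start.timestamp", "20240101000000000");
    job.set("hoodie.my_table.consume.max.commits", "10");

    HoodieCopyOnWriteTableInputFormat inputFormat = new HoodieCopyOnWriteTableInputFormat();
    inputFormat.setConf(job);

    // listStatus() segregates incremental, snapshot and non-Hudi paths internally;
    // getSplits() then turns the surviving files into (bootstrap-aware) splits.
    FileStatus[] files = inputFormat.listStatus(job);
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    System.out.println(files.length + " files, " + splits.length + " splits");
  }
}

Note that getRecordReader() in the class above is intentionally unimplemented, so actual record reading goes through the format-specific implementations built on top of this base class rather than through it directly.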