/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi;

import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieTableQueryType;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.hadoop.CachingPath;
import org.apache.hudi.metadata.HoodieTableMetadata;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.apache.hudi.hadoop.CachingPath.createPathUnsafe;

/**
 * Common (engine-agnostic) File Index implementation enabling individual query engines to
 * list Hudi Table contents based on the
 *
 * <ul>
 *   <li>Table type (MOR, COW)</li>
 *   <li>Query type (snapshot, read_optimized, incremental)</li>
 *   <li>Query instant/range</li>
 * </ul>
 */
public abstract class BaseHoodieTableFileIndex implements AutoCloseable {

  private static final Logger LOG = LogManager.getLogger(BaseHoodieTableFileIndex.class);

  private final String[] partitionColumns;

  protected final HoodieMetadataConfig metadataConfig;

  private final HoodieTableQueryType queryType;
  private final Option<String> specifiedQueryInstant;
  protected final List<Path> queryPaths;

  private final boolean shouldIncludePendingCommits;
  private final boolean shouldValidateInstant;

  private final HoodieTableType tableType;
  protected final Path basePath;

  private final HoodieTableMetaClient metaClient;
  private final HoodieEngineContext engineContext;

  private final transient FileStatusCache fileStatusCache;

  protected transient volatile long cachedFileSize = 0L;
  protected transient volatile Map<PartitionPath, List<FileSlice>> cachedAllInputFileSlices;

  protected volatile boolean queryAsNonePartitionedTable = false;

  private transient volatile HoodieTableFileSystemView fileSystemView = null;

  private transient HoodieTableMetadata tableMetadata = null;

  /**
   * @param engineContext Hudi engine-specific context
   * @param metaClient Hudi table's meta-client
   * @param configProperties unifying configuration (in the form of generic properties)
   * @param queryType target query type
   * @param queryPaths target DFS paths being queried
   * @param specifiedQueryInstant instant as of which table is being queried
   * @param shouldIncludePendingCommits flags whether file-index should include any pending operations
   * @param shouldValidateInstant flags to validate whether query instant is present in the timeline
   * @param fileStatusCache transient cache of fetched {@code FileStatus}es
   */
  public BaseHoodieTableFileIndex(HoodieEngineContext engineContext,
                                  HoodieTableMetaClient metaClient,
                                  TypedProperties configProperties,
                                  HoodieTableQueryType queryType,
                                  List<Path> queryPaths,
                                  Option<String> specifiedQueryInstant,
                                  boolean shouldIncludePendingCommits,
                                  boolean shouldValidateInstant,
                                  FileStatusCache fileStatusCache) {
    this.partitionColumns = metaClient.getTableConfig().getPartitionFields()
        .orElse(new String[0]);

    this.metadataConfig = HoodieMetadataConfig.newBuilder()
        .fromProperties(configProperties)
        .build();

    this.queryType = queryType;
    this.queryPaths = queryPaths;
    this.specifiedQueryInstant = specifiedQueryInstant;
    this.shouldIncludePendingCommits = shouldIncludePendingCommits;
    this.shouldValidateInstant = shouldValidateInstant;

    this.tableType = metaClient.getTableType();
    this.basePath = metaClient.getBasePathV2();

    this.metaClient = metaClient;
    this.engineContext = engineContext;
    this.fileStatusCache = fileStatusCache;

    doRefresh();
  }

  protected abstract Object[] parsePartitionColumnValues(String[] partitionColumns, String partitionPath);

  /**
   * Returns latest completed instant as seen by this instance of the file-index.
   */
  public Option<HoodieInstant> getLatestCompletedInstant() {
    return getActiveTimeline().filterCompletedInstants().lastInstant();
  }

  /**
   * Returns table's base-path.
   */
  public String getBasePath() {
    return basePath.toString();
  }
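  // -------------------------------------------------------------------------------------------
  // Editor's illustrative sketch (not part of the original class): a minimal engine-specific
  // subclass wiring up the abstract extension point `parsePartitionColumnValues`. The subclass
  // name `SimpleHoodieFileIndex`, the naive value parsing, and `InMemoryFileStatusCache` (see
  // the cache sketch further below) are hypothetical; real integrations (e.g. Spark's
  // HoodieFileIndex) apply engine-native type coercion to the parsed partition values.
  //
  //   class SimpleHoodieFileIndex extends BaseHoodieTableFileIndex {
  //     SimpleHoodieFileIndex(HoodieEngineContext ctx, HoodieTableMetaClient metaClient,
  //                           TypedProperties props, List<Path> queryPaths) {
  //       super(ctx, metaClient, props, HoodieTableQueryType.SNAPSHOT, queryPaths,
  //           Option.empty(), false, false, new InMemoryFileStatusCache());
  //     }
  //
  //     @Override
  //     protected Object[] parsePartitionColumnValues(String[] partitionColumns, String partitionPath) {
  //       // Naively split "col1=a/col2=b" style partition paths into their raw string values
  //       return Arrays.stream(partitionPath.split("/"))
  //           .map(s -> s.contains("=") ? s.substring(s.indexOf('=') + 1) : s)
  //           .toArray();
  //     }
  //   }
  // -------------------------------------------------------------------------------------------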
  /**
   * Fetch list of latest base files and log files per partition.
   *
   * @return mapping from string partition paths to their base/log files
   */
  public Map<String, List<FileSlice>> listFileSlices() {
    return cachedAllInputFileSlices.entrySet()
        .stream()
        .collect(Collectors.toMap(e -> e.getKey().path, Map.Entry::getValue));
  }

  public int getFileSlicesCount() {
    return cachedAllInputFileSlices.values().stream()
        .mapToInt(List::size).sum();
  }

  @Override
  public void close() throws Exception {
    resetTableMetadata(null);
  }

  protected List<PartitionPath> getAllQueryPartitionPaths() {
    List<String> queryRelativePartitionPaths = queryPaths.stream()
        .map(path -> FSUtils.getRelativePartitionPath(basePath, path))
        .collect(Collectors.toList());

    // Load all the partition paths from the basePath, and filter by the query partition paths.
    // TODO load files from the queryRelativePartitionPaths directly.
    List<String> matchedPartitionPaths = getAllPartitionPathsUnchecked()
        .stream()
        .filter(path -> queryRelativePartitionPaths.stream().anyMatch(path::startsWith))
        .collect(Collectors.toList());

    // Convert each partition's path into a partition descriptor
    return matchedPartitionPaths.stream()
        .map(partitionPath -> {
          Object[] partitionColumnValues = parsePartitionColumnValues(partitionColumns, partitionPath);
          return new PartitionPath(partitionPath, partitionColumnValues);
        })
        .collect(Collectors.toList());
  }

  protected void refresh() {
    fileStatusCache.invalidate();
    doRefresh();
  }

  protected HoodieTimeline getActiveTimeline() {
    // NOTE: We have to use the commits and compactions timeline, to make sure that we're properly
    //       handling the following case: when records are inserted into a new log-file w/in a file-group
    //       that is under the pending compaction process, the new log-file will bear the compaction's instant
    //       (on the timeline) in its name, as opposed to the base-file's commit instant. To make sure we're
    //       not filtering out such log-files we have to _always_ include pending compaction instants into
    //       consideration
    // TODO(HUDI-3302) re-evaluate whether we should filter any commits in here
    HoodieTimeline timeline = metaClient.getCommitsAndCompactionTimeline();
    if (shouldIncludePendingCommits) {
      return timeline;
    } else {
      return timeline.filterCompletedAndCompactionInstants();
    }
  }
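  // Editor's illustrative note (not part of the original class): a worked example of the
  // timeline filtering above, using hypothetical instant timestamps. Given a timeline of
  //     001.commit (COMPLETED), 002.deltacommit (COMPLETED), 003.compaction (REQUESTED)
  // `filterCompletedAndCompactionInstants()` retains 001 and 002 _and_ the pending compaction
  // 003, so log-files written into the compacting file-group (whose names carry instant 003)
  // are not filtered out, matching the NOTE in `getActiveTimeline` above.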
  /**
   * Load all partition paths and their files under the query table path.
   */
  private Map<PartitionPath, FileStatus[]> loadPartitionPathFiles() {
    // List files in all partition paths
    List<PartitionPath> pathToFetch = new ArrayList<>();
    Map<PartitionPath, FileStatus[]> cachedPartitionToFiles = new HashMap<>();

    // Fetch from the FileStatusCache
    List<PartitionPath> partitionPaths = getAllQueryPartitionPaths();
    partitionPaths.forEach(partitionPath -> {
      Option<FileStatus[]> filesInPartition = fileStatusCache.get(partitionPath.fullPartitionPath(basePath));
      if (filesInPartition.isPresent()) {
        cachedPartitionToFiles.put(partitionPath, filesInPartition.get());
      } else {
        pathToFetch.add(partitionPath);
      }
    });

    Map<PartitionPath, FileStatus[]> fetchedPartitionToFiles;

    if (pathToFetch.isEmpty()) {
      fetchedPartitionToFiles = Collections.emptyMap();
    } else {
      Map<String, PartitionPath> fullPartitionPathsMapToFetch = pathToFetch.stream()
          .collect(Collectors.toMap(
              partitionPath -> partitionPath.fullPartitionPath(basePath).toString(),
              Function.identity())
          );

      fetchedPartitionToFiles =
          getAllFilesInPartitionsUnchecked(fullPartitionPathsMapToFetch.keySet())
              .entrySet()
              .stream()
              .collect(Collectors.toMap(e -> fullPartitionPathsMapToFetch.get(e.getKey()), e -> e.getValue()));
    }

    // Update the fileStatusCache
    fetchedPartitionToFiles.forEach((partitionPath, filesInPartition) -> {
      fileStatusCache.put(partitionPath.fullPartitionPath(basePath), filesInPartition);
    });

    return CollectionUtils.combine(cachedPartitionToFiles, fetchedPartitionToFiles);
  }
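  // -------------------------------------------------------------------------------------------
  // Editor's illustrative sketch (not part of the original class): the simplest possible
  // FileStatusCache that loadPartitionPathFiles() above could consult. The HashMap-backed
  // implementation and its name are assumptions for demonstration; engine integrations
  // typically delegate to their own listing caches (e.g. Spark's FileStatusCache).
  //
  //   class InMemoryFileStatusCache implements FileStatusCache {
  //     private final Map<Path, FileStatus[]> cache = new java.util.concurrent.ConcurrentHashMap<>();
  //
  //     @Override
  //     public Option<FileStatus[]> get(Path path) {
  //       // Cache miss surfaces as Option.empty(), prompting a fetch via the table metadata
  //       return Option.ofNullable(cache.get(path));
  //     }
  //
  //     @Override
  //     public void put(Path path, FileStatus[] leafFiles) {
  //       cache.put(path, leafFiles);
  //     }
  //
  //     @Override
  //     public void invalidate() {
  //       cache.clear();
  //     }
  //   }
  // -------------------------------------------------------------------------------------------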
  private void doRefresh() {
    long startTime = System.currentTimeMillis();

    HoodieTableMetadata newTableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, basePath.toString(),
        FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue());

    resetTableMetadata(newTableMetadata);

    Map<PartitionPath, FileStatus[]> partitionFiles = loadPartitionPathFiles();
    FileStatus[] allFiles = partitionFiles.values().stream().flatMap(Arrays::stream).toArray(FileStatus[]::new);

    metaClient.reloadActiveTimeline();
    HoodieTimeline activeTimeline = getActiveTimeline();
    Option<HoodieInstant> latestInstant = activeTimeline.lastInstant();

    // TODO we can optimize the flow by:
    //  - First fetching the list of files from instants of interest
    //  - Then loading the FileStatus's
    this.fileSystemView = new HoodieTableFileSystemView(metaClient, activeTimeline, allFiles);

    Option<String> queryInstant = specifiedQueryInstant.or(() -> latestInstant.map(HoodieInstant::getTimestamp));

    validate(activeTimeline, queryInstant);

    // NOTE: For a MOR table, when the compaction is inflight, we need to not only fetch the
    // latest slices, but also include the base and log files of the second-last version of
    // the file slice in the same file group as the latest file slice that is under compaction.
    // This logic is realized by the `AbstractTableFileSystemView::getLatestMergedFileSlicesBeforeOrOn`
    // API. Note that for a COW table, the merging logic of two slices does not happen as there
    // is no compaction, thus there is no performance impact.
    cachedAllInputFileSlices = partitionFiles.keySet().stream()
        .collect(Collectors.toMap(
                Function.identity(),
                partitionPath ->
                    queryInstant.map(instant ->
                            fileSystemView.getLatestMergedFileSlicesBeforeOrOn(partitionPath.path, queryInstant.get())
                        )
                        .orElse(fileSystemView.getLatestFileSlices(partitionPath.path))
                        .collect(Collectors.toList())
            )
        );

    cachedFileSize = cachedAllInputFileSlices.values().stream()
        .flatMap(Collection::stream)
        .mapToLong(BaseHoodieTableFileIndex::fileSliceSize)
        .sum();

    // If the partition value contains InternalRow.empty, we query it as a non-partitioned table.
    queryAsNonePartitionedTable = partitionFiles.keySet().stream().anyMatch(p -> p.values.length == 0);

    long duration = System.currentTimeMillis() - startTime;

    LOG.info(String.format("Refresh table %s, spent: %d ms", metaClient.getTableConfig().getTableName(), duration));
  }

  private Map<String, FileStatus[]> getAllFilesInPartitionsUnchecked(Collection<String> fullPartitionPathsMapToFetch) {
    try {
      return tableMetadata.getAllFilesInPartitions(new ArrayList<>(fullPartitionPathsMapToFetch));
    } catch (IOException e) {
      throw new HoodieIOException("Failed to list partition paths for a table", e);
    }
  }

  private List<String> getAllPartitionPathsUnchecked() {
    try {
      return isPartitionedTable() ? tableMetadata.getAllPartitionPaths() : Collections.singletonList("");
    } catch (IOException e) {
      throw new HoodieIOException("Failed to fetch partition paths for a table", e);
    }
  }

  private void validate(HoodieTimeline activeTimeline, Option<String> queryInstant) {
    if (shouldValidateInstant) {
      if (queryInstant.isPresent() && !activeTimeline.containsInstant(queryInstant.get())) {
        throw new HoodieIOException(String.format("Query instant (%s) not found in the timeline", queryInstant.get()));
      }
    }
  }

  private static long fileSliceSize(FileSlice fileSlice) {
    long logFileSize = fileSlice.getLogFiles().map(HoodieLogFile::getFileSize)
        .filter(s -> s > 0)
        .reduce(0L, Long::sum);

    return fileSlice.getBaseFile().map(BaseFile::getFileLen).orElse(0L) + logFileSize;
  }

  private void resetTableMetadata(HoodieTableMetadata newTableMetadata) {
    if (tableMetadata != null) {
      try {
        tableMetadata.close();
      } catch (Exception e) {
        throw new HoodieException("Failed to close HoodieTableMetadata instance", e);
      }
    }
    tableMetadata = newTableMetadata;
  }

  private boolean isPartitionedTable() {
    return partitionColumns.length > 0 || HoodieTableMetadata.isMetadataTable(basePath.toString());
  }

  public static final class PartitionPath {

    final String path;
    final Object[] values;

    public PartitionPath(String path, Object[] values) {
      this.path = path;
      this.values = values;
    }

    public String getPath() {
      return path;
    }

    Path fullPartitionPath(Path basePath) {
      if (!path.isEmpty()) {
        // NOTE: Since we know that the path is a proper relative path that doesn't require
        //       normalization, we create Hadoop's Path using the more performant unsafe variant
        return new CachingPath(basePath, createPathUnsafe(path));
      }
      return basePath;
    }

    @Override
    public boolean equals(Object other) {
      return other instanceof PartitionPath
          && Objects.equals(path, ((PartitionPath) other).path)
          && Arrays.equals(values, ((PartitionPath) other).values);
    }

    @Override
    public int hashCode() {
      return path.hashCode() * 1103 + Arrays.hashCode(values);
    }
  }

  protected interface FileStatusCache {
    Option<FileStatus[]> get(Path path);

    void put(Path path, FileStatus[] leafFiles);

    void invalidate();
  }
}
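// Editor's illustrative usage sketch (not part of the original file). Assuming a concrete
// subclass such as the hypothetical SimpleHoodieFileIndex sketched earlier, a caller would
// typically construct the index (the constructor eagerly populates the cached listing via
// doRefresh()), read the cached file slices, and close it to release the table metadata:
//
//   try (BaseHoodieTableFileIndex index = new SimpleHoodieFileIndex(ctx, metaClient, props,
//       Collections.singletonList(metaClient.getBasePathV2()))) {
//     // listFileSlices() serves the partition-to-slices mapping computed during refresh
//     index.listFileSlices().forEach((partition, slices) ->
//         slices.forEach(slice -> System.out.println(partition + " -> " + slice.getFileId())));
//   }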