// org.apache.hudi.index.simple.HoodieGlobalSimpleIndex (Maven / Gradle / Ivy)
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.index.simple;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.data.HoodiePairData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordGlobalLocation;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.io.HoodieKeyLocationFetchHandle;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.table.HoodieTable;
import java.util.List;
import static org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ;
import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions;
import static org.apache.hudi.index.HoodieIndexUtils.tagGlobalLocationBackToRecords;
/**
* A global simple index which reads interested fields(record key and partition path) from base files and
* joins with incoming records to find the tagged location.
*/
public class HoodieGlobalSimpleIndex extends HoodieSimpleIndex {
public HoodieGlobalSimpleIndex(HoodieWriteConfig config, Option keyGeneratorOpt) {
super(config, keyGeneratorOpt);
}
@Override
public HoodieData> tagLocation(
HoodieData> records, HoodieEngineContext context,
HoodieTable hoodieTable) {
return tagLocationInternal(records, context, hoodieTable);
}
/**
* Tags records location for incoming records.
*
* @param inputRecords {@link HoodieData} of incoming records
* @param context instance of {@link HoodieEngineContext} to use
* @param hoodieTable instance of {@link HoodieTable} to use
* @return {@link HoodieData} of records with record locations set
*/
@Override
protected HoodieData> tagLocationInternal(
HoodieData> inputRecords, HoodieEngineContext context,
HoodieTable hoodieTable) {
List> latestBaseFiles = getAllBaseFilesInTable(context, hoodieTable);
int configuredSimpleIndexParallelism = config.getGlobalSimpleIndexParallelism();
int fetchParallelism =
configuredSimpleIndexParallelism > 0 ? configuredSimpleIndexParallelism : inputRecords.deduceNumPartitions();
HoodiePairData allKeysAndLocations =
fetchRecordGlobalLocations(context, hoodieTable, fetchParallelism, latestBaseFiles);
boolean mayContainDuplicateLookup = hoodieTable.getMetaClient().getTableType() == MERGE_ON_READ;
boolean shouldUpdatePartitionPath = config.getGlobalSimpleIndexUpdatePartitionPath() && hoodieTable.isPartitioned();
return tagGlobalLocationBackToRecords(inputRecords, allKeysAndLocations,
mayContainDuplicateLookup, shouldUpdatePartitionPath, config, hoodieTable);
}
private HoodiePairData fetchRecordGlobalLocations(
HoodieEngineContext context, HoodieTable hoodieTable, int parallelism,
List> baseFiles) {
int fetchParallelism = Math.max(1, Math.min(baseFiles.size(), parallelism));
return context.parallelize(baseFiles, fetchParallelism)
.flatMap(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile, keyGeneratorOpt)
.globalLocations().iterator())
.mapToPair(e -> (Pair) e);
}
/**
* Load all files for all partitions as pair data.
*/
private List> getAllBaseFilesInTable(
final HoodieEngineContext context, final HoodieTable hoodieTable) {
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
List allPartitionPaths = FSUtils.getAllPartitionPaths(context, metaClient.getStorage(), config.getMetadataConfig(), metaClient.getBasePath());
// Obtain the latest data files from all the partitions.
return getLatestBaseFilesForAllPartitions(allPartitionPaths, context, hoodieTable);
}
@Override
public boolean isGlobal() {
return true;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy