/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.utils;

import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.validator.SparkPreCommitValidator;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.view.HoodieTablePreCommitFileSystemView;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.util.JavaScalaConverters;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
* Spark validator utils to verify and run any pre-commit validators configured.
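*
* <p>A minimal configuration sketch (illustrative only; exact config keys and validator classes depend on the Hudi version in use):
* <pre>{@code
*   // Comma-separated validator class names read by runValidators(...)
*   hoodie.precommit.validators=org.apache.hudi.client.validator.SqlQueryEqualityPreCommitValidator
*   // Queries that must return equal results before and after the commit (supported by the equality validator)
*   hoodie.precommit.validators.equality.sql.queries=select count(*) from <TABLE_NAME>
* }</pre>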
*/
public class SparkValidatorUtils {
private static final Logger LOG = LoggerFactory.getLogger(SparkValidatorUtils.class);

/**
* Check configured pre-commit validators and run them. Note that this only works for COW (copy-on-write) tables.
*
* Throws a {@link HoodieValidationException} if any validation fails.
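*
* <p>A minimal call sketch (illustrative; assumes the surrounding commit action already produced {@code writeMetadata}):
* <pre>{@code
*   SparkValidatorUtils.runValidators(writeConfig, writeMetadata, engineContext, hoodieTable, instantTime);
* }</pre>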
*/
public static void runValidators(HoodieWriteConfig config,
HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata,
HoodieEngineContext context,
HoodieTable table,
String instantTime) {
if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) {
LOG.info("no validators configured.");
} else {
if (!writeMetadata.getWriteStats().isPresent()) {
writeMetadata.setWriteStats(writeMetadata.getWriteStatuses().map(WriteStatus::getStat).collectAsList());
}
Set<String> partitionsModified = writeMetadata.getWriteStats().get().stream().map(HoodieWriteStat::getPartitionPath).collect(Collectors.toSet());
SQLContext sqlContext = new SQLContext(HoodieSparkEngineContext.getSparkContext(context));
// Refresh the timeline to ensure validators see any other operations done on the timeline (async operations such as other clustering/compaction/rollback)
table.getMetaClient().reloadActiveTimeline();
Dataset<Row> afterState = getRecordsFromPendingCommits(sqlContext, partitionsModified, writeMetadata, table, instantTime);
Dataset<Row> beforeState = getRecordsFromCommittedFiles(sqlContext, partitionsModified, table, afterState.schema());
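// Instantiate each configured validator via reflection; getPreCommitValidators() returns a comma-separated list of class names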
Stream<SparkPreCommitValidator> validators = Arrays.stream(config.getPreCommitValidators().split(","))
.map(validatorClass -> ((SparkPreCommitValidator) ReflectionUtils.loadClass(validatorClass,
new Class<?>[] {HoodieSparkTable.class, HoodieEngineContext.class, HoodieWriteConfig.class},
table, context, config)));
boolean allSuccess = validators.map(v -> runValidatorAsync(v, writeMetadata, beforeState, afterState, instantTime)).map(CompletableFuture::join)
.reduce(true, Boolean::logicalAnd);
if (allSuccess) {
LOG.info("All validations succeeded");
} else {
LOG.error("At least one pre-commit validation failed");
throw new HoodieValidationException("At least one pre-commit validation failed");
}
}
}
/**
* Run validators in a separate thread pool for parallelism. Each validator can submit a distributed Spark job if needed.
*/
private static CompletableFuture<Boolean> runValidatorAsync(SparkPreCommitValidator validator, HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata,
Dataset<Row> beforeState, Dataset<Row> afterState, String instantTime) {
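// No executor is supplied, so the future runs on the common ForkJoinPool; each validator may still launch distributed Spark jobs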
return CompletableFuture.supplyAsync(() -> {
try {
validator.validate(instantTime, writeMetadata, beforeState, afterState);
LOG.info("validation complete for " + validator.getClass().getName());
return true;
} catch (HoodieValidationException e) {
LOG.error("validation failed for " + validator.getClass().getName(), e);
return false;
}
});
}
/**
* Get records from the modified partitions as a dataset.
* Note that this only works for COW (copy-on-write) tables.
*
* @param sqlContext Spark {@link SQLContext} instance.
* @param partitionsAffected A set of affected partitions.
* @param table {@link HoodieTable} instance.
* @param newStructTypeSchema The {@link StructType} schema from the after state (current transaction).
* @return The records from the committed base files as a {@code Dataset<Row>}.
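*
* <p>Usage sketch (illustrative; {@code afterState} is the dataset built from the pending commit):
* <pre>{@code
*   Dataset<Row> beforeState = SparkValidatorUtils.getRecordsFromCommittedFiles(
*       sqlContext, partitionsModified, table, afterState.schema());
* }</pre>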
*/
public static Dataset<Row> getRecordsFromCommittedFiles(SQLContext sqlContext,
Set<String> partitionsAffected,
HoodieTable table,
StructType newStructTypeSchema) {
List<String> committedFiles = partitionsAffected.stream()
.flatMap(partition -> table.getBaseFileOnlyView().getLatestBaseFiles(partition).map(BaseFile::getPath))
.collect(Collectors.toList());
if (committedFiles.isEmpty()) {
try {
return sqlContext.createDataFrame(
sqlContext.emptyDataFrame().rdd(),
AvroConversionUtils.convertAvroSchemaToStructType(
new TableSchemaResolver(table.getMetaClient()).getTableAvroSchema()));
} catch (Exception e) {
LOG.warn("Cannot get table schema from before state.", e);
LOG.warn("Use the schema from after state (current transaction) to create the empty Spark "
+ "dataframe: " + newStructTypeSchema);
return sqlContext.createDataFrame(
sqlContext.emptyDataFrame().rdd(), newStructTypeSchema);
}
}
return readRecordsForBaseFiles(sqlContext, committedFiles);
}
/**
* Get records from the specified list of base data files.
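*
* <p>For example (illustrative file paths):
* <pre>{@code
*   Dataset<Row> rows = SparkValidatorUtils.readRecordsForBaseFiles(sqlContext,
*       Arrays.asList("/tmp/hudi/2021/01/01/base-file-1.parquet", "/tmp/hudi/2021/01/02/base-file-2.parquet"));
* }</pre>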
*/
public static Dataset<Row> readRecordsForBaseFiles(SQLContext sqlContext, List<String> baseFilePaths) {
return sqlContext.read().parquet(JavaScalaConverters.convertJavaListToScalaSeq(baseFilePaths));
}
/**
* Get records from the modified partitions, including any inflight commits.
* Note that this only works for COW (copy-on-write) tables.
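*
* <p>Usage sketch (illustrative; mirrors how {@link #runValidators} builds the "after" state):
* <pre>{@code
*   Dataset<Row> afterState = SparkValidatorUtils.getRecordsFromPendingCommits(
*       sqlContext, partitionsModified, writeMetadata, table, instantTime);
* }</pre>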
*/
public static Dataset<Row> getRecordsFromPendingCommits(SQLContext sqlContext,
Set<String> partitionsAffected,
HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata,
HoodieTable table,
String instantTime) {
// build file system view with pending commits
HoodieTablePreCommitFileSystemView fsView = new HoodieTablePreCommitFileSystemView(table.getMetaClient(),
table.getHoodieView(),
writeMetadata.getWriteStats().get(),
writeMetadata.getPartitionToReplaceFileIds(),
instantTime);
List<String> newFiles = partitionsAffected.stream()
.flatMap(partition -> fsView.getLatestBaseFiles(partition).map(BaseFile::getPath))
.collect(Collectors.toList());
if (newFiles.isEmpty()) {
return sqlContext.emptyDataFrame();
}
return readRecordsForBaseFiles(sqlContext, newFiles);
}
}