
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.metadata;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.HoodieSparkExpressionIndex;
import org.apache.hudi.client.BaseHoodieWriteClient;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.utils.SparkMetadataWriterUtils;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.engine.EngineType;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.metrics.Registry;
import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
import org.apache.hudi.common.model.HoodieIndexDefinition;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.CommitUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.index.functional.HoodieExpressionIndex;
import org.apache.hudi.metrics.DistributedRegistry;
import org.apache.hudi.metrics.MetricsReporterType;
import org.apache.hudi.storage.StorageConfiguration;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.avro.Schema;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static org.apache.hudi.client.utils.SparkMetadataWriterUtils.readRecordsAsRows;
import static org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy.EAGER;
import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_BLOOM_FILTERS;
import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS;
public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetadataWriter<JavaRDD<HoodieRecord>> {
private static final Logger LOG = LoggerFactory.getLogger(SparkHoodieBackedTableMetadataWriter.class);
/**
* Returns a Spark-based implementation of {@code HoodieTableMetadataWriter} which can be used to
* write to the metadata table.
*
* If the metadata table does not exist, an attempt is made to bootstrap it, but there is no guarantee that
* the table will end up bootstrapped at this time.
*
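* <p>A minimal usage sketch, not a prescribed pattern; {@code jsc} is assumed to be an existing
* {@code JavaSparkContext}, {@code storageConf} an existing {@code StorageConfiguration}, and the
* base path below is purely illustrative:
* <pre>{@code
*   HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
*       .withPath("/tmp/hudi/example_table")   // hypothetical data table base path
*       .build();
*   HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
*   HoodieTableMetadataWriter metadataWriter =
*       SparkHoodieBackedTableMetadataWriter.create(storageConf, writeConfig, context);
* }</pre>
*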
* @param conf Storage configuration for the table.
* @param writeConfig Write config of the data table.
* @param context Engine context to use.
* @param inflightInstantTimestamp Timestamp of an instant which is in-progress. This instant is ignored while
* attempting to bootstrap the table.
* @return An instance of the {@code HoodieTableMetadataWriter}
*/
public static HoodieTableMetadataWriter create(StorageConfiguration<?> conf,
HoodieWriteConfig writeConfig,
HoodieEngineContext context,
Option<String> inflightInstantTimestamp) {
return new SparkHoodieBackedTableMetadataWriter(
conf, writeConfig, EAGER, context, inflightInstantTimestamp);
}
public static HoodieTableMetadataWriter create(StorageConfiguration<?> conf,
HoodieWriteConfig writeConfig,
HoodieFailedWritesCleaningPolicy failedWritesCleaningPolicy,
HoodieEngineContext context,
Option<String> inflightInstantTimestamp) {
return new SparkHoodieBackedTableMetadataWriter(
conf, writeConfig, failedWritesCleaningPolicy, context, inflightInstantTimestamp);
}
public static HoodieTableMetadataWriter create(StorageConfiguration<?> conf, HoodieWriteConfig writeConfig,
HoodieEngineContext context) {
return create(conf, writeConfig, context, Option.empty());
}
SparkHoodieBackedTableMetadataWriter(StorageConfiguration<?> hadoopConf,
HoodieWriteConfig writeConfig,
HoodieFailedWritesCleaningPolicy failedWritesCleaningPolicy,
HoodieEngineContext engineContext,
Option<String> inflightInstantTimestamp) {
super(hadoopConf, writeConfig, failedWritesCleaningPolicy, engineContext, inflightInstantTimestamp);
}
@Override
protected void initRegistry() {
if (metadataWriteConfig.isMetricsOn()) {
Registry registry;
if (metadataWriteConfig.isExecutorMetricsEnabled() && metadataWriteConfig.getMetricsReporterType() != MetricsReporterType.INMEMORY) {
registry = Registry.getRegistry("HoodieMetadata", DistributedRegistry.class.getName());
HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) engineContext;
((DistributedRegistry) registry).register(sparkEngineContext.getJavaSparkContext());
} else {
registry = Registry.getRegistry("HoodieMetadata");
}
this.metrics = Option.of(new HoodieMetadataMetrics(metadataWriteConfig.getMetricsConfig(), dataMetaClient.getStorage()));
} else {
this.metrics = Option.empty();
}
}
@Override
protected void commit(String instantTime, Map<String, HoodieData<HoodieRecord>> partitionRecordsMap) {
commitInternal(instantTime, partitionRecordsMap, false, Option.empty());
}
@Override
protected JavaRDD<HoodieRecord> convertHoodieDataToEngineSpecificData(HoodieData<HoodieRecord> records) {
return HoodieJavaRDD.getJavaRDD(records);
}
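/**
* Bulk-inserts the given records into a single metadata table partition, bucketing them into
* {@code fileGroupCount} file groups via {@link SparkHoodieMetadataBulkInsertPartitioner}.
*/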
@Override
protected void bulkCommit(
String instantTime, String partitionName, HoodieData<HoodieRecord> records,
int fileGroupCount) {
SparkHoodieMetadataBulkInsertPartitioner partitioner = new SparkHoodieMetadataBulkInsertPartitioner(fileGroupCount);
commitInternal(instantTime, Collections.singletonMap(partitionName, records), true, Option.of(partitioner));
}
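/**
* Drops the given metadata table partitions by issuing a DELETE_PARTITION operation through the
* metadata write client.
*/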
@Override
public void deletePartitions(String instantTime, List<MetadataPartitionType> partitions) {
List<String> partitionsToDrop = partitions.stream().map(MetadataPartitionType::getPartitionPath).collect(Collectors.toList());
LOG.info("Deleting Metadata Table partitions: {}", partitionsToDrop);
SparkRDDWriteClient writeClient = (SparkRDDWriteClient) getWriteClient();
String actionType = CommitUtils.getCommitActionType(WriteOperationType.DELETE_PARTITION, HoodieTableType.MERGE_ON_READ);
writeClient.startCommitWithTime(instantTime, actionType);
writeClient.deletePartitions(partitionsToDrop, instantTime);
}
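/**
* Builds expression index records for the given files: reads each file's records as rows, appends
* partition, relative file path and file size metadata to every row, applies the index expression to the
* source column, and converts the result into column-stats or bloom-filter metadata records.
*/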
@Override
protected HoodieData<HoodieRecord> getExpressionIndexRecords(List<Pair<String, Pair<String, Long>>> partitionFilePathAndSizeTriplet,
HoodieIndexDefinition indexDefinition,
HoodieTableMetaClient metaClient, int parallelism,
Schema readerSchema, StorageConfiguration<?> storageConf,
String instantTime) {
HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) engineContext;
if (indexDefinition.getSourceFields().isEmpty()) {
// In case there are no columns to index, bail
return sparkEngineContext.emptyHoodieData();
}
// NOTE: We are assuming that the index expression is operating on a single column
// HUDI-6994 will address this.
String columnToIndex = indexDefinition.getSourceFields().get(0);
SQLContext sqlContext = sparkEngineContext.getSqlContext();
// Read records and append expression index metadata to every row
HoodieData<Row> rowData = sparkEngineContext.parallelize(partitionFilePathAndSizeTriplet, parallelism)
.flatMap((SerializableFunction<Pair<String, Pair<String, Long>>, Iterator<Row>>) entry -> {
String partition = entry.getKey();
Pair<String, Long> filePathSizePair = entry.getValue();
String filePath = filePathSizePair.getKey();
String relativeFilePath = FSUtils.getRelativePartitionPath(metaClient.getBasePath(), new StoragePath(filePath));
long fileSize = filePathSizePair.getValue();
List<Row> rowsForFilePath = readRecordsAsRows(new StoragePath[] {new StoragePath(filePath)}, sqlContext, metaClient, readerSchema, dataWriteConfig,
FSUtils.isBaseFile(new StoragePath(filePath.substring(filePath.lastIndexOf("/") + 1))));
List<Row> rowsWithIndexMetadata = SparkMetadataWriterUtils.getRowsWithExpressionIndexMetadata(rowsForFilePath, partition, relativeFilePath, fileSize);
return rowsWithIndexMetadata.iterator();
});
// Generate dataset with expression index metadata
StructType structType = AvroConversionUtils.convertAvroSchemaToStructType(readerSchema)
.add(StructField.apply(HoodieExpressionIndex.HOODIE_EXPRESSION_INDEX_PARTITION, DataTypes.StringType, false, Metadata.empty()))
.add(StructField.apply(HoodieExpressionIndex.HOODIE_EXPRESSION_INDEX_RELATIVE_FILE_PATH, DataTypes.StringType, false, Metadata.empty()))
.add(StructField.apply(HoodieExpressionIndex.HOODIE_EXPRESSION_INDEX_FILE_SIZE, DataTypes.LongType, false, Metadata.empty()));
Dataset<Row> rowDataset = sparkEngineContext.getSqlContext().createDataFrame(HoodieJavaRDD.getJavaRDD(rowData).rdd(), structType);
// Apply expression index and generate the column to index
HoodieExpressionIndex<Column, Column> expressionIndex =
new HoodieSparkExpressionIndex(indexDefinition.getIndexName(), indexDefinition.getIndexFunction(), indexDefinition.getSourceFields(), indexDefinition.getIndexOptions());
Column indexedColumn = expressionIndex.apply(Collections.singletonList(rowDataset.col(columnToIndex)));
rowDataset = rowDataset.withColumn(columnToIndex, indexedColumn);
// Generate expression index records
if (indexDefinition.getIndexType().equalsIgnoreCase(PARTITION_NAME_COLUMN_STATS)) {
return SparkMetadataWriterUtils.getExpressionIndexRecordsUsingColumnStats(rowDataset, expressionIndex, columnToIndex);
} else if (indexDefinition.getIndexType().equalsIgnoreCase(PARTITION_NAME_BLOOM_FILTERS)) {
return SparkMetadataWriterUtils.getExpressionIndexRecordsUsingBloomFilter(rowDataset, columnToIndex, metadataWriteConfig, instantTime, indexDefinition.getIndexName());
} else {
throw new UnsupportedOperationException(indexDefinition.getIndexType() + " is not yet supported");
}
}
@Override
protected HoodieTable getTable(HoodieWriteConfig writeConfig, HoodieTableMetaClient metaClient) {
return HoodieSparkTable.create(writeConfig, engineContext, metaClient);
}
@Override
public BaseHoodieWriteClient<?, JavaRDD<HoodieRecord>, ?, ?> initializeWriteClient() {
return new SparkRDDWriteClient(engineContext, metadataWriteConfig, Option.empty());
}
@Override
protected EngineType getEngineType() {
return EngineType.SPARK;
}
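/**
* Creates deletion markers for the secondary index entries corresponding to the given record-key to
* secondary-key mapping, returned as a single-partition {@code HoodieData}.
*/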
@Override
public HoodieData<HoodieRecord> getDeletedSecondaryRecordMapping(HoodieEngineContext engineContext, Map<String, String> recordKeySecondaryKeyMap, HoodieIndexDefinition indexDefinition) {
HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) engineContext;
if (recordKeySecondaryKeyMap.isEmpty()) {
return sparkEngineContext.emptyHoodieData();
}
List<HoodieRecord> deletedRecords = new ArrayList<>();
recordKeySecondaryKeyMap.forEach((key, value) -> {
HoodieRecord siRecord = HoodieMetadataPayload.createSecondaryIndexRecord(key, value, indexDefinition.getIndexName(), true);
deletedRecords.add(siRecord);
});
return HoodieJavaRDD.of(deletedRecords, sparkEngineContext, 1);
}
}