/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hudi.client;

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
import org.apache.hudi.client.utils.SparkReleaseResources;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.metrics.Registry;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hadoop.fs.HoodieWrapperFileSystem;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.SparkHoodieIndexFactory;
import org.apache.hudi.metadata.HoodieTableMetadataWriter;
import org.apache.hudi.metadata.MetadataPartitionType;
import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter;
import org.apache.hudi.metrics.DistributedRegistry;
import org.apache.hudi.metrics.HoodieMetrics;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.upgrade.SparkUpgradeDowngradeHelper;

import com.codahale.metrics.Timer;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;

@SuppressWarnings("checkstyle:LineLength")
public class SparkRDDWriteClient<T> extends
    BaseHoodieWriteClient<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {

  private static final Logger LOG = LoggerFactory.getLogger(SparkRDDWriteClient.class);

  public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) {
    this(context, clientConfig, Option.empty());
  }

  public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig,
                             Option<EmbeddedTimelineService> timelineService) {
    super(context, writeConfig, timelineService, SparkUpgradeDowngradeHelper.getInstance());
    this.tableServiceClient = new SparkRDDTableServiceClient<T>(context, writeConfig, getTimelineServer());
  }
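
  /*
   * A minimal construction sketch, assuming a local Spark context; the base path,
   * table name, and schema string below are hypothetical placeholders, and
   * HoodieAvroPayload (org.apache.hudi.common.model) is just one common payload choice:
   *
   *   SparkConf sparkConf = new SparkConf().setAppName("hudi-client").setMaster("local[2]");
   *   JavaSparkContext jsc = new JavaSparkContext(sparkConf);
   *   HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
   *       .withPath("/tmp/hudi_trips")        // hypothetical base path
   *       .forTable("trips")                  // hypothetical table name
   *       .withSchema(tripSchemaString)       // Avro schema of the records being written
   *       .build();
   *   try (SparkRDDWriteClient<HoodieAvroPayload> client =
   *            new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg)) {
   *     // issue writes; see the sketches after the individual methods below
   *   }
   */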

  @Override
  protected HoodieIndex<?, ?> createIndex(HoodieWriteConfig writeConfig) {
    return SparkHoodieIndexFactory.createIndex(config);
  }

  /**
   * Completes the changes performed at the given instantTime marker with the specified action.
   */
  @Override
  public boolean commit(String instantTime, JavaRDD<WriteStatus> writeStatuses, Option<Map<String, String>> extraMetadata,
                        String commitActionType, Map<String, List<String>> partitionToReplacedFileIds,
                        Option<BiConsumer<HoodieTableMetaClient, HoodieCommitMetadata>> extraPreCommitFunc) {
    context.setJobStatus(this.getClass().getSimpleName(), "Committing stats: " + config.getTableName());
    List<HoodieWriteStat> writeStats = writeStatuses.map(WriteStatus::getStat).collect();
    return commitStats(instantTime, writeStats, extraMetadata, commitActionType, partitionToReplacedFileIds, extraPreCommitFunc);
  }

  @Override
  protected HoodieTable createTable(HoodieWriteConfig config) {
    return createTableAndValidate(config, HoodieSparkTable::create);
  }

  @Override
  protected HoodieTable createTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
    return createTableAndValidate(config, metaClient, HoodieSparkTable::create);
  }

  @Override
  public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
    // Create a Hoodie table which encapsulates the commits and files visible
    HoodieSparkTable<T> table = HoodieSparkTable.create(config, context);
    Timer.Context indexTimer = metrics.getIndexCtx();
    JavaRDD<HoodieRecord<T>> recordsWithLocation = HoodieJavaRDD.getJavaRDD(
        getIndex().tagLocation(HoodieJavaRDD.of(hoodieRecords), context, table));
    metrics.updateIndexMetrics(LOOKUP_STR, metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop()));
    return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
  }
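
  /*
   * A de-duplication sketch built on filterExists, assuming a client and Spark context
   * as in the construction sketch above: filterExists keeps only records whose index
   * location is unknown, i.e. keys not already present in the table.
   *
   *   JavaRDD<HoodieRecord<HoodieAvroPayload>> incoming = jsc.parallelize(recordList);
   *   JavaRDD<HoodieRecord<HoodieAvroPayload>> newOnly = client.filterExists(incoming);
   *   String instantTime = client.startCommit();
   *   client.commit(instantTime, client.insert(newOnly, instantTime));
   */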

  /**
   * Main API to run bootstrap of an existing table into Hudi.
   */
  @Override
  public void bootstrap(Option<Map<String, String>> extraMetadata) {
    initTable(WriteOperationType.UPSERT, Option.ofNullable(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS)).bootstrap(context, extraMetadata);
  }

  @Override
  public JavaRDD<WriteStatus> upsert(JavaRDD<HoodieRecord<T>> records, String instantTime) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table =
        initTable(WriteOperationType.UPSERT, Option.ofNullable(instantTime));
    maybeDisableWriteRecordPositions(table.getMetaClient());
    table.validateUpsertSchema();
    preWrite(instantTime, WriteOperationType.UPSERT, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.upsert(context, instantTime, HoodieJavaRDD.of(records));
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    if (result.getSourceReadAndIndexDurationMs().isPresent()) {
      metrics.updateSourceReadAndIndexMetrics(HoodieMetrics.DURATION_STR, result.getSourceReadAndIndexDurationMs().get());
    }
    return postWrite(resultRDD, instantTime, table);
  }
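
  /*
   * A typical upsert round-trip, sketched assuming explicit (non-auto) commits: request
   * an instant, write, inspect the per-file WriteStatus results, then commit or roll back.
   *
   *   String instantTime = client.startCommit();
   *   JavaRDD<WriteStatus> statuses = client.upsert(records, instantTime);
   *   if (statuses.filter(WriteStatus::hasErrors).isEmpty()) {
   *     client.commit(instantTime, statuses);  // seals the instant on the timeline
   *   } else {
   *     client.rollback(instantTime);          // discard the failed write
   *   }
   */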

  @Override
  public JavaRDD<WriteStatus> upsertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords, String instantTime) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table =
        initTable(WriteOperationType.UPSERT_PREPPED, Option.ofNullable(instantTime));
    maybeDisableWriteRecordPositions(table.getMetaClient());
    table.validateUpsertSchema();
    preWrite(instantTime, WriteOperationType.UPSERT_PREPPED, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.upsertPrepped(context, instantTime, HoodieJavaRDD.of(preppedRecords));
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return postWrite(resultRDD, instantTime, table);
  }

  @Override
  public JavaRDD<WriteStatus> insert(JavaRDD<HoodieRecord<T>> records, String instantTime) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table =
        initTable(WriteOperationType.INSERT, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.INSERT, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.insert(context, instantTime, HoodieJavaRDD.of(records));
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return postWrite(resultRDD, instantTime, table);
  }

  @Override
  public JavaRDD<WriteStatus> insertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords, String instantTime) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table =
        initTable(WriteOperationType.INSERT_PREPPED, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.INSERT_PREPPED, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.insertPrepped(context, instantTime, HoodieJavaRDD.of(preppedRecords));
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return postWrite(resultRDD, instantTime, table);
  }

  /**
   * Removes all existing records from the partitions affected and inserts the given HoodieRecords into the table.
   *
   * @param records     HoodieRecords to insert
   * @param instantTime Instant time of the commit
   * @return HoodieWriteResult carrying the RDD of WriteStatus (to inspect errors and counts) plus the replaced file IDs
   */
  public HoodieWriteResult insertOverwrite(JavaRDD<HoodieRecord<T>> records, final String instantTime) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table = initTable(WriteOperationType.INSERT_OVERWRITE, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.insertOverwrite(context, instantTime, HoodieJavaRDD.of(records));
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return new HoodieWriteResult(postWrite(resultRDD, instantTime, table), result.getPartitionToReplaceFileIds());
  }

  /**
   * Removes all existing records of the Hoodie table and inserts the given HoodieRecords into the table.
   *
   * @param records     HoodieRecords to insert
   * @param instantTime Instant time of the commit
   * @return HoodieWriteResult carrying the RDD of WriteStatus (to inspect errors and counts) plus the replaced file IDs
   */
  public HoodieWriteResult insertOverwriteTable(JavaRDD<HoodieRecord<T>> records, final String instantTime) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table = initTable(WriteOperationType.INSERT_OVERWRITE_TABLE, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE_TABLE, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.insertOverwriteTable(context, instantTime, HoodieJavaRDD.of(records));
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return new HoodieWriteResult(postWrite(resultRDD, instantTime, table), result.getPartitionToReplaceFileIds());
  }
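
  /*
   * A replace-commit sketch for the two insert_overwrite variants above: unlike plain
   * writes, committing must carry the replaced file IDs alongside the write statuses
   * (startCommit(actionType) is assumed available for requesting a replace instant).
   *
   *   String instantTime = client.startCommit(HoodieTimeline.REPLACE_COMMIT_ACTION);
   *   HoodieWriteResult result = client.insertOverwrite(records, instantTime);
   *   client.commit(instantTime, result.getWriteStatuses(), Option.empty(),
   *       HoodieTimeline.REPLACE_COMMIT_ACTION, result.getPartitionToReplaceFileIds());
   */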

  @Override
  public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, String instantTime) {
    return bulkInsert(records, instantTime, Option.empty());
  }

  @Override
  public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, String instantTime, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table =
        initTable(WriteOperationType.BULK_INSERT, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.BULK_INSERT, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.bulkInsert(context, instantTime, HoodieJavaRDD.of(records), userDefinedBulkInsertPartitioner);
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return postWrite(resultRDD, instantTime, table);
  }
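
  /*
   * Bulk insert skips index lookups and small-file sizing and writes the (optionally
   * sorted) input straight to new files, so it is the cheapest way to load an empty
   * table; a sketch with the default partitioner:
   *
   *   String instantTime = client.startCommit();
   *   JavaRDD<WriteStatus> statuses = client.bulkInsert(records, instantTime, Option.empty());
   *   client.commit(instantTime, statuses);
   */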

  @Override
  public JavaRDD<WriteStatus> bulkInsertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords, String instantTime, Option<BulkInsertPartitioner> bulkInsertPartitioner) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table =
        initTable(WriteOperationType.BULK_INSERT_PREPPED, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.BULK_INSERT_PREPPED, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.bulkInsertPrepped(context, instantTime, HoodieJavaRDD.of(preppedRecords), bulkInsertPartitioner);
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return postWrite(resultRDD, instantTime, table);
  }

  @Override
  public JavaRDD<WriteStatus> delete(JavaRDD<HoodieKey> keys, String instantTime) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table = initTable(WriteOperationType.DELETE, Option.ofNullable(instantTime));
    maybeDisableWriteRecordPositions(table.getMetaClient());
    preWrite(instantTime, WriteOperationType.DELETE, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.delete(context, instantTime, HoodieJavaRDD.of(keys));
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return postWrite(resultRDD, instantTime, table);
  }

  @Override
  public JavaRDD<WriteStatus> deletePrepped(JavaRDD<HoodieRecord<T>> preppedRecord, String instantTime) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table = initTable(WriteOperationType.DELETE_PREPPED, Option.ofNullable(instantTime));
    maybeDisableWriteRecordPositions(table.getMetaClient());
    preWrite(instantTime, WriteOperationType.DELETE_PREPPED, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.deletePrepped(context, instantTime, HoodieJavaRDD.of(preppedRecord));
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return postWrite(resultRDD, instantTime, table);
  }

  public HoodieWriteResult deletePartitions(List<String> partitions, String instantTime) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table = initTable(WriteOperationType.DELETE_PARTITION, Option.ofNullable(instantTime));
    preWrite(instantTime, WriteOperationType.DELETE_PARTITION, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.deletePartitions(context, instantTime, partitions);
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return new HoodieWriteResult(postWrite(resultRDD, instantTime, table), result.getPartitionToReplaceFileIds());
  }
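
  /*
   * deletePartitions also ends in a replace commit: the file groups of the listed
   * partitions are replaced with nothing. A sketch with hypothetical partition paths:
   *
   *   String instantTime = client.startCommit(HoodieTimeline.REPLACE_COMMIT_ACTION);
   *   HoodieWriteResult result = client.deletePartitions(
   *       Arrays.asList("2023/01/01", "2023/01/02"), instantTime);
   *   client.commit(instantTime, result.getWriteStatuses(), Option.empty(),
   *       HoodieTimeline.REPLACE_COMMIT_ACTION, result.getPartitionToReplaceFileIds());
   */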

  public HoodieWriteResult managePartitionTTL(String instantTime) {
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table = initTable(WriteOperationType.DELETE_PARTITION, Option.ofNullable(instantTime));
    preWrite(instantTime, WriteOperationType.DELETE_PARTITION, table.getMetaClient());
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = table.managePartitionTTL(context, instantTime);
    HoodieWriteMetadata<JavaRDD<WriteStatus>> resultRDD = result.clone(HoodieJavaRDD.getJavaRDD(result.getWriteStatuses()));
    return new HoodieWriteResult(postWrite(resultRDD, instantTime, table), result.getPartitionToReplaceFileIds());
  }

  @Override
  protected void initMetadataTable(Option<String> instantTime, HoodieTableMetaClient metaClient) {
    // Initialize Metadata Table to make sure it's bootstrapped _before_ the operation,
    // if it didn't exist before
    // See https://issues.apache.org/jira/browse/HUDI-3343 for more details
    initializeMetadataTable(instantTime, metaClient);
  }

  /**
   * Initialize the metadata table if needed. Creating the metadata table writer
   * will trigger the initial bootstrapping from the data table.
   *
   * @param inFlightInstantTimestamp - Timestamp of the in-flight action on whose behalf the metadata table is initialized
   */
  private void initializeMetadataTable(Option<String> inFlightInstantTimestamp, HoodieTableMetaClient metaClient) {
    if (!config.isMetadataTableEnabled()) {
      return;
    }
    // if metadata table is enabled, emit enablement metrics
    HoodieTableConfig tableConfig = metaClient.getTableConfig();
    if (tableConfig.isMetadataTableAvailable()) {
      // if metadata table is available, let's emit partitions of interest
      boolean isMetadataColStatsAvailable = false;
      boolean isMetadataBloomFilterAvailable = false;
      boolean isMetadataRliAvailable = false;
      if (tableConfig.getMetadataPartitions().contains(MetadataPartitionType.COLUMN_STATS.getPartitionPath())) {
        isMetadataColStatsAvailable = true;
      }
      if (tableConfig.getMetadataPartitions().contains(MetadataPartitionType.BLOOM_FILTERS.getPartitionPath())) {
        isMetadataBloomFilterAvailable = true;
      }
      if (tableConfig.getMetadataPartitions().contains(MetadataPartitionType.RECORD_INDEX.getPartitionPath())) {
        isMetadataRliAvailable = true;
      }
      metrics.emitMetadataEnablementMetrics(true, isMetadataColStatsAvailable, isMetadataBloomFilterAvailable, isMetadataRliAvailable);
    }

    try (HoodieTableMetadataWriter writer = SparkHoodieBackedTableMetadataWriter.create(
        context.getStorageConf(), config, context, inFlightInstantTimestamp)) {
      if (writer.isInitialized()) {
        writer.performTableServices(inFlightInstantTimestamp);
      }
    } catch (Exception e) {
      throw new HoodieException("Failed to instantiate Metadata table ", e);
    }
  }
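
  /*
   * The metadata table is controlled purely through write config; a sketch of enabling
   * it with the column-stats and record-index partitions probed above (builder method
   * names follow org.apache.hudi.common.config.HoodieMetadataConfig and should be
   * treated as assumptions against the release in use):
   *
   *   HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
   *       .withPath("/tmp/hudi_trips")  // hypothetical base path
   *       .forTable("trips")
   *       .withMetadataConfig(HoodieMetadataConfig.newBuilder()
   *           .enable(true)
   *           .withMetadataIndexColumnStats(true)
   *           .withEnableRecordIndex(true)
   *           .build())
   *       .build();
   */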

  @Override
  protected void initWrapperFSMetrics() {
    if (config.isMetricsOn()) {
      Registry registry;
      Registry registryMeta;
      JavaSparkContext jsc = ((HoodieSparkEngineContext) context).getJavaSparkContext();

      if (config.isExecutorMetricsEnabled()) {
        // Create a distributed registry for HoodieWrapperFileSystem
        registry = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName(),
            DistributedRegistry.class.getName());
        ((DistributedRegistry) registry).register(jsc);
        registryMeta = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName() + "MetaFolder",
            DistributedRegistry.class.getName());
        ((DistributedRegistry) registryMeta).register(jsc);
      } else {
        registry = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName());
        registryMeta = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName() + "MetaFolder");
      }

      HoodieWrapperFileSystem.setMetricsRegistry(registry, registryMeta);
    }
  }
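
  /*
   * These registries are only wired up when metrics are on; a sketch of enabling them
   * with the in-memory reporter (builder methods per
   * org.apache.hudi.config.metrics.HoodieMetricsConfig, named here on a best-effort basis):
   *
   *   HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
   *       .withPath("/tmp/hudi_trips")  // hypothetical base path
   *       .forTable("trips")
   *       .withMetricsConfig(HoodieMetricsConfig.newBuilder()
   *           .on(true)
   *           .withReporterType("INMEMORY")
   *           .build())
   *       .build();
   */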

  @Override
  protected void releaseResources(String instantTime) {
    SparkReleaseResources.releaseCachedData(context, config, basePath, instantTime);
  }
}