All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.table.HoodieSparkCopyOnWriteTable Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.table;

import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.avro.model.HoodieClusteringPlan;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.avro.model.HoodieIndexCommitMetadata;
import org.apache.hudi.avro.model.HoodieIndexPlan;
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
import org.apache.hudi.avro.model.HoodieRestorePlan;
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
import org.apache.hudi.avro.model.HoodieRollbackPlan;
import org.apache.hudi.avro.model.HoodieSavepointMetadata;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieReaderContext;
import org.apache.hudi.common.model.CompactionOperation;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieMetadataException;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.io.HoodieCreateHandle;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.io.HoodieMergeHandleFactory;
import org.apache.hudi.io.HoodieSparkFileGroupReaderBasedMergeHandle;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.metadata.MetadataPartitionType;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
import org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor;
import org.apache.hudi.table.action.clean.CleanActionExecutor;
import org.apache.hudi.table.action.clean.CleanPlanActionExecutor;
import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor;
import org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkDeleteCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkDeletePartitionCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkDeletePreppedCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkInsertCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkInsertOverwriteCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkInsertOverwriteTableCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkInsertPreppedCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkPartitionTTLActionExecutor;
import org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor;
import org.apache.hudi.table.action.commit.SparkUpsertPreppedCommitActionExecutor;
import org.apache.hudi.table.action.index.RunIndexActionExecutor;
import org.apache.hudi.table.action.index.ScheduleIndexActionExecutor;
import org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor;
import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor;
import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor;
import org.apache.hudi.table.action.rollback.RestorePlanActionExecutor;
import org.apache.hudi.table.action.savepoint.SavepointActionExecutor;

import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import static org.apache.hudi.metadata.HoodieTableMetadataUtil.deleteMetadataTable;

/**
 * Implementation of a very heavily read-optimized Hoodie Table where, all data is stored in base files, with
 * zero read amplification.
 * 

* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing file, to expand it *

* UPDATES - Produce a new version of the file, just replacing the updated records with new values */ public class HoodieSparkCopyOnWriteTable extends HoodieSparkTable implements HoodieCompactionHandler { private static final Logger LOG = LoggerFactory.getLogger(HoodieSparkCopyOnWriteTable.class); public HoodieSparkCopyOnWriteTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) { super(config, context, metaClient); } @Override public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, HoodieData> records) { return new SparkUpsertCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute(); } @Override public HoodieWriteMetadata> insert(HoodieEngineContext context, String instantTime, HoodieData> records) { return new SparkInsertCommitActionExecutor<>((HoodieSparkEngineContext)context, config, this, instantTime, records).execute(); } @Override public HoodieWriteMetadata> bulkInsert(HoodieEngineContext context, String instantTime, HoodieData> records, Option userDefinedBulkInsertPartitioner) { return new SparkBulkInsertCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records, userDefinedBulkInsertPartitioner).execute(); } @Override public HoodieWriteMetadata> delete(HoodieEngineContext context, String instantTime, HoodieData keys) { return new SparkDeleteCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, keys).execute(); } @Override public HoodieWriteMetadata> deletePrepped(HoodieEngineContext context, String instantTime, HoodieData> preppedRecords) { return new SparkDeletePreppedCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute(); } @Override public HoodieWriteMetadata> deletePartitions(HoodieEngineContext context, String instantTime, List partitions) { return new SparkDeletePartitionCommitActionExecutor<>(context, config, this, instantTime, partitions).execute(); } @Override public HoodieWriteMetadata> upsertPrepped(HoodieEngineContext context, String instantTime, HoodieData> preppedRecords) { return new SparkUpsertPreppedCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute(); } @Override public HoodieWriteMetadata> insertPrepped(HoodieEngineContext context, String instantTime, HoodieData> preppedRecords) { return new SparkInsertPreppedCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute(); } @Override public HoodieWriteMetadata> bulkInsertPrepped(HoodieEngineContext context, String instantTime, HoodieData> preppedRecords, Option userDefinedBulkInsertPartitioner) { return new SparkBulkInsertPreppedCommitActionExecutor((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords, userDefinedBulkInsertPartitioner).execute(); } @Override public HoodieWriteMetadata insertOverwrite(HoodieEngineContext context, String instantTime, HoodieData> records) { return new SparkInsertOverwriteCommitActionExecutor(context, config, this, instantTime, records).execute(); } @Override public HoodieWriteMetadata> insertOverwriteTable(HoodieEngineContext context, String instantTime, HoodieData> records) { return new SparkInsertOverwriteTableCommitActionExecutor(context, config, this, instantTime, records).execute(); } @Override public Option scheduleCompaction(HoodieEngineContext context, String instantTime, Option> extraMetadata) { throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table"); } @Override public HoodieWriteMetadata> compact( HoodieEngineContext context, String compactionInstantTime) { throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table"); } @Override public Option scheduleClustering(HoodieEngineContext context, String instantTime, Option> extraMetadata) { return new ClusteringPlanActionExecutor<>(context, config,this, instantTime, extraMetadata).execute(); } @Override public HoodieWriteMetadata> cluster(HoodieEngineContext context, String clusteringInstantTime) { return new SparkExecuteClusteringCommitActionExecutor<>(context, config, this, clusteringInstantTime).execute(); } @Override public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngineContext context, Option> extraMetadata) { return new SparkBootstrapCommitActionExecutor((HoodieSparkEngineContext) context, config, this, extraMetadata).execute(); } @Override public void rollbackBootstrap(HoodieEngineContext context, String instantTime) { // Delete metadata table to rollback a failed bootstrap. re-attempt of bootstrap will re-initialize the mdt. try { LOG.info("Deleting metadata table because we are rolling back failed bootstrap. "); deleteMetadataTable(config.getBasePath(), context); } catch (HoodieMetadataException e) { throw new HoodieException("Failed to delete metadata table.", e); } new RestorePlanActionExecutor<>(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); new CopyOnWriteRestoreActionExecutor<>(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); } @Override public Option scheduleCleaning(HoodieEngineContext context, String instantTime, Option> extraMetadata) { return new CleanPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute(); } @Override public Option scheduleRollback(HoodieEngineContext context, String instantTime, HoodieInstant instantToRollback, boolean skipTimelinePublish, boolean shouldRollbackUsingMarkers, boolean isRestore) { return new BaseRollbackPlanActionExecutor<>(context, config, this, instantTime, instantToRollback, skipTimelinePublish, shouldRollbackUsingMarkers, isRestore).execute(); } /** * Delete expired partition by config * @param context HoodieEngineContext * @param instantTime Instant Time for the action * @return HoodieWriteMetadata */ public HoodieWriteMetadata> managePartitionTTL(HoodieEngineContext context, String instantTime) { return new SparkPartitionTTLActionExecutor<>(context, config, this, instantTime).execute(); } @Override public Iterator> handleUpdate( String instantTime, String partitionPath, String fileId, Map> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException { // these are updates HoodieMergeHandle upsertHandle = getUpdateHandle(instantTime, partitionPath, fileId, keyToNewRecords, oldDataFile); return handleUpdateInternal(upsertHandle, instantTime, fileId); } protected Iterator> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime, String fileId) throws IOException { runMerge(upsertHandle, instantTime, fileId); return upsertHandle.getWriteStatusesAsIterator(); } protected HoodieMergeHandle getUpdateHandle(String instantTime, String partitionPath, String fileId, Map> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) { Option keyGeneratorOpt = HoodieSparkKeyGeneratorFactory.createBaseKeyGenerator(config); return HoodieMergeHandleFactory.create(config, instantTime, this, keyToNewRecords, partitionPath, fileId, dataFileToBeMerged, taskContextSupplier, keyGeneratorOpt); } @Override public Iterator> handleInsert( String instantTime, String partitionPath, String fileId, Map> recordMap) { HoodieCreateHandle createHandle = new HoodieCreateHandle(config, instantTime, this, partitionPath, fileId, recordMap, taskContextSupplier); createHandle.write(); return Collections.singletonList(createHandle.close()).iterator(); } @Override public List compactUsingFileGroupReader(String instantTime, CompactionOperation operation, HoodieReaderContext readerContext, Configuration conf) { Option keyGeneratorOpt = HoodieSparkKeyGeneratorFactory.createBaseKeyGenerator(config); HoodieSparkFileGroupReaderBasedMergeHandle mergeHandle = new HoodieSparkFileGroupReaderBasedMergeHandle(config, instantTime, this, operation, taskContextSupplier, keyGeneratorOpt, readerContext, conf); mergeHandle.write(); return mergeHandle.close(); } @Override public HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime) { return new CleanActionExecutor<>(context, config, this, cleanInstantTime, false).execute(); } @Override public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, boolean deleteInstants, boolean skipLocking) { return new CopyOnWriteRollbackActionExecutor<>(context, config, this, rollbackInstantTime, commitInstant, deleteInstants, skipLocking).execute(); } @Override public Option scheduleIndexing(HoodieEngineContext context, String indexInstantTime, List partitionsToIndex, List partitionPaths) { return new ScheduleIndexActionExecutor<>(context, config, this, indexInstantTime, partitionsToIndex, partitionPaths).execute(); } @Override public Option index(HoodieEngineContext context, String indexInstantTime) { return new RunIndexActionExecutor<>(context, config, this, indexInstantTime).execute(); } @Override public HoodieSavepointMetadata savepoint(HoodieEngineContext context, String instantToSavepoint, String user, String comment) { return new SavepointActionExecutor<>(context, config, this, instantToSavepoint, user, comment).execute(); } @Override public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTimestamp, String savepointToRestoreTimestamp) { return new CopyOnWriteRestoreActionExecutor<>(context, config, this, restoreInstantTimestamp, savepointToRestoreTimestamp).execute(); } @Override public Option scheduleRestore(HoodieEngineContext context, String restoreInstantTimestamp, String savepointToRestoreTimestamp) { return new RestorePlanActionExecutor<>(context, config, this, restoreInstantTimestamp, savepointToRestoreTimestamp).execute(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy