
org.apache.hudi.client.HoodieFlinkWriteClient

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.client;

import org.apache.hudi.client.common.HoodieFlinkEngineContext;
import org.apache.hudi.client.utils.TransactionUtils;
import org.apache.hudi.common.data.HoodieListData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieReplaceCommitMetadata;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.model.TableServiceType;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.HoodieTableVersion;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.index.FlinkHoodieIndexFactory;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.io.FlinkWriteHandleFactory;
import org.apache.hudi.io.HoodieWriteHandle;
import org.apache.hudi.io.MiniBatchHandle;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.HoodieFlinkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.upgrade.FlinkUpgradeDowngradeHelper;
import org.apache.hudi.table.upgrade.UpgradeDowngrade;
import org.apache.hudi.util.WriteStatMerger;

import com.codahale.metrics.Timer;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;

/**
 * Flink hoodie write client.
 *
 * <p>The client is used both on driver (for starting/committing transactions)
 * and executor (for writing dataset).
 *
 * @param <T> type of the payload
 */
@SuppressWarnings("checkstyle:LineLength")
public class HoodieFlinkWriteClient<T> extends
    BaseHoodieWriteClient<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> {

  private static final Logger LOG = LoggerFactory.getLogger(HoodieFlinkWriteClient.class);

  /**
   * FileID to write handle mapping in order to record the write handles for each file group,
   * so that we can append the mini-batch data buffer incrementally.
   */
  private final Map<String, Path> bucketToHandles;

  public HoodieFlinkWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig) {
    super(context, writeConfig, FlinkUpgradeDowngradeHelper.getInstance());
    this.bucketToHandles = new HashMap<>();
    this.tableServiceClient = new HoodieFlinkTableServiceClient<>(context, writeConfig, getTimelineServer());
  }

  /**
   * Complete changes performed at the given instantTime marker with specified action.
   */
  @Override
  protected HoodieIndex<?, ?> createIndex(HoodieWriteConfig writeConfig) {
    return FlinkHoodieIndexFactory.createIndex((HoodieFlinkEngineContext) context, config);
  }

  @Override
  public boolean commit(String instantTime,
                        List<WriteStatus> writeStatuses,
                        Option<Map<String, String>> extraMetadata,
                        String commitActionType,
                        Map<String, List<String>> partitionToReplacedFileIds,
                        Option<BiConsumer<HoodieTableMetaClient, HoodieCommitMetadata>> extraPreCommitFunc) {
    List<HoodieWriteStat> writeStats = writeStatuses.parallelStream().map(WriteStatus::getStat).collect(Collectors.toList());
    // for eager flush, multiple write stats may share one file path.
    List<HoodieWriteStat> merged = writeStats.stream()
        .collect(Collectors.groupingBy(writeStat -> writeStat.getPartitionPath() + writeStat.getPath()))
        .values().stream()
        .map(duplicates -> duplicates.stream().reduce(WriteStatMerger::merge).get())
        .collect(Collectors.toList());
    return commitStats(instantTime, merged, extraMetadata, commitActionType, partitionToReplacedFileIds, extraPreCommitFunc);
  }

  @Override
  protected HoodieTable createTable(HoodieWriteConfig config) {
    return createTableAndValidate(config, HoodieFlinkTable::create);
  }

  @Override
  protected HoodieTable createTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
    return createTableAndValidate(config, metaClient, HoodieFlinkTable::create);
  }

  @Override
  public List<HoodieRecord<T>> filterExists(List<HoodieRecord<T>> hoodieRecords) {
    // Create a Hoodie table which encapsulates the commits and files visible
    HoodieFlinkTable<T> table = getHoodieTable();
    Timer.Context indexTimer = metrics.getIndexCtx();
    List<HoodieRecord<T>> recordsWithLocation = getIndex().tagLocation(HoodieListData.eager(hoodieRecords), context, table).collectAsList();
    metrics.updateIndexMetrics(LOOKUP_STR, metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop()));
    return recordsWithLocation.stream().filter(v1 -> !v1.isCurrentLocationKnown()).collect(Collectors.toList());
  }

  @Override
  public void bootstrap(Option<Map<String, String>> extraMetadata) {
    throw new HoodieNotSupportedException("Bootstrap operation is not supported yet");
  }

  @Override
  public List<WriteStatus> upsert(List<HoodieRecord<T>> records, String instantTime) {
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
        initTable(WriteOperationType.UPSERT, Option.ofNullable(instantTime));
    table.validateUpsertSchema();
    preWrite(instantTime, WriteOperationType.UPSERT, table.getMetaClient());
    HoodieWriteMetadata<List<WriteStatus>> result;
    try (AutoCloseableWriteHandle closeableHandle = new AutoCloseableWriteHandle(records, instantTime, table)) {
      result = ((HoodieFlinkTable<T>) table).upsert(context, closeableHandle.getWriteHandle(), instantTime, records);
    }
    if (result.getIndexLookupDuration().isPresent()) {
      metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis());
    }
    return postWrite(result, instantTime, table);
  }

  @Override
  public List<WriteStatus> upsertPreppedRecords(List<HoodieRecord<T>> preppedRecords, String instantTime) {
    // only used for metadata table, the upsert happens in a single thread
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
        initTable(WriteOperationType.UPSERT, Option.ofNullable(instantTime));
    table.validateUpsertSchema();
    preWrite(instantTime, WriteOperationType.UPSERT_PREPPED, table.getMetaClient());
    Map<String, List<HoodieRecord<T>>> preppedRecordsByFileId = preppedRecords.stream().parallel()
        .collect(Collectors.groupingBy(r -> r.getCurrentLocation().getFileId()));
    return preppedRecordsByFileId.values().stream().parallel().map(records -> {
      HoodieWriteMetadata<List<WriteStatus>> result;
      try (AutoCloseableWriteHandle closeableHandle = new AutoCloseableWriteHandle(records, instantTime, table)) {
        result = ((HoodieFlinkTable<T>) table).upsertPrepped(context, closeableHandle.getWriteHandle(), instantTime, records);
      }
      return postWrite(result, instantTime, table);
    }).flatMap(Collection::stream).collect(Collectors.toList());
  }

  @Override
  public List<WriteStatus> insert(List<HoodieRecord<T>> records, String instantTime) {
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
        initTable(WriteOperationType.INSERT, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.INSERT, table.getMetaClient());
    // create the write handle if it does not exist
    HoodieWriteMetadata<List<WriteStatus>> result;
    try (AutoCloseableWriteHandle closeableHandle = new AutoCloseableWriteHandle(records, instantTime, table)) {
      result = ((HoodieFlinkTable<T>) table).insert(context, closeableHandle.getWriteHandle(), instantTime, records);
    }
    if (result.getIndexLookupDuration().isPresent()) {
      metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis());
    }
    return postWrite(result, instantTime, table);
  }

  /**
   * Removes all existing records from the partitions affected and inserts the given HoodieRecords into the table.
   *
   * @param records     HoodieRecords to insert
   * @param instantTime Instant time of the commit
   * @return list of WriteStatus to inspect errors and counts
   */
  public List<WriteStatus> insertOverwrite(
      List<HoodieRecord<T>> records, final String instantTime) {
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
        initTable(WriteOperationType.INSERT_OVERWRITE, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE, table.getMetaClient());
    // create the write handle if it does not exist
    HoodieWriteMetadata<List<WriteStatus>> result;
    try (AutoCloseableWriteHandle closeableHandle = new AutoCloseableWriteHandle(records, instantTime, table, true)) {
      result = ((HoodieFlinkTable<T>) table).insertOverwrite(context, closeableHandle.getWriteHandle(), instantTime, records);
    }
    return postWrite(result, instantTime, table);
  }

  /**
   * Removes all existing records of the Hoodie table and inserts the given HoodieRecords into the table.
   *
   * @param records     HoodieRecords to insert
   * @param instantTime Instant time of the commit
   * @return list of WriteStatus to inspect errors and counts
   */
  public List<WriteStatus> insertOverwriteTable(
      List<HoodieRecord<T>> records, final String instantTime) {
    HoodieTable table = initTable(WriteOperationType.INSERT_OVERWRITE_TABLE, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.INSERT_OVERWRITE_TABLE, table.getMetaClient());
    // create the write handle if it does not exist
    HoodieWriteMetadata<List<WriteStatus>> result;
    try (AutoCloseableWriteHandle closeableHandle = new AutoCloseableWriteHandle(records, instantTime, table, true)) {
      result = ((HoodieFlinkTable<T>) table).insertOverwriteTable(context, closeableHandle.getWriteHandle(), instantTime, records);
    }
    return postWrite(result, instantTime, table);
  }

  @Override
  public List<WriteStatus> insertPreppedRecords(List<HoodieRecord<T>> preppedRecords, String instantTime) {
    throw new HoodieNotSupportedException("InsertPrepped operation is not supported yet");
  }

  @Override
  public List<WriteStatus> bulkInsert(List<HoodieRecord<T>> records, String instantTime) {
    throw new HoodieNotSupportedException("BulkInsert operation is not supported yet");
  }

  @Override
  public List<WriteStatus> bulkInsert(List<HoodieRecord<T>> records, String instantTime, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
    throw new HoodieNotSupportedException("BulkInsert operation is not supported yet");
  }

  @Override
  public List<WriteStatus> bulkInsertPreppedRecords(List<HoodieRecord<T>> preppedRecords, String instantTime, Option<BulkInsertPartitioner> bulkInsertPartitioner) {
    // only used for metadata table, the bulk_insert happens in a single JVM process
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
        initTable(WriteOperationType.BULK_INSERT_PREPPED, Option.ofNullable(instantTime));
    table.validateInsertSchema();
    preWrite(instantTime, WriteOperationType.BULK_INSERT_PREPPED, table.getMetaClient());
    Map<String, List<HoodieRecord<T>>> preppedRecordsByFileId = preppedRecords.stream().parallel()
        .collect(Collectors.groupingBy(r -> r.getCurrentLocation().getFileId()));
    return preppedRecordsByFileId.values().stream().parallel().map(records -> {
      records.sort(Comparator.comparing(HoodieRecord::getRecordKey));
      HoodieWriteMetadata<List<WriteStatus>> result;
      records.get(0).getCurrentLocation().setInstantTime("I");
      try (AutoCloseableWriteHandle closeableHandle = new AutoCloseableWriteHandle(records, instantTime, table, true)) {
        result = ((HoodieFlinkTable<T>) table).bulkInsertPrepped(context, closeableHandle.getWriteHandle(), instantTime, records);
      }
      return postWrite(result, instantTime, table);
    }).flatMap(Collection::stream).collect(Collectors.toList());
  }

  @Override
  public List<WriteStatus> delete(List<HoodieKey> keys, String instantTime) {
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
        initTable(WriteOperationType.DELETE, Option.ofNullable(instantTime));
    preWrite(instantTime, WriteOperationType.DELETE, table.getMetaClient());
    HoodieWriteMetadata<List<WriteStatus>> result = table.delete(context, instantTime, keys);
    return postWrite(result, instantTime, table);
  }

  @Override
  public List<WriteStatus> deletePrepped(List<HoodieRecord<T>> preppedRecords, final String instantTime) {
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
        initTable(WriteOperationType.DELETE_PREPPED, Option.ofNullable(instantTime));
    preWrite(instantTime, WriteOperationType.DELETE_PREPPED, table.getMetaClient());
    HoodieWriteMetadata<List<WriteStatus>> result = table.deletePrepped(context, instantTime, preppedRecords);
    return postWrite(result, instantTime, table);
  }

  public List<WriteStatus> deletePartitions(List<String> partitions, String instantTime) {
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
        initTable(WriteOperationType.DELETE_PARTITION, Option.ofNullable(instantTime));
    preWrite(instantTime, WriteOperationType.DELETE_PARTITION, table.getMetaClient());
    HoodieWriteMetadata<List<WriteStatus>> result = table.deletePartitions(context, instantTime, partitions);
    return postWrite(result, instantTime, table);
  }

  @Override
  public void preWrite(String instantTime, WriteOperationType writeOperationType, HoodieTableMetaClient metaClient) {
    setOperationType(writeOperationType);
    // Note: the code to read the commit metadata is not thread safe for JSON deserialization,
    // remove the table metadata sync

    // remove the async cleaning
  }

  /**
   * Refreshes the last transaction metadata;
   * should be called before the driver starts a new transaction.
   */
  public void preTxn(WriteOperationType operationType, HoodieTableMetaClient metaClient) {
    if (txnManager.isLockRequired() && config.needResolveWriteConflict(operationType)) {
      // refresh the meta client which is reused
      metaClient.reloadActiveTimeline();
      this.lastCompletedTxnAndMetadata = TransactionUtils.getLastCompletedTxnInstantAndMetadata(metaClient);
      this.pendingInflightAndRequestedInstants = TransactionUtils.getInflightAndRequestedInstants(metaClient);
    }
    tableServiceClient.startAsyncArchiveService(this);
  }

  /**
   * Initializes the metadata table on start up; should only be called once on the driver.
   */
  public void initMetadataTable() {
    ((HoodieFlinkTableServiceClient<T>) tableServiceClient).initMetadataTable();
  }

  /**
   * Starts the async cleaning service for finished commits.
   *
   * <p>The Flink write client is designed to write the data set as buckets,
   * but the cleaning action should trigger after all the write actions within a
   * checkpoint finish.
   */
  public void startAsyncCleaning() {
    tableServiceClient.startAsyncCleanerService(this);
  }

  /**
   * Blocks and waits for the async cleaning service to finish.
   *
   * <p>The Flink write client is designed to write the data set as buckets,
   * but the cleaning action should trigger after all the write actions within a
   * checkpoint finish.
   */
  public void waitForCleaningFinish() {
    if (tableServiceClient.asyncCleanerService != null) {
      LOG.info("Cleaner has been spawned already. Waiting for it to finish");
      tableServiceClient.asyncClean();
      LOG.info("Cleaner has finished");
    }
  }

  @Override
  public List<WriteStatus> postWrite(HoodieWriteMetadata<List<WriteStatus>> result, String instantTime, HoodieTable hoodieTable) {
    if (result.getIndexLookupDuration().isPresent()) {
      metrics.updateIndexMetrics(getOperationType().name(), result.getIndexUpdateDuration().get().toMillis());
    }
    return result.getWriteStatuses();
  }

  @Override
  protected void mayBeCleanAndArchive(HoodieTable table) {
    autoArchiveOnCommit(table);
  }

  @Override
  protected HoodieWriteMetadata<List<WriteStatus>> compact(String compactionInstantTime, boolean shouldComplete) {
    // only used for metadata table, the compaction happens in a single thread
    return tableServiceClient.compact(compactionInstantTime, shouldComplete);
  }

  @Override
  public HoodieWriteMetadata<List<WriteStatus>> cluster(final String clusteringInstant, final boolean shouldComplete) {
    throw new HoodieNotSupportedException("Clustering is not supported yet");
  }

  private void completeClustering(
      HoodieReplaceCommitMetadata metadata,
      HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
      String clusteringCommitTime) {
    ((HoodieFlinkTableServiceClient<T>) tableServiceClient).completeClustering(metadata, table, clusteringCommitTime);
  }

  @Override
  protected void doInitTable(WriteOperationType operationType, HoodieTableMetaClient metaClient, Option<String> instantTime) {
    // do nothing.

    // flink executes the upgrade/downgrade once when initializing the first instant on start up,
    // no need to execute the upgrade/downgrade on each write in streaming.

    // flink performs metadata table bootstrap on the coordinator when it starts up.
  }

  public void completeTableService(
      TableServiceType tableServiceType,
      HoodieCommitMetadata metadata,
      HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
      String commitInstant) {
    switch (tableServiceType) {
      case CLUSTER:
        completeClustering((HoodieReplaceCommitMetadata) metadata, table, commitInstant);
        break;
      case COMPACT:
        completeCompaction(metadata, table, commitInstant);
        break;
      default:
        throw new IllegalArgumentException("This table service is not valid " + tableServiceType);
    }
  }

  /**
   * Upgrade/downgrade the Hoodie table.
   *
   * <p>This action should only be executed once for each commit.
   * The modification of the table properties is not thread safe.
   */
  public void upgradeDowngrade(String instantTime, HoodieTableMetaClient metaClient) {
    new UpgradeDowngrade(metaClient, config, context, FlinkUpgradeDowngradeHelper.getInstance())
        .run(HoodieTableVersion.current(), instantTime);
  }

  /**
   * Clean the write handles within a checkpoint interval.
   * All the handles should have been closed already.
   */
  public void cleanHandles() {
    this.bucketToHandles.clear();
  }

  @Override
  public void close() {
    super.close();
    cleanHandles();
  }

  /**
   * Get or create a new write handle in order to reuse the file handles.
   *
   * @param record      The first record in the bucket
   * @param config      Write config
   * @param instantTime The instant time
   * @param table       The table
   * @param recordItr   Record iterator
   * @param overwrite   Whether this is an overwrite operation
   * @return The existing write handle or a newly created one
   */
  private HoodieWriteHandle<?, ?, ?, ?> getOrCreateWriteHandle(
      HoodieRecord<T> record,
      HoodieWriteConfig config,
      String instantTime,
      HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
      Iterator<HoodieRecord<T>> recordItr,
      boolean overwrite) {
    // caution: it's not a good practice to modify the handles internally.
    FlinkWriteHandleFactory.Factory<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> writeHandleFactory =
        FlinkWriteHandleFactory.getFactory(table.getMetaClient().getTableConfig(), config, overwrite);
    return writeHandleFactory.create(this.bucketToHandles, record, config, instantTime, table, recordItr);
  }

  public HoodieFlinkTable<T> getHoodieTable() {
    return HoodieFlinkTable.create(config, (HoodieFlinkEngineContext) context);
  }

  public Map<String, List<String>> getPartitionToReplacedFileIds(
      WriteOperationType writeOperationType,
      List<WriteStatus> writeStatuses) {
    HoodieFlinkTable<T> table = getHoodieTable();
    switch (writeOperationType) {
      case INSERT_OVERWRITE:
        return writeStatuses.stream().map(status -> status.getStat().getPartitionPath()).distinct()
            .collect(
                Collectors.toMap(
                    partition -> partition,
                    partitionPath -> getAllExistingFileIds(table, partitionPath)));
      case INSERT_OVERWRITE_TABLE:
        Map<String, List<String>> partitionToExistingFileIds = new HashMap<>();
        List<String> partitionPaths =
            FSUtils.getAllPartitionPaths(context, table.getStorage(), config.getMetadataConfig(), table.getMetaClient().getBasePath());
        if (partitionPaths != null && partitionPaths.size() > 0) {
          context.setJobStatus(this.getClass().getSimpleName(), "Getting ExistingFileIds of all partitions: " + config.getTableName());
          partitionToExistingFileIds = partitionPaths.stream().parallel()
              .collect(
                  Collectors.toMap(
                      partition -> partition,
                      partition -> getAllExistingFileIds(table, partition)));
        }
        return partitionToExistingFileIds;
      default:
        throw new AssertionError();
    }
  }

  private List<String> getAllExistingFileIds(HoodieFlinkTable<T> table, String partitionPath) {
    // because the new commit is not complete, it is safe to mark all existing file IDs as old files
    return table.getSliceView().getLatestFileSlices(partitionPath).map(FileSlice::getFileId).distinct().collect(Collectors.toList());
  }

  private final class AutoCloseableWriteHandle implements AutoCloseable {
    private final HoodieWriteHandle<?, ?, ?, ?> writeHandle;

    AutoCloseableWriteHandle(
        List<HoodieRecord<T>> records,
        String instantTime,
        HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table) {
      this(records, instantTime, table, false);
    }

    AutoCloseableWriteHandle(
        List<HoodieRecord<T>> records,
        String instantTime,
        HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
        boolean overwrite) {
      this.writeHandle = getOrCreateWriteHandle(records.get(0), getConfig(), instantTime, table, records.listIterator(), overwrite);
    }

    HoodieWriteHandle<?, ?, ?, ?> getWriteHandle() {
      return writeHandle;
    }

    @Override
    public void close() {
      ((MiniBatchHandle) writeHandle).closeGracefully();
    }
  }
}
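
For orientation, here is a minimal driver-side sketch of how the client could be exercised; it is not part of the file above. The table path, table name, payload type (HoodieAvroPayload) and the standalone main class are illustrative assumptions, and in a real Flink job the client is created and coordinated by Hudi's Flink write operators rather than driven by hand.

import java.util.Collections;
import java.util.List;

import org.apache.hudi.client.HoodieFlinkWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieFlinkEngineContext;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.config.HoodieWriteConfig;

public class FlinkWriteClientSketch {
  public static void main(String[] args) {
    // Write config pointing at an existing Hudi table; path and name are placeholders.
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath("file:///tmp/hudi_example_table")
        .forTable("hudi_example_table")
        .build();

    // Engine context for the Flink client; DEFAULT is assumed here as a ready-made local context.
    HoodieFlinkEngineContext context = HoodieFlinkEngineContext.DEFAULT;

    HoodieFlinkWriteClient<HoodieAvroPayload> client = new HoodieFlinkWriteClient<>(context, writeConfig);
    try {
      // Records would normally come from the Flink pipeline; left empty here as a placeholder.
      List<HoodieRecord<HoodieAvroPayload>> records = Collections.emptyList();
      if (!records.isEmpty()) {
        String instantTime = client.startCommit();             // driver starts the transaction
        List<WriteStatus> statuses = client.upsert(records, instantTime);
        client.commit(instantTime, statuses);                  // driver completes the transaction
      }
    } finally {
      client.close();                                          // also clears the pooled write handles
    }
  }
}

The explicit close() mirrors the override above: besides releasing the base client resources, it calls cleanHandles(), which drops the per-file-group write handles pooled in bucketToHandles.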




