/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.table.action.commit;

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerFactory;
import org.apache.hudi.execution.bulkinsert.BulkInsertMapFunction;
import org.apache.hudi.io.CreateHandleFactory;
import org.apache.hudi.io.WriteHandleFactory;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;

import org.apache.spark.api.java.JavaRDD;

import java.util.List;

/**
* A spark implementation of {@link BaseBulkInsertHelper}.
*
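* <p>A minimal usage sketch (illustrative only; {@code inputRecords}, {@code instantTime},
* {@code table}, {@code config}, and {@code executor} are assumed to already exist in the
* caller's context):
* <pre>{@code
* HoodieWriteMetadata<HoodieData<WriteStatus>> result =
*     SparkBulkInsertHelper.newInstance().bulkInsert(
*         inputRecords, instantTime, table, config, executor,
*         config.shouldCombineBeforeInsert(), Option.empty());
* }</pre>
*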
* @param <T> type of the record payload
* @param <R> type of the result returned by the commit action executor
*/
@SuppressWarnings("checkstyle:LineLength")
public class SparkBulkInsertHelper<T, R> extends BaseBulkInsertHelper<T, HoodieData<HoodieRecord<T>>,
    HoodieData<HoodieKey>, HoodieData<WriteStatus>, R> {

  private SparkBulkInsertHelper() {
    super(HoodieData::getNumPartitions);
  }

  private static class BulkInsertHelperHolder {
    private static final SparkBulkInsertHelper HOODIE_BULK_INSERT_HELPER = new SparkBulkInsertHelper<>();
  }

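  /**
   * Returns the shared singleton instance, lazily initialized via the holder idiom.
   */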
  public static SparkBulkInsertHelper newInstance() {
    return BulkInsertHelperHolder.HOODIE_BULK_INSERT_HELPER;
  }

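  /**
   * Bulk inserts the given records and drives the full write flow: transitions the requested
   * instant to inflight, repartitions the input with the user-supplied or default
   * {@link BulkInsertPartitioner}, writes new base files, and then lets the executor update the
   * index and commit if needed.
   */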
  @Override
  public HoodieWriteMetadata<HoodieData<WriteStatus>> bulkInsert(final HoodieData<HoodieRecord<T>> inputRecords,
                                                                 final String instantTime,
                                                                 final HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table,
                                                                 final HoodieWriteConfig config,
                                                                 final BaseCommitActionExecutor<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>, R> executor,
                                                                 final boolean performDedupe,
                                                                 final Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
    HoodieWriteMetadata result = new HoodieWriteMetadata();
    // Transition bulk_insert state to inflight
    table.getActiveTimeline().transitionRequestedToInflight(table.getInstantGenerator().createNewInstant(HoodieInstant.State.REQUESTED,
            executor.getCommitActionType(), instantTime), Option.empty(),
        config.shouldAllowMultiWriteOnSameInstant());
    // Use the user-provided partitioner if present; otherwise fall back to the factory default
    BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.orElseGet(() -> BulkInsertInternalPartitionerFactory.get(table, config));
    // Write new files
    HoodieData<WriteStatus> writeStatuses =
        bulkInsert(inputRecords, instantTime, table, config, performDedupe, partitioner, false,
            config.getBulkInsertShuffleParallelism(), new CreateHandleFactory(false));
    // Update index
    ((BaseSparkCommitActionExecutor<T>) executor).updateIndexAndCommitIfNeeded(writeStatuses, result);
    return result;
  }

  /**
   * Do bulk insert using the WriteHandleFactory from the partitioner (i.e., partitioner.getWriteHandleFactory).
   */
  public HoodieData<WriteStatus> bulkInsert(HoodieData<HoodieRecord<T>> inputRecords,
                                            String instantTime,
                                            HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table,
                                            HoodieWriteConfig config,
                                            boolean performDedupe,
                                            BulkInsertPartitioner partitioner,
                                            boolean useWriterSchema,
                                            int parallelism) {
    return bulkInsert(inputRecords, instantTime, table, config, performDedupe, partitioner, useWriterSchema, parallelism, null);
  }

  @Override
  public HoodieData<WriteStatus> bulkInsert(HoodieData<HoodieRecord<T>> inputRecords,
                                            String instantTime,
                                            HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table,
                                            HoodieWriteConfig config,
                                            boolean performDedupe,
                                            BulkInsertPartitioner partitioner,
                                            boolean useWriterSchema,
                                            int configuredParallelism,
                                            WriteHandleFactory writeHandleFactory) {
    // De-dupe/merge if needed
    HoodieData<HoodieRecord<T>> dedupedRecords = inputRecords;
    int targetParallelism = deduceShuffleParallelism(inputRecords, configuredParallelism);
    if (performDedupe) {
      dedupedRecords = (HoodieData<HoodieRecord<T>>) HoodieWriteHelper.newInstance()
          .combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, targetParallelism, table);
    }

    // Only JavaRDD is supported for the Spark partitioner, but this is not enforced by the BulkInsertPartitioner API. To improve this: TODO HUDI-3463
    final HoodieData<HoodieRecord<T>> repartitionedRecords =
        HoodieJavaRDD.of((JavaRDD<HoodieRecord<T>>) partitioner.repartitionRecords(HoodieJavaRDD.getJavaRDD(dedupedRecords), targetParallelism));

    // Write the repartitioned records partition-by-partition and flatten the per-partition write statuses
    JavaRDD<WriteStatus> writeStatusRDD = HoodieJavaRDD.getJavaRDD(repartitionedRecords)
        .mapPartitionsWithIndex(new BulkInsertMapFunction<>(instantTime,
            partitioner.arePartitionRecordsSorted(), config, table, useWriterSchema, partitioner, writeHandleFactory), true)
        .flatMap(List::iterator);

    return HoodieJavaRDD.of(writeStatusRDD);
  }
}