All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hudi.commit.BaseDatasetBulkInsertCommitActionExecutor Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.commit;

import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.HoodieDatasetBulkInsertHelper;
import org.apache.hudi.client.HoodieWriteResult;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.util.CommitUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.execution.bulkinsert.BucketIndexBulkInsertPartitionerWithRows;
import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerWithRowsFactory;
import org.apache.hudi.execution.bulkinsert.ConsistentBucketIndexBulkInsertPartitionerWithRows;
import org.apache.hudi.execution.bulkinsert.NonSortPartitionerWithRows;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

import java.io.Serializable;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import static org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE;

public abstract class BaseDatasetBulkInsertCommitActionExecutor implements Serializable {

  protected final transient HoodieWriteConfig writeConfig;
  protected final transient SparkRDDWriteClient writeClient;
  protected final String instantTime;
  protected HoodieTable table;

  public BaseDatasetBulkInsertCommitActionExecutor(HoodieWriteConfig config,
                                                   SparkRDDWriteClient writeClient,
                                                   String instantTime) {
    this.writeConfig = config;
    this.writeClient = writeClient;
    this.instantTime = instantTime;
  }

  protected void preExecute() {
    table.validateInsertSchema();
    writeClient.startCommitWithTime(instantTime, getCommitActionType());
    writeClient.preWrite(instantTime, getWriteOperationType(), table.getMetaClient());
  }

  protected abstract Option> doExecute(Dataset records, boolean arePartitionRecordsSorted);

  protected void afterExecute(HoodieWriteMetadata> result) {
    writeClient.postWrite(result, instantTime, table);
  }

  private HoodieWriteMetadata> buildHoodieWriteMetadata(Option> writeStatuses) {
    return writeStatuses.map(statuses -> {
      // cache writeStatusRDD, so that all actions before this are not triggered again for future
      statuses.persist(writeConfig.getString(WRITE_STATUS_STORAGE_LEVEL_VALUE), writeClient.getEngineContext(), HoodieData.HoodieDataCacheKey.of(writeConfig.getBasePath(), instantTime));
      HoodieWriteMetadata> hoodieWriteMetadata = new HoodieWriteMetadata<>();
      hoodieWriteMetadata.setWriteStatuses(HoodieJavaRDD.getJavaRDD(statuses));
      hoodieWriteMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(statuses));
      return hoodieWriteMetadata;
    }).orElseGet(HoodieWriteMetadata::new);
  }

  public final HoodieWriteResult execute(Dataset records, boolean isTablePartitioned) {
    if (writeConfig.getBoolean(DataSourceWriteOptions.INSERT_DROP_DUPS())) {
      throw new HoodieException("Dropping duplicates with bulk_insert in row writer path is not supported yet");
    }

    boolean populateMetaFields = writeConfig.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS);

    table = writeClient.initTable(getWriteOperationType(), Option.ofNullable(instantTime));

    BulkInsertPartitioner> bulkInsertPartitionerRows = getPartitioner(populateMetaFields, isTablePartitioned);
    Dataset hoodieDF = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(records, writeConfig, bulkInsertPartitionerRows, instantTime);

    preExecute();
    HoodieWriteMetadata> result = buildHoodieWriteMetadata(doExecute(hoodieDF, bulkInsertPartitionerRows.arePartitionRecordsSorted()));
    afterExecute(result);

    return new HoodieWriteResult(result.getWriteStatuses(), result.getPartitionToReplaceFileIds());
  }

  public abstract WriteOperationType getWriteOperationType();

  public String getCommitActionType() {
    return CommitUtils.getCommitActionType(getWriteOperationType(), writeClient.getConfig().getTableType());
  }

  protected BulkInsertPartitioner> getPartitioner(boolean populateMetaFields, boolean isTablePartitioned) {
    if (populateMetaFields) {
      if (writeConfig.getIndexType() == HoodieIndex.IndexType.BUCKET) {
        if (writeConfig.getBucketIndexEngineType() == HoodieIndex.BucketIndexEngineType.SIMPLE) {
          return new BucketIndexBulkInsertPartitionerWithRows(writeConfig.getBucketIndexHashFieldWithDefault(),
              writeConfig.getBucketIndexNumBuckets());
        } else {
          return new ConsistentBucketIndexBulkInsertPartitionerWithRows(table, Collections.emptyMap(), true);
        }
      } else {
        return DataSourceUtils
            .createUserDefinedBulkInsertPartitionerWithRows(writeConfig)
            .orElseGet(() -> BulkInsertInternalPartitionerWithRowsFactory.get(writeConfig, isTablePartitioned));
      }
    } else {
      // Sort modes are not yet supported when meta fields are disabled
      return new NonSortPartitionerWithRows();
    }
  }

  protected Map> getPartitionToReplacedFileIds(HoodieData writeStatuses) {
    return Collections.emptyMap();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy