/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.utilities.streamer;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.time.Instant;
import java.time.ZoneId;
import java.util.List;
import static org.apache.hudi.common.table.HoodieTableMetaClient.SAMPLE_WRITES_FOLDER_PATH;
import static org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator.getInstantFromTemporalAccessor;
import static org.apache.hudi.common.util.ValidationUtils.checkState;
import static org.apache.hudi.config.HoodieCompactionConfig.COPY_ON_WRITE_RECORD_SIZE_ESTIMATE;
import static org.apache.hudi.utilities.config.HoodieStreamerConfig.SAMPLE_WRITES_ENABLED;
import static org.apache.hudi.utilities.config.HoodieStreamerConfig.SAMPLE_WRITES_SIZE;
/**
* Utility class for estimating the average record size by writing sample incoming records
* to `.hoodie/.aux/.sample_writes/<instant_time>/` and reading back the commit metadata.
*
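* <p>A minimal usage sketch (the variables {@code jsc}, {@code recordsRdd}, and {@code writeConfig}
* are illustrative and assumed to be defined by the caller):
* <pre>{@code
* Option<HoodieWriteConfig> adjusted =
*     SparkSampleWritesUtils.getWriteConfigWithRecordSizeEstimate(jsc, Option.of(recordsRdd), writeConfig);
* HoodieWriteConfig effectiveConfig = adjusted.isPresent() ? adjusted.get() : writeConfig;
* }</pre>
*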
* TODO handle sample_writes sub-path clean-up w.r.t. rollback and insert overwrite. (HUDI-6044)
*/
public class SparkSampleWritesUtils {
private static final Logger LOG = LoggerFactory.getLogger(SparkSampleWritesUtils.class);
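/**
* Performs a one-off sample write to estimate the average record size and, on success, returns a new
* {@link HoodieWriteConfig} with {@code COPY_ON_WRITE_RECORD_SIZE_ESTIMATE} overridden to that estimate.
* Returns {@code Option.empty()} when sample writes are disabled, the table timeline is non-empty,
* or the sample write fails.
*/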
public static Option<HoodieWriteConfig> getWriteConfigWithRecordSizeEstimate(JavaSparkContext jsc, Option<JavaRDD<HoodieRecord>> recordsOpt, HoodieWriteConfig writeConfig) {
if (!writeConfig.getBoolean(SAMPLE_WRITES_ENABLED)) {
LOG.debug("Skip overwriting record size estimate as it's disabled.");
return Option.empty();
}
HoodieTableMetaClient metaClient = getMetaClient(jsc, writeConfig.getBasePath());
if (metaClient.isTimelineNonEmpty()) {
LOG.info("Skip overwriting record size estimate due to timeline is non-empty.");
return Option.empty();
}
try {
String instantTime = getInstantFromTemporalAccessor(Instant.now().atZone(ZoneId.systemDefault()));
Pair<Boolean, String> result = doSampleWrites(jsc, recordsOpt, writeConfig, instantTime);
if (result.getLeft()) {
long avgSize = getAvgSizeFromSampleWrites(jsc, result.getRight());
LOG.info("Overwriting record size estimate to {}", avgSize);
TypedProperties props = writeConfig.getProps();
props.put(COPY_ON_WRITE_RECORD_SIZE_ESTIMATE.key(), String.valueOf(avgSize));
return Option.of(HoodieWriteConfig.newBuilder().withProperties(props).build());
}
} catch (IOException e) {
LOG.error(String.format("Not overwriting record size estimate for table %s due to error when doing sample writes.", writeConfig.getTableName()), e);
}
return Option.empty();
}
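/**
* Bulk-inserts up to {@code SAMPLE_WRITES_SIZE} records into a throwaway COPY_ON_WRITE table under the
* sample-writes path and returns a pair of (success flag, sample table base path).
*/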
private static Pair<Boolean, String> doSampleWrites(JavaSparkContext jsc, Option<JavaRDD<HoodieRecord>> recordsOpt, HoodieWriteConfig writeConfig, String instantTime)
throws IOException {
final String sampleWritesBasePath = getSampleWritesBasePath(jsc, writeConfig, instantTime);
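// Initialize a throwaway COPY_ON_WRITE table under the sample-writes path to receive the sampled records.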
HoodieTableMetaClient.newTableBuilder()
.setTableType(HoodieTableType.COPY_ON_WRITE)
.setTableName(String.format("%s_samples_%s", writeConfig.getTableName(), instantTime))
.setCDCEnabled(false)
.initTable(HadoopFSUtils.getStorageConfWithCopy(jsc.hadoopConfiguration()), sampleWritesBasePath);
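// Write config for the sample table: disable sample writes (to avoid recursion), table services,
// the metadata table, and schema evolution; use a single-partition bulk insert with auto-commit.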
TypedProperties props = writeConfig.getProps();
props.put(SAMPLE_WRITES_ENABLED.key(), "false");
final HoodieWriteConfig sampleWriteConfig = HoodieWriteConfig.newBuilder()
.withProps(props)
.withTableServicesEnabled(false)
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build())
.withSchemaEvolutionEnable(false)
.withBulkInsertParallelism(1)
.withAutoCommit(true)
.withPath(sampleWritesBasePath)
.build();
Pair<Boolean, String> emptyRes = Pair.of(false, null);
try (SparkRDDWriteClient sampleWriteClient = new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), sampleWriteConfig, Option.empty())) {
int size = writeConfig.getIntOrDefault(SAMPLE_WRITES_SIZE);
return recordsOpt.map(records -> {
List<HoodieRecord> samples = records.coalesce(1).take(size);
if (samples.isEmpty()) {
return emptyRes;
}
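// Write the sampled records as a single bulk-insert commit.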
sampleWriteClient.startCommitWithTime(instantTime);
JavaRDD<WriteStatus> writeStatusRDD = sampleWriteClient.bulkInsert(jsc.parallelize(samples, 1), instantTime);
if (writeStatusRDD.filter(WriteStatus::hasErrors).count() > 0) {
LOG.error("sample writes for table {} failed with errors.", writeConfig.getTableName());
if (LOG.isTraceEnabled()) {
LOG.trace("Printing out the top 100 errors");
writeStatusRDD.filter(WriteStatus::hasErrors).take(100).forEach(ws -> {
LOG.trace("Global error :", ws.getGlobalError());
ws.getErrors().forEach((key, throwable) ->
LOG.trace(String.format("Error for key: %s", key), throwable));
});
}
return emptyRes;
} else {
return Pair.of(true, sampleWritesBasePath);
}
}).orElse(emptyRes);
}
}
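/**
* Builds the sample-writes base path ({@code SAMPLE_WRITES_FOLDER_PATH} plus the instant time) under the
* table base path, deleting any leftover directory at that location first.
*/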
private static String getSampleWritesBasePath(JavaSparkContext jsc, HoodieWriteConfig writeConfig, String instantTime) throws IOException {
StoragePath basePath = new StoragePath(writeConfig.getBasePath(), SAMPLE_WRITES_FOLDER_PATH + StoragePath.SEPARATOR + instantTime);
HoodieStorage storage = getMetaClient(jsc, writeConfig.getBasePath()).getStorage();
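// Remove any leftover directory from a previous sample write at the same path.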
if (storage.exists(basePath)) {
storage.deleteDirectory(basePath);
}
return basePath.toString();
}
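/**
* Reads the commit metadata of the completed sample commit and returns the average record size,
* i.e. total bytes written divided by total records written, rounded up.
*/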
private static long getAvgSizeFromSampleWrites(JavaSparkContext jsc, String sampleWritesBasePath) throws IOException {
HoodieTableMetaClient metaClient = getMetaClient(jsc, sampleWritesBasePath);
Option<HoodieInstant> lastInstantOpt = metaClient.getCommitTimeline().filterCompletedInstants().lastInstant();
checkState(lastInstantOpt.isPresent(), "A completed instant should be present in the sample_writes table.");
HoodieInstant instant = lastInstantOpt.get();
HoodieCommitMetadata commitMetadata = metaClient.getCommitMetadataSerDe()
.deserialize(instant, metaClient.getCommitTimeline().getInstantDetails(instant).get(), HoodieCommitMetadata.class);
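// Average record size = ceil(total bytes written / total records written).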
long totalBytesWritten = commitMetadata.fetchTotalBytesWritten();
long totalRecordsWritten = commitMetadata.fetchTotalRecordsWritten();
return (long) Math.ceil((1.0 * totalBytesWritten) / totalRecordsWritten);
}
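/**
* Creates a {@link HoodieTableMetaClient} for the given base path using the Spark Hadoop configuration.
*/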
private static HoodieTableMetaClient getMetaClient(JavaSparkContext jsc, String basePath) {
FileSystem fs = HadoopFSUtils.getFs(basePath, jsc.hadoopConfiguration());
return HoodieTableMetaClient.builder()
.setConf(HadoopFSUtils.getStorageConfWithCopy(fs.getConf())).setBasePath(basePath).build();
}
}