All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.utilities.streamer.SparkSampleWritesUtils Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.utilities.streamer;

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;

import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.time.Instant;
import java.time.ZoneId;
import java.util.List;

import static org.apache.hudi.common.table.HoodieTableMetaClient.SAMPLE_WRITES_FOLDER_PATH;
import static org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator.getInstantFromTemporalAccessor;
import static org.apache.hudi.common.util.ValidationUtils.checkState;
import static org.apache.hudi.config.HoodieCompactionConfig.COPY_ON_WRITE_RECORD_SIZE_ESTIMATE;
import static org.apache.hudi.utilities.config.HoodieStreamerConfig.SAMPLE_WRITES_ENABLED;
import static org.apache.hudi.utilities.config.HoodieStreamerConfig.SAMPLE_WRITES_SIZE;

/**
 * The utilities class is dedicated to estimating average record size by writing sample incoming records
 * to `.hoodie/.aux/.sample_writes//` and reading the commit metadata.
 * 

* TODO handle sample_writes sub-path clean-up w.r.t. rollback and insert overwrite. (HUDI-6044) */ public class SparkSampleWritesUtils { private static final Logger LOG = LoggerFactory.getLogger(SparkSampleWritesUtils.class); public static Option getWriteConfigWithRecordSizeEstimate(JavaSparkContext jsc, Option> recordsOpt, HoodieWriteConfig writeConfig) { if (!writeConfig.getBoolean(SAMPLE_WRITES_ENABLED)) { LOG.debug("Skip overwriting record size estimate as it's disabled."); return Option.empty(); } HoodieTableMetaClient metaClient = getMetaClient(jsc, writeConfig.getBasePath()); if (metaClient.isTimelineNonEmpty()) { LOG.info("Skip overwriting record size estimate due to timeline is non-empty."); return Option.empty(); } try { String instantTime = getInstantFromTemporalAccessor(Instant.now().atZone(ZoneId.systemDefault())); Pair result = doSampleWrites(jsc, recordsOpt, writeConfig, instantTime); if (result.getLeft()) { long avgSize = getAvgSizeFromSampleWrites(jsc, result.getRight()); LOG.info("Overwriting record size estimate to {}", avgSize); TypedProperties props = writeConfig.getProps(); props.put(COPY_ON_WRITE_RECORD_SIZE_ESTIMATE.key(), String.valueOf(avgSize)); return Option.of(HoodieWriteConfig.newBuilder().withProperties(props).build()); } } catch (IOException e) { LOG.error(String.format("Not overwriting record size estimate for table %s due to error when doing sample writes.", writeConfig.getTableName()), e); } return Option.empty(); } private static Pair doSampleWrites(JavaSparkContext jsc, Option> recordsOpt, HoodieWriteConfig writeConfig, String instantTime) throws IOException { final String sampleWritesBasePath = getSampleWritesBasePath(jsc, writeConfig, instantTime); HoodieTableMetaClient.withPropertyBuilder() .setTableType(HoodieTableType.COPY_ON_WRITE) .setTableName(String.format("%s_samples_%s", writeConfig.getTableName(), instantTime)) .setCDCEnabled(false) .initTable(HadoopFSUtils.getStorageConfWithCopy(jsc.hadoopConfiguration()), sampleWritesBasePath); TypedProperties props = writeConfig.getProps(); props.put(SAMPLE_WRITES_ENABLED.key(), "false"); final HoodieWriteConfig sampleWriteConfig = HoodieWriteConfig.newBuilder() .withProps(props) .withTableServicesEnabled(false) .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .withSchemaEvolutionEnable(false) .withBulkInsertParallelism(1) .withAutoCommit(true) .withPath(sampleWritesBasePath) .build(); Pair emptyRes = Pair.of(false, null); try (SparkRDDWriteClient sampleWriteClient = new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), sampleWriteConfig, Option.empty())) { int size = writeConfig.getIntOrDefault(SAMPLE_WRITES_SIZE); return recordsOpt.map(records -> { List samples = records.coalesce(1).take(size); if (samples.isEmpty()) { return emptyRes; } sampleWriteClient.startCommitWithTime(instantTime); JavaRDD writeStatusRDD = sampleWriteClient.bulkInsert(jsc.parallelize(samples, 1), instantTime); if (writeStatusRDD.filter(WriteStatus::hasErrors).count() > 0) { LOG.error("sample writes for table {} failed with errors.", writeConfig.getTableName()); if (LOG.isTraceEnabled()) { LOG.trace("Printing out the top 100 errors"); writeStatusRDD.filter(WriteStatus::hasErrors).take(100).forEach(ws -> { LOG.trace("Global error :", ws.getGlobalError()); ws.getErrors().forEach((key, throwable) -> LOG.trace(String.format("Error for key: %s", key), throwable)); }); } return emptyRes; } else { return Pair.of(true, sampleWritesBasePath); } }).orElse(emptyRes); } } private static String getSampleWritesBasePath(JavaSparkContext jsc, HoodieWriteConfig writeConfig, String instantTime) throws IOException { StoragePath basePath = new StoragePath(writeConfig.getBasePath(), SAMPLE_WRITES_FOLDER_PATH + StoragePath.SEPARATOR + instantTime); HoodieStorage storage = getMetaClient(jsc, writeConfig.getBasePath()).getStorage(); if (storage.exists(basePath)) { storage.deleteDirectory(basePath); } return basePath.toString(); } private static long getAvgSizeFromSampleWrites(JavaSparkContext jsc, String sampleWritesBasePath) throws IOException { HoodieTableMetaClient metaClient = getMetaClient(jsc, sampleWritesBasePath); Option lastInstantOpt = metaClient.getCommitTimeline().filterCompletedInstants().lastInstant(); checkState(lastInstantOpt.isPresent(), "The only completed instant should be present in sample_writes table."); HoodieInstant instant = lastInstantOpt.get(); HoodieCommitMetadata commitMetadata = HoodieCommitMetadata .fromBytes(metaClient.getCommitTimeline().getInstantDetails(instant).get(), HoodieCommitMetadata.class); long totalBytesWritten = commitMetadata.fetchTotalBytesWritten(); long totalRecordsWritten = commitMetadata.fetchTotalRecordsWritten(); return (long) Math.ceil((1.0 * totalBytesWritten) / totalRecordsWritten); } private static HoodieTableMetaClient getMetaClient(JavaSparkContext jsc, String basePath) { FileSystem fs = HadoopFSUtils.getFs(basePath, jsc.hadoopConfiguration()); return HoodieTableMetaClient.builder() .setConf(HadoopFSUtils.getStorageConfWithCopy(fs.getConf())).setBasePath(basePath).build(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy