// All downloads are free. The search and download functionality uses the official Maven repository.

// org.apache.hudi.io.HoodieCDCLogger (Maven / Gradle / Ivy)

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.io;

import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieAvroIndexedRecord;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.cdc.HoodieCDCOperation;
import org.apache.hudi.common.table.cdc.HoodieCDCSupplementalLoggingMode;
import org.apache.hudi.common.table.cdc.HoodieCDCUtils;
import org.apache.hudi.common.table.log.AppendResult;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.block.HoodieCDCDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.util.DefaultSizeEstimator;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.SizeEstimator;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieUpsertException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import static org.apache.hudi.common.table.cdc.HoodieCDCSupplementalLoggingMode.DATA_BEFORE;
import static org.apache.hudi.common.table.cdc.HoodieCDCSupplementalLoggingMode.DATA_BEFORE_AFTER;

/**
 * This class encapsulates all the cdc-writing functions.
 *
 * <p>Records handed to {@link #put} are converted into cdc records, buffered in a spillable map
 * keyed by record key, and appended to the cdc log through the supplied
 * {@link HoodieLogFormat.Writer} as {@link HoodieCDCDataBlock}s. A block is flushed when the
 * estimated size of the buffered records reaches the configured max block size, and once more
 * when the logger is closed.
 */
public class HoodieCDCLogger implements Closeable {

  private final String commitTime;

  // name of the field handed to each cdc data block as its key field
  private final String keyField;

  private final String partitionPath;

  private final FileSystem fs;

  // the data schema with the hoodie metadata fields removed
  private final Schema dataSchema;

  // writer for cdc data
  private final HoodieLogFormat.Writer cdcWriter;

  private final HoodieCDCSupplementalLoggingMode cdcSupplementalLoggingMode;

  private final Schema cdcSchema;

  // the cdc data buffered since the last flush, keyed by record key
  private final Map<String, HoodieAvroPayload> cdcData;

  private final Map<HoodieLogBlock.HeaderMetadataType, String> cdcDataBlockHeader;

  // the cdc record transformer
  private final CDCTransformer transformer;

  // Max block size to limit to for a log block
  private final long maxBlockSize;

  // Estimated size of one cdc record. It is re-estimated from the first record that is put
  // into an empty buffer, i.e. once per log block.
  private long averageCDCRecordSize = 0;

  // Number of put() calls since the last flush; multiplied by the average record size to
  // decide when the max block size for a log block has been reached
  private final AtomicInteger numOfCDCRecordsInMemory = new AtomicInteger();

  private final SizeEstimator<HoodieAvroPayload> sizeEstimator;

  // distinct paths of the cdc log files that blocks have been appended to, in first-write order
  private final List<Path> cdcAbsPaths;

  /**
   * Creates a cdc logger.
   *
   * @param commitTime             instant time written into every cdc block header
   * @param config                 write config; supplies the spillable map settings, the max
   *                               log block size and whether meta fields are populated
   * @param tableConfig            table config; supplies the record key field and the cdc
   *                               supplemental logging mode
   * @param partitionPath          partition path used as the prefix of the keys returned by
   *                               {@link #getCDCWriteStats()}; may be null or empty
   * @param fs                     file system used to look up cdc file sizes
   * @param schema                 data schema; its metadata fields are removed before use
   * @param cdcWriter              log writer the cdc blocks are appended to
   * @param maxInMemorySizeInBytes in-memory budget of the spillable cdc buffer
   * @throws HoodieUpsertException if the spillable buffer cannot be created
   */
  public HoodieCDCLogger(
      String commitTime,
      HoodieWriteConfig config,
      HoodieTableConfig tableConfig,
      String partitionPath,
      FileSystem fs,
      Schema schema,
      HoodieLogFormat.Writer cdcWriter,
      long maxInMemorySizeInBytes) {
    try {
      this.commitTime = commitTime;
      this.keyField = config.populateMetaFields()
          ? HoodieRecord.RECORD_KEY_METADATA_FIELD
          : tableConfig.getRecordKeyFieldProp();
      this.partitionPath = partitionPath;
      this.fs = fs;
      this.dataSchema = HoodieAvroUtils.removeMetadataFields(schema);
      this.cdcWriter = cdcWriter;
      this.cdcSupplementalLoggingMode = tableConfig.cdcSupplementalLoggingMode();
      this.cdcSchema = HoodieCDCUtils.schemaBySupplementalLoggingMode(
          cdcSupplementalLoggingMode,
          dataSchema
      );

      this.cdcDataBlockHeader = new HashMap<>();
      this.cdcDataBlockHeader.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, commitTime);
      this.cdcDataBlockHeader.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, cdcSchema.toString());

      this.sizeEstimator = new DefaultSizeEstimator<>();
      this.cdcData = new ExternalSpillableMap<>(
          maxInMemorySizeInBytes,
          config.getSpillableMapBasePath(),
          new DefaultSizeEstimator<>(),
          new DefaultSizeEstimator<>(),
          config.getCommonConfig().getSpillableDiskMapType(),
          config.getCommonConfig().isBitCaskDiskMapCompressionEnabled());
      // must run after cdcSupplementalLoggingMode is assigned: the transformer is chosen from it
      this.transformer = getTransformer();
      this.maxBlockSize = config.getLogFileDataBlockMaxSize();

      this.cdcAbsPaths = new ArrayList<>();
    } catch (IOException e) {
      throw new HoodieUpsertException("Failed to initialize HoodieCDCLogger", e);
    }
  }

  /**
   * Buffers one cdc record, flushing the current buffer first if it has reached the max block
   * size.
   *
   * <p>The operation is derived from the arguments: a present {@code newRecord} with a null
   * {@code oldRecord} is an INSERT, a present {@code newRecord} with a non-null
   * {@code oldRecord} is an UPDATE, and an absent {@code newRecord} is a DELETE.
   *
   * <p>The buffer is keyed by record key, so a second call with the same key before a flush
   * replaces the previously buffered cdc record.
   *
   * @param hoodieRecord record whose key identifies the change
   * @param oldRecord    the previous image of the record, or null if there is none
   * @param newRecord    the new image of the record; when present it must be a
   *                     {@link GenericRecord}
   */
  public void put(HoodieRecord hoodieRecord,
                  GenericRecord oldRecord,
                  Option<IndexedRecord> newRecord) {
    String recordKey = hoodieRecord.getRecordKey();
    GenericData.Record cdcRecord;
    if (newRecord.isPresent()) {
      GenericRecord record = (GenericRecord) newRecord.get();
      if (oldRecord == null) {
        // INSERT cdc record
        cdcRecord = this.transformer.transform(HoodieCDCOperation.INSERT, recordKey,
            null, record);
      } else {
        // UPDATE cdc record
        cdcRecord = this.transformer.transform(HoodieCDCOperation.UPDATE, recordKey,
            oldRecord, record);
      }
    } else {
      // DELETE cdc record
      cdcRecord = this.transformer.transform(HoodieCDCOperation.DELETE, recordKey,
          oldRecord, null);
    }

    // flush before buffering so that the new record starts the next block
    flushIfNeeded(false);
    HoodieAvroPayload payload = new HoodieAvroPayload(Option.of(cdcRecord));
    if (cdcData.isEmpty()) {
      // first record of a block: use it as the size estimate for the whole block
      averageCDCRecordSize = sizeEstimator.sizeEstimate(payload);
    }
    cdcData.put(recordKey, payload);
    numOfCDCRecordsInMemory.incrementAndGet();
  }

  /**
   * Appends the buffered cdc records as one {@link HoodieCDCDataBlock} and resets the buffer.
   *
   * <p>Nothing happens unless {@code force} is set or the estimated buffered size
   * (record count times average record size) has reached the max block size.
   *
   * @param force flush regardless of the estimated buffered size
   * @throws HoodieException if the records cannot be read back or the block cannot be appended
   */
  private void flushIfNeeded(boolean force) {
    if (force || numOfCDCRecordsInMemory.get() * averageCDCRecordSize >= maxBlockSize) {
      try {
        List<HoodieRecord> records = cdcData.values().stream()
            .map(this::toIndexedRecord)
            .collect(Collectors.toList());

        HoodieLogBlock block = new HoodieCDCDataBlock(records, cdcDataBlockHeader, keyField);
        AppendResult result = cdcWriter.appendBlocks(Collections.singletonList(block));

        // remember every distinct log file written to, for getCDCWriteStats()
        Path cdcAbsPath = result.logFile().getPath();
        if (!cdcAbsPaths.contains(cdcAbsPath)) {
          cdcAbsPaths.add(cdcAbsPath);
        }

        // reset stat
        cdcData.clear();
        numOfCDCRecordsInMemory.set(0);
      } catch (Exception e) {
        throw new HoodieException("Failed to write the cdc data to " + cdcWriter.getLogFile().getPath(), e);
      }
    }
  }

  /**
   * Converts one buffered payload into the record type stored in a cdc data block.
   *
   * @throws HoodieIOException if the payload cannot be read with the cdc schema
   */
  private HoodieRecord toIndexedRecord(HoodieAvroPayload payload) {
    try {
      return new HoodieAvroIndexedRecord(payload.getInsertValue(cdcSchema).get());
    } catch (IOException e) {
      throw new HoodieIOException("Failed to get cdc record", e);
    }
  }

  /**
   * Returns the current size of every cdc log file this logger has appended to.
   *
   * @return map from the cdc file path to its size as reported by the file system; the path is
   *         the file name, prefixed with {@code partitionPath + "/"} when the partition path is
   *         non-empty
   * @throws HoodieUpsertException if a file size cannot be read
   */
  public Map<String, Long> getCDCWriteStats() {
    Map<String, Long> stats = new HashMap<>();
    try {
      for (Path cdcAbsPath : cdcAbsPaths) {
        String cdcFileName = cdcAbsPath.getName();
        String cdcPath = StringUtils.isNullOrEmpty(partitionPath) ? cdcFileName : partitionPath + "/" + cdcFileName;
        stats.put(cdcPath, FSUtils.getFileSize(fs, cdcAbsPath));
      }
    } catch (IOException e) {
      throw new HoodieUpsertException("Failed to get cdc write stat", e);
    }
    return stats;
  }

  /**
   * Flushes the remaining buffered records, closes the cdc writer and clears the buffer.
   *
   * <p>The writer is closed and the buffer cleared even when the final flush fails. In that
   * case the flush failure is rethrown, with any failure to close the writer attached to it
   * as a suppressed exception.
   *
   * @throws HoodieException   if the final flush fails
   * @throws HoodieIOException if the flush succeeds but the writer cannot be closed
   */
  @Override
  public void close() {
    RuntimeException failure = null;
    try {
      try {
        flushIfNeeded(true);
      } catch (RuntimeException e) {
        // keep going: the writer must still be closed
        failure = e;
      }
      if (cdcWriter != null) {
        try {
          cdcWriter.close();
        } catch (IOException e) {
          HoodieIOException closeFailure = new HoodieIOException("Failed to close HoodieCDCLogger", e);
          if (failure == null) {
            failure = closeFailure;
          } else {
            failure.addSuppressed(closeFailure);
          }
        }
      }
    } finally {
      // in case that crash when call `flushIfNeeded`, do the cleanup again.
      cdcData.clear();
    }
    if (failure != null) {
      throw failure;
    }
  }

  // -------------------------------------------------------------------------
  //  Utilities
  // -------------------------------------------------------------------------

  /**
   * Picks the transformer matching the supplemental logging mode: before and after images for
   * {@code DATA_BEFORE_AFTER}, record key plus before image for {@code DATA_BEFORE}, and the
   * record key only for any other mode.
   */
  private CDCTransformer getTransformer() {
    if (cdcSupplementalLoggingMode == DATA_BEFORE_AFTER) {
      return (operation, recordKey, oldRecord, newRecord) ->
          HoodieCDCUtils.cdcRecord(cdcSchema, operation.getValue(), commitTime, removeCommitMetadata(oldRecord), removeCommitMetadata(newRecord));
    } else if (cdcSupplementalLoggingMode == DATA_BEFORE) {
      return (operation, recordKey, oldRecord, newRecord) ->
          HoodieCDCUtils.cdcRecord(cdcSchema, operation.getValue(), recordKey, removeCommitMetadata(oldRecord));
    } else {
      return (operation, recordKey, oldRecord, newRecord) ->
          HoodieCDCUtils.cdcRecord(cdcSchema, operation.getValue(), recordKey);
    }
  }

  /**
   * Rewrites the record with the metadata-free data schema; a null record stays null.
   */
  private GenericRecord removeCommitMetadata(GenericRecord record) {
    return record == null ? null : HoodieAvroUtils.rewriteRecordWithNewSchema(record, dataSchema, Collections.emptyMap());
  }

  // -------------------------------------------------------------------------
  //  Inner Class
  // -------------------------------------------------------------------------

  /**
   * A transformer that transforms normal data records into cdc records.
   */
  @FunctionalInterface
  private interface CDCTransformer {
    GenericData.Record transform(HoodieCDCOperation operation,
                                 String recordKey,
                                 GenericRecord oldRecord,
                                 GenericRecord newRecord);

  }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy