org.apache.hudi.utilities.streamer.HoodieStreamerUtils

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.utilities.streamer;

import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.SparkAdapterSupport$;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.config.SerializableSchema;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieSparkRecord;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.util.ConfigUtils;
import org.apache.hudi.common.util.Either;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.ClosableIterator;
import org.apache.hudi.common.util.collection.CloseableMappingIterator;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieKeyException;
import org.apache.hudi.exception.HoodieKeyGeneratorException;
import org.apache.hudi.exception.HoodieRecordCreationException;
import org.apache.hudi.keygen.BuiltinKeyGenerator;
import org.apache.hudi.keygen.KeyGenUtils;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.util.SparkKeyGenUtils;
import org.apache.hudi.utilities.schema.SchemaProvider;

import org.apache.avro.generic.GenericRecord;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.HoodieInternalRowUtils;
import org.apache.spark.sql.avro.HoodieAvroDeserializer;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructType;

import java.util.Arrays;
import java.util.Iterator;
import java.util.Set;
import java.util.stream.Collectors;

import static org.apache.hudi.common.table.HoodieTableConfig.DROP_PARTITION_COLUMNS;
import static org.apache.hudi.config.HoodieErrorTableConfig.ERROR_ENABLE_VALIDATE_RECORD_CREATION;

/**
 * Util class for HoodieStreamer.
 */
public class HoodieStreamerUtils {

  /**
 * Generates {@link HoodieRecord}s from the Avro data read from the source.
 * Takes care of dropping partition columns, precombine value extraction, and auto key generation.
 * Both AVRO and SPARK record types are supported.
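 * <p>A minimal usage sketch, assuming the caller has already prepared {@code cfg}, {@code props},
 * {@code sourceRdd}, {@code schemaProvider} and the commit {@code instantTime} (hypothetical names,
 * not defined by this class):
 * <pre>{@code
 * Option<JavaRDD<HoodieRecord>> records = HoodieStreamerUtils.createHoodieRecords(
 *     cfg, props, Option.of(sourceRdd), schemaProvider,
 *     HoodieRecord.HoodieRecordType.AVRO, false, instantTime, Option.empty());
 * }</pre>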
   */
  public static Option<JavaRDD<HoodieRecord>> createHoodieRecords(HoodieStreamer.Config cfg, TypedProperties props, Option<JavaRDD<GenericRecord>> avroRDDOptional,
                                                                  SchemaProvider schemaProvider, HoodieRecord.HoodieRecordType recordType, boolean autoGenerateRecordKeys,
                                                                  String instantTime, Option<BaseErrorTableWriter> errorTableWriter) {
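    // Precombine (ordering-field) values are only needed when deduplicating inserts or upserting;
    // error-table routing is opt-in via ERROR_ENABLE_VALIDATE_RECORD_CREATION.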
    boolean shouldCombine = cfg.filterDupes || cfg.operation.equals(WriteOperationType.UPSERT);
    boolean shouldErrorTable = errorTableWriter.isPresent() && props.getBoolean(ERROR_ENABLE_VALIDATE_RECORD_CREATION.key(), ERROR_ENABLE_VALIDATE_RECORD_CREATION.defaultValue());
    boolean useConsistentLogicalTimestamp = ConfigUtils.getBooleanWithAltKeys(
        props, KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED);
    Set<String> partitionColumns = getPartitionColumns(props);
    String payloadClassName = StringUtils.isNullOrEmpty(cfg.payloadClassName) ? HoodieRecordPayload.getAvroPayloadForMergeMode(cfg.recordMergeMode) : cfg.payloadClassName;
    return avroRDDOptional.map(avroRDD -> {
      SerializableSchema avroSchema = new SerializableSchema(schemaProvider.getTargetSchema());
      SerializableSchema processedAvroSchema = new SerializableSchema(isDropPartitionColumns(props) ? HoodieAvroUtils.removeMetadataFields(avroSchema.get()) : avroSchema.get());
      JavaRDD<Either<HoodieRecord, String>> records;
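      // AVRO path: run the key generator on each GenericRecord and wrap it in a HoodieAvroRecord
      // with the configured payload class.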
      if (recordType == HoodieRecord.HoodieRecordType.AVRO) {
        records = avroRDD.mapPartitions(
            (FlatMapFunction<Iterator<GenericRecord>, Either<HoodieRecord, String>>) genericRecordIterator -> {
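              // Auto key generation derives record keys from the Spark partition id and the commit
              // instant time, so both are passed to the key generator through the properties.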
              if (autoGenerateRecordKeys) {
                props.setProperty(KeyGenUtils.RECORD_KEY_GEN_PARTITION_ID_CONFIG, String.valueOf(TaskContext.getPartitionId()));
                props.setProperty(KeyGenUtils.RECORD_KEY_GEN_INSTANT_TIME_CONFIG, instantTime);
              }
              BuiltinKeyGenerator builtinKeyGenerator = (BuiltinKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(props);
              return new CloseableMappingIterator<>(ClosableIterator.wrap(genericRecordIterator), genRec -> {
                try {
                  HoodieKey hoodieKey = new HoodieKey(builtinKeyGenerator.getRecordKey(genRec), builtinKeyGenerator.getPartitionPath(genRec));
                  GenericRecord gr = isDropPartitionColumns(props) ? HoodieAvroUtils.removeFields(genRec, partitionColumns) : genRec;
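                  // Attach the ordering (precombine) value to the payload only when combining is required.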
                  HoodieRecordPayload payload = shouldCombine ? DataSourceUtils.createPayload(payloadClassName, gr,
                      (Comparable) HoodieAvroUtils.getNestedFieldVal(gr, cfg.sourceOrderingField, false, useConsistentLogicalTimestamp))
                      : DataSourceUtils.createPayload(payloadClassName, gr);
                  return Either.left(new HoodieAvroRecord<>(hoodieKey, payload));
                } catch (Exception e) {
                  return generateErrorRecordOrThrowException(genRec, e, shouldErrorTable);
                }
              });
            });

      } else if (recordType == HoodieRecord.HoodieRecordType.SPARK) {
        // TODO: remove this branch once InternalRow can be read directly from the source.

        records = avroRDD.mapPartitions(itr -> {
          if (autoGenerateRecordKeys) {
            props.setProperty(KeyGenUtils.RECORD_KEY_GEN_PARTITION_ID_CONFIG, String.valueOf(TaskContext.getPartitionId()));
            props.setProperty(KeyGenUtils.RECORD_KEY_GEN_INSTANT_TIME_CONFIG, instantTime);
          }
          BuiltinKeyGenerator builtinKeyGenerator = (BuiltinKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(props);
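          // Build the Spark schemas and an Avro-to-InternalRow deserializer; each record is deserialized
          // and then projected onto the target schema (minus partition columns when they are dropped).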
          StructType baseStructType = AvroConversionUtils.convertAvroSchemaToStructType(processedAvroSchema.get());
          StructType targetStructType = isDropPartitionColumns(props) ? AvroConversionUtils
              .convertAvroSchemaToStructType(HoodieAvroUtils.removeFields(processedAvroSchema.get(), partitionColumns)) : baseStructType;
          HoodieAvroDeserializer deserializer = SparkAdapterSupport$.MODULE$.sparkAdapter().createAvroDeserializer(processedAvroSchema.get(), baseStructType);

          return new CloseableMappingIterator<>(ClosableIterator.wrap(itr), rec -> {
            InternalRow row = (InternalRow) deserializer.deserialize(rec).get();
            try {
              String recordKey = builtinKeyGenerator.getRecordKey(row, baseStructType).toString();
              String partitionPath = builtinKeyGenerator.getPartitionPath(row, baseStructType).toString();
              return Either.left(new HoodieSparkRecord(new HoodieKey(recordKey, partitionPath),
                  HoodieInternalRowUtils.getCachedUnsafeProjection(baseStructType, targetStructType).apply(row), targetStructType, false));
            } catch (Exception e) {
              return generateErrorRecordOrThrowException(rec, e, shouldErrorTable);
            }
          });

        });
      } else {
        throw new UnsupportedOperationException(recordType.name());
      }
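      // Records that failed key generation or payload creation are routed to the error table
      // instead of failing the write.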
      if (shouldErrorTable) {
        errorTableWriter.get().addErrorEvents(records.filter(Either::isRight).map(Either::asRight).map(evStr -> new ErrorEvent<>(evStr,
            ErrorEvent.ErrorReason.RECORD_CREATION)));
      }
      return records.filter(Either::isLeft).map(Either::asLeft);
    });
  }

  /**
   * @param genRec           Avro {@link GenericRecord} instance that failed record creation.
   * @param e                the exception raised while creating the record.
   * @param shouldErrorTable whether failed records should be routed to the error table.
   * @return the error-record representation (an {@link Either} whose right side holds the record as a
   * JSON string) for writing to the error table; rethrows the failure if the error table is disabled.
   */
  private static Either<HoodieRecord, String> generateErrorRecordOrThrowException(GenericRecord genRec, Exception e, boolean shouldErrorTable) {
    if (!shouldErrorTable) {
      if (e instanceof HoodieKeyException) {
        throw (HoodieKeyException) e;
      } else if (e instanceof HoodieKeyGeneratorException) {
        throw (HoodieKeyGeneratorException) e;
      } else {
        throw new HoodieRecordCreationException("Failed to create Hoodie Record", e);
      }
    }
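    // Serialize the failing record to JSON so it can be captured in the error table.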
    try {
      return Either.right(HoodieAvroUtils.safeAvroToJsonString(genRec));
    } catch (Exception ex) {
      throw new HoodieException("Failed to convert illegal record to json", ex);
    }
  }

  /**
   * Resolved from the hoodie.datasource.write.drop.partition.columns config.
   * When set to true, the partition columns are not written into the table.
   */
  static Boolean isDropPartitionColumns(TypedProperties props) {
    return props.getBoolean(DROP_PARTITION_COLUMNS.key(), DROP_PARTITION_COLUMNS.defaultValue());
  }

  /**
   * Get the partition columns as a set of strings.
   *
   * @param props TypedProperties
   * @return Set of partition columns.
   */
  static Set<String> getPartitionColumns(TypedProperties props) {
    String partitionColumns = SparkKeyGenUtils.getPartitionColumns(props);
    return Arrays.stream(partitionColumns.split(",")).collect(Collectors.toSet());
  }

}



