All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.sink.bulk.RowDataKeyGen Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.sink.bulk;

import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.configuration.OptionsResolver;
import org.apache.hudi.exception.HoodieKeyException;
import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator;
import org.apache.hudi.util.RowDataProjection;
import org.apache.hudi.util.StreamerUtil;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.TimestampData;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;

import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;

import static org.apache.hudi.common.util.PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH;
import static org.apache.hudi.common.util.PartitionPathEncodeUtils.escapePathName;

/**
 * Key generator for {@link RowData}.
 */
public class RowDataKeyGen implements Serializable {
  private static final long serialVersionUID = 1L;

  // reference: NonpartitionedAvroKeyGenerator
  private static final String EMPTY_PARTITION = "";

  // reference: org.apache.hudi.keygen.KeyGenUtils
  private static final String NULL_RECORDKEY_PLACEHOLDER = "__null__";
  private static final String EMPTY_RECORDKEY_PLACEHOLDER = "__empty__";

  private static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/";
  private static final String DEFAULT_FIELD_SEPARATOR = ",";

  private final String[] recordKeyFields;
  private final String[] partitionPathFields;

  private final RowDataProjection recordKeyProjection;
  private final RowDataProjection partitionPathProjection;

  private final boolean hiveStylePartitioning;
  private final boolean encodePartitionPath;
  private final boolean consistentLogicalTimestampEnabled;

  private final Option keyGenOpt;

  // efficient code path
  private boolean simpleRecordKey = false;
  private RowData.FieldGetter recordKeyFieldGetter;

  private boolean simplePartitionPath = false;
  private RowData.FieldGetter partitionPathFieldGetter;

  private boolean nonPartitioned;

  protected RowDataKeyGen(
      Option recordKeys,
      String partitionFields,
      RowType rowType,
      boolean hiveStylePartitioning,
      boolean encodePartitionPath,
      boolean consistentLogicalTimestampEnabled,
      Option keyGenOpt) {
    this.partitionPathFields = partitionFields.split(DEFAULT_FIELD_SEPARATOR);
    this.hiveStylePartitioning = hiveStylePartitioning;
    this.encodePartitionPath = encodePartitionPath;
    this.consistentLogicalTimestampEnabled = consistentLogicalTimestampEnabled;

    List fieldNames = rowType.getFieldNames();
    List fieldTypes = rowType.getChildren();

    if (!recordKeys.isPresent()) {
      this.recordKeyFields = null;
      this.recordKeyProjection = null;
    } else {
      this.recordKeyFields = recordKeys.get().split(DEFAULT_FIELD_SEPARATOR);
      if (this.recordKeyFields.length == 1) {
        // efficient code path
        this.simpleRecordKey = true;
        int recordKeyIdx = fieldNames.indexOf(this.recordKeyFields[0]);
        this.recordKeyFieldGetter = RowData.createFieldGetter(fieldTypes.get(recordKeyIdx), recordKeyIdx);
        this.recordKeyProjection = null;
      } else {
        this.recordKeyProjection = getProjection(this.recordKeyFields, fieldNames, fieldTypes);
      }
    }

    if (this.partitionPathFields.length == 1) {
      // efficient code path
      if (this.partitionPathFields[0].equals("")) {
        this.nonPartitioned = true;
      } else {
        this.simplePartitionPath = true;
        int partitionPathIdx = fieldNames.indexOf(this.partitionPathFields[0]);
        this.partitionPathFieldGetter = RowData.createFieldGetter(fieldTypes.get(partitionPathIdx), partitionPathIdx);
      }
      this.partitionPathProjection = null;
    } else {
      this.partitionPathProjection = getProjection(this.partitionPathFields, fieldNames, fieldTypes);
    }
    this.keyGenOpt = keyGenOpt;
  }

  public static RowDataKeyGen instance(Configuration conf, RowType rowType) {
    Option keyGeneratorOpt = Option.empty();
    if (TimestampBasedAvroKeyGenerator.class.getName().equals(conf.getString(FlinkOptions.KEYGEN_CLASS_NAME))) {
      try {
        keyGeneratorOpt = Option.of(new TimestampBasedAvroKeyGenerator(StreamerUtil.flinkConf2TypedProperties(conf)));
      } catch (IOException e) {
        throw new HoodieKeyException("Initialize TimestampBasedAvroKeyGenerator error", e);
      }
    }
    boolean consistentLogicalTimestampEnabled = OptionsResolver.isConsistentLogicalTimestampEnabled(conf);
    return new RowDataKeyGen(Option.of(conf.getString(FlinkOptions.RECORD_KEY_FIELD)), conf.getString(FlinkOptions.PARTITION_PATH_FIELD),
        rowType, conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING), conf.getBoolean(FlinkOptions.URL_ENCODE_PARTITIONING),
        consistentLogicalTimestampEnabled, keyGeneratorOpt);
  }

  public HoodieKey getHoodieKey(RowData rowData) {
    return new HoodieKey(getRecordKey(rowData), getPartitionPath(rowData));
  }

  public String getRecordKey(RowData rowData) {
    if (this.simpleRecordKey) {
      return getRecordKey(recordKeyFieldGetter.getFieldOrNull(rowData), this.recordKeyFields[0], consistentLogicalTimestampEnabled);
    } else {
      Object[] keyValues = this.recordKeyProjection.projectAsValues(rowData);
      return getRecordKey(keyValues, this.recordKeyFields, consistentLogicalTimestampEnabled);
    }
  }

  public String getPartitionPath(RowData rowData) {
    if (this.simplePartitionPath) {
      return getPartitionPath(partitionPathFieldGetter.getFieldOrNull(rowData),
          this.partitionPathFields[0], this.hiveStylePartitioning, this.encodePartitionPath, this.keyGenOpt);
    } else if (this.nonPartitioned) {
      return EMPTY_PARTITION;
    } else {
      Object[] partValues = this.partitionPathProjection.projectAsValues(rowData);
      return getRecordPartitionPath(partValues, this.partitionPathFields, this.hiveStylePartitioning, this.encodePartitionPath);
    }
  }

  // reference: org.apache.hudi.keygen.KeyGenUtils.getRecordKey
  private static String getRecordKey(Object[] keyValues, String[] keyFields, boolean consistentLogicalTimestampEnabled) {
    boolean keyIsNullEmpty = true;
    StringBuilder recordKey = new StringBuilder();
    for (int i = 0; i < keyValues.length; i++) {
      String recordKeyField = keyFields[i];
      Object value = keyValues[i];
      value = getTimestampValue(consistentLogicalTimestampEnabled, value);
      String recordKeyValue = StringUtils.objToString(value);
      if (recordKeyValue == null) {
        recordKey.append(recordKeyField).append(":").append(NULL_RECORDKEY_PLACEHOLDER);
      } else if (recordKeyValue.isEmpty()) {
        recordKey.append(recordKeyField).append(":").append(EMPTY_RECORDKEY_PLACEHOLDER);
      } else {
        recordKey.append(recordKeyField).append(":").append(recordKeyValue);
        keyIsNullEmpty = false;
      }
      if (i != keyValues.length - 1) {
        recordKey.append(DEFAULT_FIELD_SEPARATOR);
      }
    }
    if (keyIsNullEmpty) {
      throw new HoodieKeyException(String.format("recordKey values: \"%s\" for fields: %s cannot be entirely null or empty.",
          recordKey, Arrays.toString(keyFields)));
    }
    return recordKey.toString();
  }

  private static Object getTimestampValue(boolean consistentLogicalTimestampEnabled, Object value) {
    if (!consistentLogicalTimestampEnabled && (value instanceof TimestampData)) {
      TimestampData timestampData = (TimestampData) value;
      value = timestampData.toTimestamp().toInstant().toEpochMilli();
    }
    return value;
  }

  // reference: org.apache.hudi.keygen.KeyGenUtils.getRecordPartitionPath
  private static String getRecordPartitionPath(
      Object[] partValues,
      String[] partFields,
      boolean hiveStylePartitioning,
      boolean encodePartitionPath) {
    StringBuilder partitionPath = new StringBuilder();
    for (int i = 0; i < partFields.length; i++) {
      String partField = partFields[i];
      String partValue = StringUtils.objToString(partValues[i]);
      if (partValue == null || partValue.isEmpty()) {
        partitionPath.append(hiveStylePartitioning ? partField + "=" + DEFAULT_PARTITION_PATH : DEFAULT_PARTITION_PATH);
      } else {
        if (encodePartitionPath) {
          partValue = escapePathName(partValue);
        }
        partitionPath.append(hiveStylePartitioning ? partField + "=" + partValue : partValue);
      }
      if (i != partFields.length - 1) {
        partitionPath.append(DEFAULT_PARTITION_PATH_SEPARATOR);
      }
    }
    return partitionPath.toString();
  }

  // reference: org.apache.hudi.keygen.KeyGenUtils.getRecordKey
  public static String getRecordKey(Object recordKeyValue, String recordKeyField,boolean consistentLogicalTimestampEnabled) {
    recordKeyValue = getTimestampValue(consistentLogicalTimestampEnabled, recordKeyValue);
    String recordKey = StringUtils.objToString(recordKeyValue);
    if (recordKey == null || recordKey.isEmpty()) {
      throw new HoodieKeyException(String.format("recordKey value: \"%s\" for field: \"%s\" cannot be null or empty.",
          recordKey, recordKeyField));
    }
    return recordKey;
  }

  // reference: org.apache.hudi.keygen.KeyGenUtils.getPartitionPath
  public static String getPartitionPath(
      Object partValue,
      String partField,
      boolean hiveStylePartitioning,
      boolean encodePartitionPath,
      Option keyGenOpt) {
    if (keyGenOpt.isPresent()) {
      TimestampBasedAvroKeyGenerator keyGenerator = keyGenOpt.get();
      return keyGenerator.getPartitionPath(toEpochMilli(partValue, keyGenerator));
    }
    String partitionPath = StringUtils.objToString(partValue);
    if (partitionPath == null || partitionPath.isEmpty()) {
      partitionPath = DEFAULT_PARTITION_PATH;
    }
    if (encodePartitionPath) {
      partitionPath = escapePathName(partitionPath);
    }
    if (hiveStylePartitioning) {
      partitionPath = partField + "=" + partitionPath;
    }
    return partitionPath;
  }

  private static Object toEpochMilli(Object val, TimestampBasedAvroKeyGenerator keyGenerator) {
    if (val instanceof TimestampData) {
      return ((TimestampData) val).toInstant().toEpochMilli();
    }
    if (val == null) {
      // should match the default partition path when STRING partition path re-format is supported
      return keyGenerator.getDefaultPartitionVal();
    }
    return val;
  }

  /**
   * Returns the row data projection for the given field names and table schema.
   *
   * @param fields       The projected field names
   * @param schemaFields The table schema names
   * @param schemaTypes  The table schema types
   * @return the row data projection for the fields
   */
  private static RowDataProjection getProjection(String[] fields, List schemaFields, List schemaTypes) {
    int[] positions = getFieldPositions(fields, schemaFields);
    LogicalType[] types = Arrays.stream(positions).mapToObj(schemaTypes::get).toArray(LogicalType[]::new);
    return RowDataProjection.instance(types, positions);
  }

  /**
   * Returns the field positions of the given fields {@code fields} among all the fields {@code allFields}.
   */
  private static int[] getFieldPositions(String[] fields, List allFields) {
    return Arrays.stream(fields).mapToInt(allFields::indexOf).toArray();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy