All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.keygen.KeyGenUtils Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.keygen;

import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.PartitionPathEncodeUtils;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieKeyException;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.hudi.keygen.constant.KeyGeneratorType;
import org.apache.hudi.keygen.parser.BaseHoodieDateTimeParser;

import org.apache.avro.generic.GenericRecord;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

public class KeyGenUtils {

  protected static final String NULL_RECORDKEY_PLACEHOLDER = "__null__";
  protected static final String EMPTY_RECORDKEY_PLACEHOLDER = "__empty__";

  protected static final String HUDI_DEFAULT_PARTITION_PATH = PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH;
  public static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/";
  public static final String DEFAULT_RECORD_KEY_PARTS_SEPARATOR = ",";
  public static final String DEFAULT_COLUMN_VALUE_SEPARATOR = ":";

  public static final String RECORD_KEY_GEN_PARTITION_ID_CONFIG = "_hoodie.record.key.gen.partition.id";
  public static final String RECORD_KEY_GEN_INSTANT_TIME_CONFIG = "_hoodie.record.key.gen.instant.time";

  /**
   * Infers the key generator type based on the record key and partition fields.
   * 

* (1) partition field is empty: {@link KeyGeneratorType#NON_PARTITION}; * (2) Only one partition field and one record key field: {@link KeyGeneratorType#SIMPLE}; * (3) More than one partition and/or record key fields: {@link KeyGeneratorType#COMPLEX}. * * @param recordsKeyFields Record key field list. * @param partitionFields Partition field list. * @return Inferred key generator type. */ public static KeyGeneratorType inferKeyGeneratorType( Option recordsKeyFields, String partitionFields) { int numRecordKeyFields = recordsKeyFields.map(fields -> fields.split(",").length).orElse(0); KeyGeneratorType partitionKeyGeneratorType = inferKeyGeneratorTypeFromPartitionFields(partitionFields); if (numRecordKeyFields <= 1) { return partitionKeyGeneratorType; } else { // More than one record key fields are configured if (partitionKeyGeneratorType == KeyGeneratorType.SIMPLE) { // if there is a single partition field configured but multiple record key fields, key generator type // should be COMPLEX and not SIMPLE return KeyGeneratorType.COMPLEX; } else { // partition generator type is COMPLEX, CUSTOM or NON_PARTITION. In all these cases, partition // generator type determines the key generator type return partitionKeyGeneratorType; } } } // When auto record key gen is enabled, our inference will be based on partition path only. static KeyGeneratorType inferKeyGeneratorTypeFromPartitionFields(String partitionFields) { if (!StringUtils.isNullOrEmpty(partitionFields)) { String[] partitonFields = partitionFields.split(","); if (partitonFields[0].contains(BaseKeyGenerator.CUSTOM_KEY_GENERATOR_SPLIT_REGEX)) { return KeyGeneratorType.CUSTOM; } else if (partitonFields.length == 1) { return KeyGeneratorType.SIMPLE; } else { return KeyGeneratorType.COMPLEX; } } return KeyGeneratorType.NON_PARTITION; } /** * Fetches record key from the GenericRecord. * * @param genericRecord generic record of interest. * @param keyGeneratorOpt Optional BaseKeyGenerator. If not, meta field will be used. * @return the record key for the passed in generic record. */ public static String getRecordKeyFromGenericRecord(GenericRecord genericRecord, Option keyGeneratorOpt) { return keyGeneratorOpt.isPresent() ? keyGeneratorOpt.get().getRecordKey(genericRecord) : genericRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); } /** * Fetches partition path from the GenericRecord. * * @param genericRecord generic record of interest. * @param keyGeneratorOpt Optional BaseKeyGenerator. If not, meta field will be used. * @return the partition path for the passed in generic record. */ public static String getPartitionPathFromGenericRecord(GenericRecord genericRecord, Option keyGeneratorOpt) { return keyGeneratorOpt.isPresent() ? keyGeneratorOpt.get().getPartitionPath(genericRecord) : genericRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); } /** * Extracts the record key fields in strings out of the given record key, * this is the reverse operation of {@link #getRecordKey(GenericRecord, String, boolean)}. * * @see SimpleAvroKeyGenerator * @see org.apache.hudi.keygen.ComplexAvroKeyGenerator */ public static String[] extractRecordKeys(String recordKey) { return extractRecordKeysByFields(recordKey, Collections.emptyList()); } public static String[] extractRecordKeysByFields(String recordKey, List fields) { // if there is no ',' and ':', then it's a key value if (!recordKey.contains(DEFAULT_RECORD_KEY_PARTS_SEPARATOR) || !recordKey.contains(DEFAULT_COLUMN_VALUE_SEPARATOR)) { return new String[] {recordKey}; } // complex key case // Here we're reducing memory allocation for substrings and use index positions, // because for bucket index this will be called for each record, which leads to GC overhead int keyValueSep1; int keyValueSep2; int commaPosition; String currentField; String currentValue; List values = new ArrayList<>(); int processed = 0; while (processed < recordKey.length()) { // note that keyValueSeps and commaPosition are absolute keyValueSep1 = recordKey.indexOf(DEFAULT_COLUMN_VALUE_SEPARATOR, processed); currentField = recordKey.substring(processed, keyValueSep1); keyValueSep2 = recordKey.indexOf(DEFAULT_COLUMN_VALUE_SEPARATOR, keyValueSep1 + 1); if (fields.isEmpty() || (fields.size() == 1 && fields.get(0).isEmpty()) || fields.contains(currentField)) { if (keyValueSep2 < 0) { // there is no next key value pair currentValue = recordKey.substring(keyValueSep1 + 1); processed = recordKey.length(); } else { // looking for ',' in reverse order to support multiple ',' in key values by looking for the latest ',' commaPosition = recordKey.lastIndexOf(DEFAULT_RECORD_KEY_PARTS_SEPARATOR, keyValueSep2); // commaPosition could be -1 if didn't find ',', or we could find ',' from previous key-value pair ('col1:val1,...') // also we could have the last value with ':', so need to check if keyValueSep2 > 0 while (commaPosition < keyValueSep1 && keyValueSep2 > 0) { // If we have key value as a timestamp with ':', // then we continue to skip ':' until before the next ':' there is a ',' character. // For instance, 'col1:val1,col2:2014-10-22 13:50:42,col3:val3' // ^ ^ ^ ^ // 1) keyValueSep1 skip skip keyValueSep2 // ^ // commaPosition // 2) | currentValue | // ^ // 3) processed keyValueSep2 = recordKey.indexOf(DEFAULT_COLUMN_VALUE_SEPARATOR, keyValueSep2 + 1); commaPosition = recordKey.lastIndexOf(DEFAULT_RECORD_KEY_PARTS_SEPARATOR, keyValueSep2); } if (commaPosition > 0) { currentValue = recordKey.substring(keyValueSep1 + 1, commaPosition); processed = commaPosition + 1; } else { // it could be the last value with many ':', in this case we wouldn't find any ',' before currentValue = recordKey.substring(keyValueSep1 + 1); processed = recordKey.length(); } } // here could be any logic of conditional replacing of currentValue if (currentValue.equals(NULL_RECORDKEY_PLACEHOLDER)) { values.add(null); } else if (currentValue.equals(EMPTY_RECORDKEY_PLACEHOLDER)) { values.add(""); } else { values.add(currentValue); } } else { if (keyValueSep2 < 0) { processed = recordKey.length(); } else { commaPosition = recordKey.lastIndexOf(DEFAULT_RECORD_KEY_PARTS_SEPARATOR, keyValueSep2); while (commaPosition < keyValueSep1) { // described above keyValueSep2 = recordKey.indexOf(DEFAULT_COLUMN_VALUE_SEPARATOR, keyValueSep2 + 1); commaPosition = recordKey.lastIndexOf(DEFAULT_RECORD_KEY_PARTS_SEPARATOR, keyValueSep2); } if (commaPosition < 0) { // if something went wrong, and there is no ',', we should stop here, and pass the whole recordKey, // otherwise processed = commaPosition + 1 would lead to infinite loop processed = recordKey.length(); } else { processed = commaPosition + 1; } } } } return values.isEmpty() ? new String[] {recordKey} : values.toArray(new String[0]); } public static String getRecordKey(GenericRecord record, List recordKeyFields, boolean consistentLogicalTimestampEnabled) { boolean keyIsNullEmpty = true; StringBuilder recordKey = new StringBuilder(); for (int i = 0; i < recordKeyFields.size(); i++) { String recordKeyField = recordKeyFields.get(i); String recordKeyValue; try { recordKeyValue = HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyField, false, consistentLogicalTimestampEnabled); } catch (HoodieException e) { throw new HoodieKeyException("Record key field '" + recordKeyField + "' does not exist in the input record"); } if (recordKeyValue == null) { recordKey.append(recordKeyField).append(DEFAULT_COLUMN_VALUE_SEPARATOR).append(NULL_RECORDKEY_PLACEHOLDER); } else if (recordKeyValue.isEmpty()) { recordKey.append(recordKeyField).append(DEFAULT_COLUMN_VALUE_SEPARATOR).append(EMPTY_RECORDKEY_PLACEHOLDER); } else { recordKey.append(recordKeyField).append(DEFAULT_COLUMN_VALUE_SEPARATOR).append(recordKeyValue); keyIsNullEmpty = false; } if (i != recordKeyFields.size() - 1) { recordKey.append(DEFAULT_RECORD_KEY_PARTS_SEPARATOR); } } if (keyIsNullEmpty) { throw new HoodieKeyException("recordKey values: \"" + recordKey + "\" for fields: " + recordKeyFields + " cannot be entirely null or empty."); } return recordKey.toString(); } public static String getRecordPartitionPath(GenericRecord record, List partitionPathFields, boolean hiveStylePartitioning, boolean encodePartitionPath, boolean consistentLogicalTimestampEnabled) { if (partitionPathFields.isEmpty()) { return ""; } StringBuilder partitionPath = new StringBuilder(); for (int i = 0; i < partitionPathFields.size(); i++) { String partitionPathField = partitionPathFields.get(i); String fieldVal = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, true, consistentLogicalTimestampEnabled); if (fieldVal == null || fieldVal.isEmpty()) { if (hiveStylePartitioning) { partitionPath.append(partitionPathField).append("="); } partitionPath.append(HUDI_DEFAULT_PARTITION_PATH); } else { if (encodePartitionPath) { fieldVal = PartitionPathEncodeUtils.escapePathName(fieldVal); } if (hiveStylePartitioning) { partitionPath.append(partitionPathField).append("="); } partitionPath.append(fieldVal); } if (i != partitionPathFields.size() - 1) { partitionPath.append(DEFAULT_PARTITION_PATH_SEPARATOR); } } return partitionPath.toString(); } public static String getRecordKey(GenericRecord record, String recordKeyField, boolean consistentLogicalTimestampEnabled) { String recordKey = HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyField, true, consistentLogicalTimestampEnabled); if (recordKey == null || recordKey.isEmpty()) { throw new HoodieKeyException("recordKey value: \"" + recordKey + "\" for field: \"" + recordKeyField + "\" cannot be null or empty."); } return recordKey; } public static String getPartitionPath(GenericRecord record, String partitionPathField, boolean hiveStylePartitioning, boolean encodePartitionPath, boolean consistentLogicalTimestampEnabled) { String partitionPath = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, true, consistentLogicalTimestampEnabled); if (partitionPath == null || partitionPath.isEmpty()) { partitionPath = HUDI_DEFAULT_PARTITION_PATH; } if (encodePartitionPath) { partitionPath = PartitionPathEncodeUtils.escapePathName(partitionPath); } if (hiveStylePartitioning) { partitionPath = partitionPathField + "=" + partitionPath; } return partitionPath; } /** * Create a date time parser class for TimestampBasedKeyGenerator, passing in any configs needed. */ public static BaseHoodieDateTimeParser createDateTimeParser(TypedProperties props, String parserClass) throws IOException { try { return (BaseHoodieDateTimeParser) ReflectionUtils.loadClass(parserClass, props); } catch (Throwable e) { throw new IOException("Could not load date time parser class " + parserClass, e); } } /** * Create a key generator class via reflection, passing in any configs needed. *

* This method is for user-defined classes. To create hudi's built-in key generators, please set proper * {@link org.apache.hudi.keygen.constant.KeyGeneratorType} conf, and use the relevant factory, see * {@link org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory}. */ public static KeyGenerator createKeyGeneratorByClassName(TypedProperties props) throws IOException { KeyGenerator keyGenerator = null; String keyGeneratorClass = props.getString(HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key(), null); if (!StringUtils.isNullOrEmpty(keyGeneratorClass)) { try { keyGenerator = (KeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, props); } catch (Throwable e) { throw new IOException("Could not load key generator class " + keyGeneratorClass, e); } } return keyGenerator; } public static List getRecordKeyFields(TypedProperties props) { return Option.ofNullable(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), null)) .map(recordKeyConfigValue -> Arrays.stream(recordKeyConfigValue.split(",")) .map(String::trim) .filter(s -> !s.isEmpty()) .collect(Collectors.toList()) ).orElse(Collections.emptyList()); } /** * @param props props of interest. * @return true if record keys need to be auto generated. false otherwise. */ public static boolean isAutoGeneratedRecordKeysEnabled(TypedProperties props) { return !props.containsKey(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()) || props.getProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()).equals(StringUtils.EMPTY_STRING); // spark-sql sets record key config to empty string for update, and couple of other statements. } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy