/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.common.util;

import org.apache.hudi.avro.HoodieBloomFilterWriteSupport;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.ClosableIterator;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;

import javax.annotation.Nonnull;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Utils for file formats used in Hudi.
 */
public abstract class FileFormatUtils {
  /**
   * Aggregate column range statistics across files in a partition.
   *
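   * <p>Usage sketch (illustrative; assumes per-file stats {@code range1} and {@code range2}
   * were collected elsewhere):
   * <pre>{@code
   * HoodieColumnRangeMetadata<Integer> merged =
   *     FileFormatUtils.getColumnRangeInPartition("2024/01/01", Arrays.asList(range1, range2));
   * // merged spans min(file mins) to max(file maxs) for the column
   * }</pre>
   *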
   * @param relativePartitionPath relative partition path for the column range stats
   * @param fileColumnRanges list of column range statistics for each file in a partition.
   * @return the merged column range statistics at partition granularity.
   */
  public static <T extends Comparable<T>> HoodieColumnRangeMetadata<T> getColumnRangeInPartition(String relativePartitionPath,
                                                                                                 @Nonnull List<HoodieColumnRangeMetadata<T>> fileColumnRanges) {
    ValidationUtils.checkArgument(!fileColumnRanges.isEmpty(), "fileColumnRanges should not be empty.");
    // There are multiple files. Compute min(file_mins) and max(file_maxs)
    return fileColumnRanges.stream()
        .map(e -> HoodieColumnRangeMetadata.create(
            relativePartitionPath, e.getColumnName(), e.getMinValue(), e.getMaxValue(),
            e.getNullCount(), e.getValueCount(), e.getTotalSize(), e.getTotalUncompressedSize()))
        .reduce(HoodieColumnRangeMetadata::merge).orElseThrow(() -> new HoodieException("MergingColumnRanges failed."));
  }

  /**
   * Read the rowKey list from the given data file.
   *
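   * <p>Usage sketch (hypothetical {@code formatUtils} instance and file path):
   * <pre>{@code
   * Set<String> rowKeys = formatUtils.readRowKeys(storage, new StoragePath("/tmp/f1.parquet"));
   * }</pre>
   *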
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath the data file path.
   * @return set of row keys
   */
  public Set<String> readRowKeys(HoodieStorage storage, StoragePath filePath) {
    return filterRowKeys(storage, filePath, new HashSet<>())
        .stream().map(Pair::getKey).collect(Collectors.toSet());
  }

  /**
   * Read the bloom filter from the metadata of the given data file.
   *
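   * <p>Usage sketch (hypothetical {@code formatUtils} and path; the result may be null
   * if the file carries no bloom filter footer):
   * <pre>{@code
   * BloomFilter filter = formatUtils.readBloomFilterFromMetadata(storage, filePath);
   * if (filter != null && filter.mightContain("someRecordKey")) {
   *   // key may be present in this file; a negative answer is definitive
   * }
   * }</pre>
   *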
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath the data file path.
   * @return a BloomFilter object.
   */
  public BloomFilter readBloomFilterFromMetadata(HoodieStorage storage, StoragePath filePath) {
    Map<String, String> footerVals =
        readFooter(storage, false, filePath,
            HoodieBloomFilterWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
            HoodieBloomFilterWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
            HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE);
    String footerVal = footerVals.get(HoodieBloomFilterWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
    if (null == footerVal) {
      // Fall back to the old-style key "com.uber.hoodie.bloomfilter"
      footerVal = footerVals.get(HoodieBloomFilterWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
    }
    BloomFilter toReturn = null;
    if (footerVal != null) {
      if (footerVals.containsKey(HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)) {
        toReturn = BloomFilterFactory.fromString(footerVal,
            footerVals.get(HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE));
      } else {
        toReturn = BloomFilterFactory.fromString(footerVal, BloomFilterTypeCode.SIMPLE.name());
      }
    }
    return toReturn;
  }

  /**
   * Read the min and max record key from the metadata of the given data file.
   *
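   * <p>Usage sketch (hypothetical {@code formatUtils} and path):
   * <pre>{@code
   * String[] minMax = formatUtils.readMinMaxRecordKeys(storage, filePath);
   * String minKey = minMax[0];
   * String maxKey = minMax[1];
   * }</pre>
   *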
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath the data file path.
   * @return an array of two strings, where the first is the min record key and the second is the max record key.
   */
  public String[] readMinMaxRecordKeys(HoodieStorage storage, StoragePath filePath) {
    Map<String, String> minMaxKeys = readFooter(storage, true, filePath,
        HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
    if (minMaxKeys.size() != 2) {
      throw new HoodieException(
          String.format("Could not read min/max record key out of footer correctly from %s, read: %s",
              filePath, minMaxKeys));
    }
    return new String[] {minMaxKeys.get(HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER),
        minMaxKeys.get(HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER)};
  }

  /**
   * Read the data file.
   * NOTE: This literally reads the entire file contents, thus should be used with caution.
   *
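   * <p>Usage sketch (hypothetical {@code formatUtils}; avoid on large files, since all
   * records are materialized in memory):
   * <pre>{@code
   * List<GenericRecord> records = formatUtils.readAvroRecords(storage, filePath);
   * }</pre>
   *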
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath the data file path.
   * @return a list of GenericRecord.
   */
  public abstract List<GenericRecord> readAvroRecords(HoodieStorage storage, StoragePath filePath);

  /**
   * Read the data file using the given schema.
   * NOTE: This literally reads the entire file contents, thus should be used with caution.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath the data file path.
   * @param schema   the Avro schema to use for reading.
   * @return a list of GenericRecord.
   */
  public abstract List<GenericRecord> readAvroRecords(HoodieStorage storage, StoragePath filePath, Schema schema);

  /**
   * Read the footer data of the given data file.
   *
   * @param storage     {@link HoodieStorage} instance.
   * @param required    whether the footer data is required to be present in the data file.
   * @param filePath    the data file path.
   * @param footerNames the footer names to read.
   * @return a map where the key is the footer name and the value is the footer value.
   */
  public abstract Map<String, String> readFooter(HoodieStorage storage, boolean required, StoragePath filePath,
                                                 String... footerNames);

  /**
   * Returns the number of records in the data file.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath the data file path.
   * @return the number of records in the data file.
   */
  public abstract long getRowCount(HoodieStorage storage, StoragePath filePath);

  /**
   * Read the rowKey list matching the given filter, from the given data file.
   * If the filter is empty, then this will return all the row keys and corresponding positions.
   *
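   * <p>Usage sketch (hypothetical {@code formatUtils}; an empty filter returns all keys):
   * <pre>{@code
   * Set<String> candidates = new HashSet<>(Arrays.asList("key1", "key2"));
   * Set<Pair<String, Long>> matches = formatUtils.filterRowKeys(storage, filePath, candidates);
   * }</pre>
   *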
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath the data file path.
   * @param filter   record keys filter.
   * @return set of pairs of row key and row position matching the given filter.
   */
  public abstract Set<Pair<String, Long>> filterRowKeys(HoodieStorage storage, StoragePath filePath, Set<String> filter);

  /**
   * Fetch {@link HoodieKey}s with positions from the given data file.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath the data file path.
   * @return {@link List} of pairs of {@link HoodieKey} and position fetched from the data file.
   */
  public abstract List<Pair<HoodieKey, Long>> fetchRecordKeysWithPositions(HoodieStorage storage, StoragePath filePath);

  /**
   * Provides a closable iterator for reading the given data file.
   *
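   * <p>Usage sketch (hypothetical {@code formatUtils}; {@link ClosableIterator} is
   * {@code AutoCloseable}, so try-with-resources releases the underlying reader):
   * <pre>{@code
   * try (ClosableIterator<HoodieKey> it =
   *          formatUtils.getHoodieKeyIterator(storage, filePath, Option.empty())) {
   *   while (it.hasNext()) {
   *     HoodieKey key = it.next();
   *     // process key
   *   }
   * }
   * }</pre>
   *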
   * @param storage         {@link HoodieStorage} instance.
   * @param filePath        the data file path.
   * @param keyGeneratorOpt instance of KeyGenerator.
   * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the file.
   */
  public abstract ClosableIterator<HoodieKey> getHoodieKeyIterator(HoodieStorage storage,
                                                                   StoragePath filePath,
                                                                   Option<BaseKeyGenerator> keyGeneratorOpt);

  /**
   * Provides a closable iterator for reading the given data file.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath the data file path.
   * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the file.
   */
  public abstract ClosableIterator<HoodieKey> getHoodieKeyIterator(HoodieStorage storage, StoragePath filePath);

  /**
   * Fetch {@link HoodieKey}s with positions from the given data file.
   *
   * @param storage         {@link HoodieStorage} instance.
   * @param filePath        the data file path.
   * @param keyGeneratorOpt instance of KeyGenerator.
   * @return {@link List} of pairs of {@link HoodieKey} and position fetched from the data file.
   */
  public abstract List<Pair<HoodieKey, Long>> fetchRecordKeysWithPositions(HoodieStorage storage,
                                                                           StoragePath filePath,
                                                                           Option<BaseKeyGenerator> keyGeneratorOpt);

  /**
   * Read the Avro schema of the data file.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath the data file path.
   * @return the Avro schema of the data file.
   */
  public abstract Schema readAvroSchema(HoodieStorage storage, StoragePath filePath);

  /**
   * Reads column statistics stored in the metadata.
   *
   * @param storage    {@link HoodieStorage} instance.
   * @param filePath   the data file path.
   * @param columnList List of columns to get column statistics.
   * @return {@link List} of {@link HoodieColumnRangeMetadata}.
   */
  @SuppressWarnings("rawtypes")
  public abstract List<HoodieColumnRangeMetadata<Comparable>> readColumnStatsFromMetadata(HoodieStorage storage,
                                                                                          StoragePath filePath,
                                                                                          List<String> columnList);

  /**
   * @return The subclass's {@link HoodieFileFormat}.
   */
  public abstract HoodieFileFormat getFormat();

  /**
   * Writes properties to the meta file.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath file path to write to.
   * @param props    properties to write.
   * @throws IOException upon write error.
   */
  public abstract void writeMetaFile(HoodieStorage storage,
                                     StoragePath filePath,
                                     Properties props) throws IOException;

  /**
   * Serializes Hudi records to the log block.
   *
   * @param storage      {@link HoodieStorage} instance.
   * @param records      a list of {@link HoodieRecord}.
   * @param writerSchema writer schema string from the log block header.
   * @param readerSchema reader schema for the records.
   * @param keyFieldName name of the field used as the record key.
   * @param paramsMap    additional params for serialization.
   * @return byte array after serialization.
   * @throws IOException upon serialization error.
   */
  public abstract byte[] serializeRecordsToLogBlock(HoodieStorage storage,
                                                    List<HoodieRecord> records,
                                                    Schema writerSchema,
                                                    Schema readerSchema, String keyFieldName,
                                                    Map<String, String> paramsMap) throws IOException;

  // -------------------------------------------------------------------------
  //  Inner Class
  // -------------------------------------------------------------------------

  /**
   * An iterator that can apply the given function {@code func} to transform records
   * from the underlying record iterator to hoodie keys.
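   *
   * <p>Example (hypothetical {@code recordIterator}; an empty key generator falls back to
   * Hudi's meta fields for record key and partition path):
   * <pre>{@code
   * ClosableIterator<HoodieKey> keys =
   *     FileFormatUtils.HoodieKeyIterator.getInstance(recordIterator, Option.empty());
   * }</pre>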
   */
  public static class HoodieKeyIterator implements ClosableIterator<HoodieKey> {
    private final ClosableIterator<GenericRecord> nestedItr;
    private final Function<GenericRecord, HoodieKey> func;

    public static HoodieKeyIterator getInstance(ClosableIterator<GenericRecord> nestedItr, Option<BaseKeyGenerator> keyGenerator) {
      return new HoodieKeyIterator(nestedItr, keyGenerator);
    }

    private HoodieKeyIterator(ClosableIterator<GenericRecord> nestedItr, Option<BaseKeyGenerator> keyGenerator) {
      this.nestedItr = nestedItr;
      if (keyGenerator.isPresent()) {
        this.func = retVal -> {
          String recordKey = keyGenerator.get().getRecordKey(retVal);
          String partitionPath = keyGenerator.get().getPartitionPath(retVal);
          return new HoodieKey(recordKey, partitionPath);
        };
      } else {
        this.func = retVal -> {
          String recordKey = retVal.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
          String partitionPath = retVal.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
          return new HoodieKey(recordKey, partitionPath);
        };
      }
    }

    @Override
    public void close() {
      if (this.nestedItr != null) {
        this.nestedItr.close();
      }
    }

    @Override
    public boolean hasNext() {
      return this.nestedItr.hasNext();
    }

    @Override
    public HoodieKey next() {
      return this.func.apply(this.nestedItr.next());
    }
  }
}