org.apache.hudi.common.util.FileFormatUtils Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import org.apache.hudi.avro.HoodieBloomFilterWriteSupport;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.ClosableIterator;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import javax.annotation.Nonnull;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* Utils for file format used in Hudi.
*/
public abstract class FileFormatUtils {
/**
* Aggregate column range statistics across files in a partition.
*
* @param relativePartitionPath relative partition path for the column range stats
* @param fileColumnRanges List of column range statistics for each file in a partition
*/
public static > HoodieColumnRangeMetadata getColumnRangeInPartition(String relativePartitionPath,
@Nonnull List> fileColumnRanges) {
ValidationUtils.checkArgument(!fileColumnRanges.isEmpty(), "fileColumnRanges should not be empty.");
// There are multiple files. Compute min(file_mins) and max(file_maxs)
return fileColumnRanges.stream()
.map(e -> HoodieColumnRangeMetadata.create(
relativePartitionPath, e.getColumnName(), e.getMinValue(), e.getMaxValue(),
e.getNullCount(), e.getValueCount(), e.getTotalSize(), e.getTotalUncompressedSize()))
.reduce(HoodieColumnRangeMetadata::merge).orElseThrow(() -> new HoodieException("MergingColumnRanges failed."));
}
/**
* Read the rowKey list from the given data file.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @return set of row keys
*/
public Set readRowKeys(HoodieStorage storage, StoragePath filePath) {
return filterRowKeys(storage, filePath, new HashSet<>())
.stream().map(Pair::getKey).collect(Collectors.toSet());
}
/**
* Read the bloom filter from the metadata of the given data file.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @return a BloomFilter object.
*/
public BloomFilter readBloomFilterFromMetadata(HoodieStorage storage, StoragePath filePath) {
Map footerVals =
readFooter(storage, false, filePath,
HoodieBloomFilterWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
HoodieBloomFilterWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY,
HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE);
String footerVal = footerVals.get(HoodieBloomFilterWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
if (null == footerVal) {
// We use old style key "com.uber.hoodie.bloomfilter"
footerVal = footerVals.get(HoodieBloomFilterWriteSupport.OLD_HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY);
}
BloomFilter toReturn = null;
if (footerVal != null) {
if (footerVals.containsKey(HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE)) {
toReturn = BloomFilterFactory.fromString(footerVal,
footerVals.get(HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE));
} else {
toReturn = BloomFilterFactory.fromString(footerVal, BloomFilterTypeCode.SIMPLE.name());
}
}
return toReturn;
}
/**
* Read the min and max record key from the metadata of the given data file.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @return an array of two string where the first is min record key and the second is max record key.
*/
public String[] readMinMaxRecordKeys(HoodieStorage storage, StoragePath filePath) {
Map minMaxKeys = readFooter(storage, true, filePath,
HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
if (minMaxKeys.size() != 2) {
throw new HoodieException(
String.format("Could not read min/max record key out of footer correctly from %s. read) : %s",
filePath, minMaxKeys));
}
return new String[] {minMaxKeys.get(HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER),
minMaxKeys.get(HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER)};
}
/**
* Read the data file
* NOTE: This literally reads the entire file contents, thus should be used with caution.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @return a list of GenericRecord.
*/
public abstract List readAvroRecords(HoodieStorage storage, StoragePath filePath);
/**
* Read the data file using the given schema
* NOTE: This literally reads the entire file contents, thus should be used with caution.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @return a list of GenericRecord.
*/
public abstract List readAvroRecords(HoodieStorage storage, StoragePath filePath, Schema schema);
/**
* Read the footer data of the given data file.
*
* @param storage {@link HoodieStorage} instance.
* @param required require the footer data to be in data file.
* @param filePath the data file path.
* @param footerNames the footer names to read.
* @return a map where the key is the footer name and the value is the footer value.
*/
public abstract Map readFooter(HoodieStorage storage, boolean required, StoragePath filePath,
String... footerNames);
/**
* Returns the number of records in the data file.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
*/
public abstract long getRowCount(HoodieStorage storage, StoragePath filePath);
/**
* Read the rowKey list matching the given filter, from the given data file.
* If the filter is empty, then this will return all the row keys and corresponding positions.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @param filter record keys filter.
* @return set of pairs of row key and position matching candidateRecordKeys.
*/
public abstract Set> filterRowKeys(HoodieStorage storage, StoragePath filePath, Set filter);
/**
* Fetch {@link HoodieKey}s with positions from the given data file.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @return {@link List} of pairs of {@link HoodieKey} and position fetched from the data file.
*/
public abstract List> fetchRecordKeysWithPositions(HoodieStorage storage, StoragePath filePath);
/**
* Provides a closable iterator for reading the given data file.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @param keyGeneratorOpt instance of KeyGenerator.
* @return {@link ClosableIterator} of {@link HoodieKey}s for reading the file.
*/
public abstract ClosableIterator getHoodieKeyIterator(HoodieStorage storage,
StoragePath filePath,
Option keyGeneratorOpt);
/**
* Provides a closable iterator for reading the given data file.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @return {@link ClosableIterator} of {@link HoodieKey}s for reading the file.
*/
public abstract ClosableIterator getHoodieKeyIterator(HoodieStorage storage, StoragePath filePath);
/**
* Fetch {@link HoodieKey}s with positions from the given data file.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @param keyGeneratorOpt instance of KeyGenerator.
* @return {@link List} of pairs of {@link HoodieKey} and position fetched from the data file.
*/
public abstract List> fetchRecordKeysWithPositions(HoodieStorage storage,
StoragePath filePath,
Option keyGeneratorOpt);
/**
* Read the Avro schema of the data file.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @return the Avro schema of the data file.
*/
public abstract Schema readAvroSchema(HoodieStorage storage, StoragePath filePath);
/**
* Reads column statistics stored in the metadata.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath the data file path.
* @param columnList List of columns to get column statistics.
* @return {@link List} of {@link HoodieColumnRangeMetadata}.
*/
@SuppressWarnings("rawtype")
public abstract List> readColumnStatsFromMetadata(HoodieStorage storage,
StoragePath filePath,
List columnList);
/**
* @return The subclass's {@link HoodieFileFormat}.
*/
public abstract HoodieFileFormat getFormat();
/**
* Writes properties to the meta file.
*
* @param storage {@link HoodieStorage} instance.
* @param filePath file path to write to.
* @param props properties to write.
* @throws IOException upon write error.
*/
public abstract void writeMetaFile(HoodieStorage storage,
StoragePath filePath,
Properties props) throws IOException;
/**
* Serializes Hudi records to the log block.
*
* @param storage {@link HoodieStorage} instance.
* @param records a list of {@link HoodieRecord}.
* @param writerSchema writer schema string from the log block header.
* @param readerSchema
* @param keyFieldName
* @param paramsMap additional params for serialization.
* @return byte array after serialization.
* @throws IOException upon serialization error.
*/
public abstract byte[] serializeRecordsToLogBlock(HoodieStorage storage,
List records,
Schema writerSchema,
Schema readerSchema, String keyFieldName,
Map paramsMap) throws IOException;
// -------------------------------------------------------------------------
// Inner Class
// -------------------------------------------------------------------------
/**
* An iterator that can apply the given function {@code func} to transform records
* from the underneath record iterator to hoodie keys.
*/
public static class HoodieKeyIterator implements ClosableIterator {
private final ClosableIterator nestedItr;
private final Function func;
public static HoodieKeyIterator getInstance(ClosableIterator nestedItr, Option keyGenerator) {
return new HoodieKeyIterator(nestedItr, keyGenerator);
}
private HoodieKeyIterator(ClosableIterator nestedItr, Option keyGenerator) {
this.nestedItr = nestedItr;
if (keyGenerator.isPresent()) {
this.func = retVal -> {
String recordKey = keyGenerator.get().getRecordKey(retVal);
String partitionPath = keyGenerator.get().getPartitionPath(retVal);
return new HoodieKey(recordKey, partitionPath);
};
} else {
this.func = retVal -> {
String recordKey = retVal.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String partitionPath = retVal.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
return new HoodieKey(recordKey, partitionPath);
};
}
}
@Override
public void close() {
if (this.nestedItr != null) {
this.nestedItr.close();
}
}
@Override
public boolean hasNext() {
return this.nestedItr.hasNext();
}
@Override
public HoodieKey next() {
return this.func.apply(this.nestedItr.next());
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy