// Source listing of org.apache.hudi.common.util.ParquetUtils, as shipped in the hudi-flink1.17-bundle artifact (newest version).
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.common.util;

import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.HoodieAvroWriteSupport;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.ClosableIterator;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.MetadataNotFoundException;
import org.apache.hudi.io.storage.HoodieFileWriter;
import org.apache.hudi.io.storage.HoodieFileWriterFactory;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.DecimalMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.hudi.common.config.HoodieStorageConfig.PARQUET_BLOCK_SIZE;
import static org.apache.hudi.common.config.HoodieStorageConfig.PARQUET_MAX_FILE_SIZE;
import static org.apache.hudi.common.config.HoodieStorageConfig.PARQUET_PAGE_SIZE;
import static org.apache.hudi.hadoop.fs.HadoopFSUtils.convertToStoragePath;

/**
 * Utility functions for working with Parquet files.
 */
public class ParquetUtils extends FileFormatUtils {

  private static final Logger LOG = LoggerFactory.getLogger(ParquetUtils.class);

  /**
   * Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will
   * return all the rowkeys and corresponding positions.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath The parquet file path.
   * @param filter   record keys filter
   * @return Set Set of pairs of row key and position matching candidateRecordKeys
   */
  @Override
  public Set> filterRowKeys(HoodieStorage storage, StoragePath filePath, Set filter) {
    return filterParquetRowKeys(storage, new Path(filePath.toUri()), filter, HoodieAvroUtils.getRecordKeySchema());
  }

  /**
   * Reads the footer of the given parquet file.
   *
   * @param storage         {@link HoodieStorage} instance.
   * @param parquetFilePath The parquet file path.
   * @return the {@link ParquetMetadata} held in the file footer
   * @throws HoodieIOException if the footer cannot be read
   */
  public static ParquetMetadata readMetadata(HoodieStorage storage, StoragePath parquetFilePath) {
    Path hadoopPath = new Path(parquetFilePath.toUri());
    try {
      // Hadoop configuration of a storage instance created for this specific path
      Configuration hadoopConf = storage.newInstance(parquetFilePath, storage.getConf())
          .getConf()
          .unwrapAs(Configuration.class);
      // TODO(vc): Should we use the parallel reading version here?
      return ParquetFileReader.readFooter(hadoopConf, hadoopPath);
    } catch (IOException e) {
      throw new HoodieIOException("Failed to read footer for parquet " + hadoopPath, e);
    }
  }

  /**
   * Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will
   * return all the rowkeys.
   *
   * @param storage    {@link HoodieStorage} instance.
   * @param filePath   The parquet file path.
   * @param filter     record keys filter
   * @param readSchema schema of columns to be read
   * @return Set of pairs of row key and position matching candidateRecordKeys
   */
  private static Set> filterParquetRowKeys(HoodieStorage storage,
                                                              Path filePath, Set filter,
                                                              Schema readSchema) {
    Option filterFunction = Option.empty();
    if (filter != null && !filter.isEmpty()) {
      filterFunction = Option.of(new RecordKeysFilterFunction(filter));
    }
    Configuration conf = storage.getConf().unwrapCopyAs(Configuration.class);
    conf.addResource(storage.newInstance(convertToStoragePath(filePath), storage.getConf()).getConf().unwrapAs(Configuration.class));
    AvroReadSupport.setAvroReadSchema(conf, readSchema);
    AvroReadSupport.setRequestedProjection(conf, readSchema);
    Set> rowKeys = new HashSet<>();
    long rowPosition = 0;
    try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build()) {
      Object obj = reader.read();
      while (obj != null) {
        if (obj instanceof GenericRecord) {
          String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
          if (!filterFunction.isPresent() || filterFunction.get().apply(recordKey)) {
            rowKeys.add(Pair.of(recordKey, rowPosition));
          }
          obj = reader.read();
          rowPosition++;
        }
      }
    } catch (IOException e) {
      throw new HoodieIOException("Failed to read row keys from Parquet " + filePath, e);

    }
    // ignore
    return rowKeys;
  }

  /**
   * Resolves a codec name to its {@link CompressionCodecName}. A {@code null} or empty name is
   * passed on as {@code null}.
   *
   * @param codecName codec name in String.
   * @return {@link CompressionCodecName} Enum.
   */
  public static CompressionCodecName getCompressionCodecName(String codecName) {
    String effectiveName = codecName;
    if (StringUtils.isNullOrEmpty(codecName)) {
      effectiveName = null;
    }
    return CompressionCodecName.fromConf(effectiveName);
  }

  /**
   * Fetch {@link HoodieKey}s with row positions from the given parquet file.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath The parquet file path.
   * @return {@link List} of pairs of {@link HoodieKey} and row position fetched from the parquet file
   */
  @Override
  public List> fetchRecordKeysWithPositions(HoodieStorage storage, StoragePath filePath) {
    return fetchRecordKeysWithPositions(storage, filePath, Option.empty());
  }

  /**
   * Returns a closable iterator over the {@link HoodieKey}s of the given parquet file, without a
   * key generator.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath The parquet file path
   * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the parquet file
   */
  @Override
  public ClosableIterator<HoodieKey> getHoodieKeyIterator(HoodieStorage storage, StoragePath filePath) {
    return getHoodieKeyIterator(storage, filePath, Option.empty());
  }

  /**
   * Returns a closable iterator over the {@link HoodieKey}s of the given parquet file.
   *
   * <p>When a key generator is supplied, only its record key fields and partition path fields are
   * read from the file; otherwise the schema returned by
   * {@link HoodieAvroUtils#getRecordKeyPartitionPathSchema()} is used.
   *
   * @param storage         {@link HoodieStorage} instance.
   * @param filePath        The parquet file path
   * @param keyGeneratorOpt optional key generator
   * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the parquet file
   * @throws HoodieIOException if the parquet reader cannot be opened
   */
  @Override
  public ClosableIterator<HoodieKey> getHoodieKeyIterator(HoodieStorage storage, StoragePath filePath, Option<BaseKeyGenerator> keyGeneratorOpt) {
    try {
      // Work on a copy of the configuration: the read schema and projection are set on it below
      Configuration conf = storage.getConf().unwrapCopyAs(Configuration.class);
      conf.addResource(storage.newInstance(filePath, storage.getConf()).getConf().unwrapAs(Configuration.class));
      Schema readSchema = keyGeneratorOpt
          .map(keyGenerator -> {
            List<String> fields = new ArrayList<>();
            fields.addAll(keyGenerator.getRecordKeyFieldNames());
            fields.addAll(keyGenerator.getPartitionPathFields());
            return HoodieAvroUtils.getSchemaForFields(readAvroSchema(storage, filePath), fields);
          })
          .orElse(HoodieAvroUtils.getRecordKeyPartitionPathSchema());
      AvroReadSupport.setAvroReadSchema(conf, readSchema);
      AvroReadSupport.setRequestedProjection(conf, readSchema);
      ParquetReader<GenericRecord> reader =
          AvroParquetReader.<GenericRecord>builder(new Path(filePath.toUri())).withConf(conf).build();
      return HoodieKeyIterator.getInstance(new ParquetReaderIterator<>(reader), keyGeneratorOpt);
    } catch (IOException e) {
      throw new HoodieIOException("Failed to read from Parquet file " + filePath, e);
    }
  }

  /**
   * Fetch {@link HoodieKey}s with row positions from the given parquet file.
   *
   * @param storage         {@link HoodieStorage} instance.
   * @param filePath        The parquet file path.
   * @param keyGeneratorOpt instance of KeyGenerator.
   * @return {@link List} of pairs of {@link HoodieKey} and row position fetched from the parquet file
   */
  @Override
  public List> fetchRecordKeysWithPositions(HoodieStorage storage, StoragePath filePath, Option keyGeneratorOpt) {
    List> hoodieKeysAndPositions = new ArrayList<>();
    long position = 0;
    try (ClosableIterator iterator = getHoodieKeyIterator(storage, filePath, keyGeneratorOpt)) {
      while (iterator.hasNext()) {
        hoodieKeysAndPositions.add(Pair.of(iterator.next(), position));
        position++;
      }
      return hoodieKeysAndPositions;
    }
  }

  /**
   * Get the schema of the given parquet file, as recorded in its footer.
   *
   * @param storage         {@link HoodieStorage} instance.
   * @param parquetFilePath The parquet file path.
   * @return the parquet {@link MessageType} of the file
   */
  public MessageType readSchema(HoodieStorage storage, StoragePath parquetFilePath) {
    ParquetMetadata footer = readMetadata(storage, parquetFilePath);
    return footer.getFileMetaData().getSchema();
  }

  /**
   * Looks up the given keys in the key-value metadata of the parquet file footer.
   *
   * @param storage     {@link HoodieStorage} instance.
   * @param required    whether a missing key is an error
   * @param filePath    The parquet file path.
   * @param footerNames keys to look up in the footer
   * @return map from each key found in the footer to its value; keys that are absent are left out
   *         when {@code required} is {@code false}
   * @throws MetadataNotFoundException if {@code required} is {@code true} and a key is absent
   */
  @Override
  public Map<String, String> readFooter(HoodieStorage storage, boolean required,
                                        StoragePath filePath, String... footerNames) {
    Map<String, String> footerVals = new HashMap<>();
    ParquetMetadata footer = readMetadata(storage, filePath);
    Map<String, String> metadata = footer.getFileMetaData().getKeyValueMetaData();
    for (String footerName : footerNames) {
      if (metadata.containsKey(footerName)) {
        footerVals.put(footerName, metadata.get(footerName));
      } else if (required) {
        throw new MetadataNotFoundException(
            "Could not find index in Parquet footer. Looked for key " + footerName + " in " + filePath);
      }
    }
    return footerVals;
  }

  /**
   * Reads the parquet schema of the given file and converts it to an Avro {@link Schema}.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath The parquet file path.
   * @return the Avro schema converted from the file's parquet schema
   */
  @Override
  public Schema readAvroSchema(HoodieStorage storage, StoragePath filePath) {
    MessageType fileSchema = readSchema(storage, filePath);
    Configuration hadoopConf = storage.getConf().unwrapAs(Configuration.class);
    return new AvroSchemaConverter(hadoopConf).convert(fileSchema);
  }

  @Override
  public List> readColumnStatsFromMetadata(HoodieStorage storage,
                                                                                 StoragePath filePath,
                                                                                 List columnList) {
    ParquetMetadata metadata = readMetadata(storage, filePath);

    // Collect stats from all individual Parquet blocks
    Stream> hoodieColumnRangeMetadataStream =
        metadata.getBlocks().stream().sequential().flatMap(blockMetaData ->
            blockMetaData.getColumns().stream()
                    .filter(f -> columnList.contains(f.getPath().toDotString()))
                    .map(columnChunkMetaData -> {
                      Statistics stats = columnChunkMetaData.getStatistics();
                      return (HoodieColumnRangeMetadata) HoodieColumnRangeMetadata.create(
                          filePath.getName(),
                          columnChunkMetaData.getPath().toDotString(),
                          convertToNativeJavaType(
                              columnChunkMetaData.getPrimitiveType(),
                              stats.genericGetMin()),
                          convertToNativeJavaType(
                              columnChunkMetaData.getPrimitiveType(),
                              stats.genericGetMax()),
                          // NOTE: In case when column contains only nulls Parquet won't be creating
                          //       stats for it instead returning stubbed (empty) object. In that case
                          //       we have to equate number of nulls to the value count ourselves
                          stats.isEmpty() ? columnChunkMetaData.getValueCount() : stats.getNumNulls(),
                          columnChunkMetaData.getValueCount(),
                          columnChunkMetaData.getTotalSize(),
                          columnChunkMetaData.getTotalUncompressedSize());
                    })
        );

    Map>> columnToStatsListMap =
        hoodieColumnRangeMetadataStream.collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName));

    // Combine those into file-level statistics
    // NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer
    // expression type correctly)
    Stream> stream = columnToStatsListMap.values()
        .stream()
        .map(this::getColumnRangeInFile);

    return stream.collect(Collectors.toList());
  }

  /**
   * @return {@link HoodieFileFormat#PARQUET}, the file format this utility class reports.
   */
  @Override
  public HoodieFileFormat getFormat() {
    return HoodieFileFormat.PARQUET;
  }

  /**
   * Reads all Avro records of the given parquet file using the storage configuration as is.
   *
   * <p>NOTE: This literally reads the entire file contents, thus should be used with caution.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath The parquet file path.
   * @return all records of the file that are {@link GenericRecord}s
   * @throws HoodieIOException if the file cannot be read
   */
  @Override
  public List<GenericRecord> readAvroRecords(HoodieStorage storage, StoragePath filePath) {
    return readAllAvroRecords(filePath, storage.getConf().unwrapAs(Configuration.class));
  }

  /**
   * Reads all Avro records of the given parquet file using {@code schema} as the Avro read schema.
   *
   * <p>NOTE: This literally reads the entire file contents, thus should be used with caution.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath The parquet file path.
   * @param schema   Avro schema to read the records with
   * @return all records of the file that are {@link GenericRecord}s
   * @throws HoodieIOException if the file cannot be read
   */
  @Override
  public List<GenericRecord> readAvroRecords(HoodieStorage storage, StoragePath filePath, Schema schema) {
    // Set the read schema on a private copy of the configuration. Setting it on the instance handed
    // out by unwrapAs would (presumably, as that is the live storage configuration) leak the schema
    // into every later read that goes through the same storage.
    Configuration conf = storage.getConf().unwrapCopyAs(Configuration.class);
    AvroReadSupport.setAvroReadSchema(conf, schema);
    return readAllAvroRecords(filePath, conf);
  }

  /**
   * Reads the whole parquet file with the given configuration, keeping the records that are
   * {@link GenericRecord}s.
   */
  private static List<GenericRecord> readAllAvroRecords(StoragePath filePath, Configuration conf) {
    List<GenericRecord> records = new ArrayList<>();
    try (ParquetReader<Object> reader = AvroParquetReader.<Object>builder(new Path(filePath.toUri()))
        .withConf(conf).build()) {
      Object obj = reader.read();
      while (obj != null) {
        if (obj instanceof GenericRecord) {
          records.add((GenericRecord) obj);
        }
        obj = reader.read();
      }
    } catch (IOException e) {
      throw new HoodieIOException("Failed to read avro records from Parquet " + filePath, e);
    }
    return records;
  }

  /**
   * Returns the number of records in the parquet file, computed as the sum of the row counts of
   * all blocks listed in the file footer.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath path of the file
   * @return total number of rows across all blocks
   */
  @Override
  public long getRowCount(HoodieStorage storage, StoragePath filePath) {
    return readMetadata(storage, filePath).getBlocks().stream()
        .mapToLong(BlockMetaData::getRowCount)
        .sum();
  }

  /**
   * Writes a parquet file that contains no records and carries the given properties as footer
   * metadata.
   *
   * @param storage  {@link HoodieStorage} instance.
   * @param filePath path of the file to write
   * @param props    properties to store in the footer, one footer entry per string property name
   * @throws IOException if the file cannot be written
   */
  @Override
  public void writeMetaFile(HoodieStorage storage,
                            StoragePath filePath,
                            Properties props) throws IOException {
    // Only the footer metadata matters here, so the schema, the sizes handed to the writer and the
    // other parameters are placeholders.
    Schema recordKeySchema = HoodieAvroUtils.getRecordKeySchema();
    MessageType placeholderType = Types.buildMessage()
        .optional(PrimitiveType.PrimitiveTypeName.INT64).named("dummyint")
        .named("dummy");
    HoodieAvroWriteSupport footerWriteSupport =
        new HoodieAvroWriteSupport(placeholderType, recordKeySchema, Option.empty(), new Properties());
    Path targetPath = new Path(filePath.toUri());
    try (ParquetWriter writer = new ParquetWriter(targetPath, footerWriteSupport, CompressionCodecName.UNCOMPRESSED, 1024, 1024)) {
      // Entries are registered while the writer is open; no record is ever written
      props.stringPropertyNames()
          .forEach(key -> footerWriteSupport.addFooterMetadata(key, props.getProperty(key)));
    }
  }

  /**
   * Serializes the given records into an in-memory parquet file and returns its bytes.
   *
   * <p>The writer is configured from {@code paramsMap}; the parquet block size, page size and
   * maximum file size are then set to fixed values, overriding any such entries in the map.
   *
   * @param storage      {@link HoodieStorage} instance.
   * @param records      records to serialize
   * @param writerSchema schema the records are written with
   * @param readerSchema schema used to extract the record key from each record
   * @param keyFieldName name of the record key field
   * @param paramsMap    additional writer configs
   * @return the serialized bytes, or an empty array when {@code records} is empty
   * @throws IOException if writing fails
   */
  @Override
  public byte[] serializeRecordsToLogBlock(HoodieStorage storage,
                                           List<HoodieRecord> records,
                                           Schema writerSchema,
                                           Schema readerSchema,
                                           String keyFieldName,
                                           Map<String, String> paramsMap) throws IOException {
    if (records.isEmpty()) {
      return new byte[0];
    }

    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    HoodieConfig config = new HoodieConfig();
    paramsMap.forEach((key, value) -> config.setValue(key, value));
    config.setValue(PARQUET_BLOCK_SIZE.key(), String.valueOf(ParquetWriter.DEFAULT_BLOCK_SIZE));
    config.setValue(PARQUET_PAGE_SIZE.key(), String.valueOf(ParquetWriter.DEFAULT_PAGE_SIZE));
    config.setValue(PARQUET_MAX_FILE_SIZE.key(), String.valueOf(1024 * 1024 * 1024));
    // The record type of the first record selects the writer implementation
    HoodieRecord.HoodieRecordType recordType = records.iterator().next().getRecordType();
    try (HoodieFileWriter parquetWriter = HoodieFileWriterFactory.getFileWriter(
        HoodieFileFormat.PARQUET, outputStream, storage, config, writerSchema, recordType)) {
      for (HoodieRecord record : records) {
        String recordKey = record.getRecordKey(readerSchema, keyFieldName);
        parquetWriter.write(recordKey, record, writerSchema);
      }
      outputStream.flush();
    }
    // The bytes are taken only after the writer has been closed by the try-with-resources block
    return outputStream.toByteArray();
  }

  static class RecordKeysFilterFunction implements Function {

    private final Set candidateKeys;

    RecordKeysFilterFunction(Set candidateKeys) {
      this.candidateKeys = candidateKeys;
    }

    @Override
    public Boolean apply(String recordKey) {
      return candidateKeys.contains(recordKey);
    }
  }

  private > HoodieColumnRangeMetadata getColumnRangeInFile(
      @Nonnull List> blockRanges
  ) {
    if (blockRanges.size() == 1) {
      // only one block in parquet file. we can just return that range.
      return blockRanges.get(0);
    }

    // there are multiple blocks. Compute min(block_mins) and max(block_maxs)
    return blockRanges.stream()
        .sequential()
        .reduce(HoodieColumnRangeMetadata::merge).get();
  }

  /**
   * Converts a value taken from parquet statistics into a plain Java value, based on the type of
   * the column it belongs to: DECIMAL becomes {@link BigDecimal}, DATE becomes
   * {@link java.sql.Date}, UTF8 becomes {@link String} and any other BINARY becomes a byte buffer.
   * Every other value is returned unchanged.
   *
   * @param primitiveType parquet type of the column
   * @param val           statistics value, may be {@code null}
   * @return the converted value, or {@code null} when {@code val} is {@code null}
   */
  private static Comparable convertToNativeJavaType(PrimitiveType primitiveType, Comparable val) {
    if (val == null) {
      return null;
    }

    OriginalType originalType = primitiveType.getOriginalType();

    if (originalType == OriginalType.DECIMAL) {
      return extractDecimal(val, primitiveType.getDecimalMetadata());
    }

    if (originalType == OriginalType.DATE) {
      // NOTE: This is a workaround to address race-condition in using
      //       {@code SimpleDataFormat} concurrently (w/in {@code DateStringifier})
      // TODO cleanup after Parquet upgrade to 1.12
      synchronized (primitiveType.stringifier()) {
        // Date logical type is implemented as a signed INT32
        // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
        String dateString = primitiveType.stringifier().stringify((Integer) val);
        return java.sql.Date.valueOf(dateString);
      }
    }

    if (originalType == OriginalType.UTF8) {
      // NOTE: UTF8 type designates a byte array that should be interpreted as a
      // UTF-8 encoded character string
      // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
      return ((Binary) val).toStringUsingUTF8();
    }

    if (primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.BINARY) {
      // NOTE: `getBytes` access makes a copy of the underlying byte buffer
      return ((Binary) val).toByteBuffer();
    }

    return val;
  }

  /**
   * Converts a parquet decimal value into a {@link BigDecimal} with the scale taken from the
   * decimal metadata.
   *
   * @param val             unscaled value as an {@link Integer}, a {@link Long} or a {@link Binary};
   *                        may be {@code null}
   * @param decimalMetadata decimal metadata of the column, supplying the scale
   * @return the decimal value, or {@code null} when {@code val} is {@code null}
   * @throws UnsupportedOperationException if {@code val} is of any other type
   */
  private static BigDecimal extractDecimal(Object val, DecimalMetadata decimalMetadata) {
    // In Parquet, Decimal could be represented as either of
    //    1. INT32 (for 1 <= precision <= 9)
    //    2. INT64 (for 1 <= precision <= 18)
    //    3. FIXED_LEN_BYTE_ARRAY (precision is limited by the array size. Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits)
    //    4. BINARY (precision is not limited)
    // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#DECIMAL
    if (val == null) {
      // Nothing to convert; checked before the metadata is touched
      return null;
    }

    int scale = decimalMetadata.getScale();
    if (val instanceof Integer) {
      return BigDecimal.valueOf((Integer) val, scale);
    } else if (val instanceof Long) {
      return BigDecimal.valueOf((Long) val, scale);
    } else if (val instanceof Binary) {
      // NOTE: Unscaled number is stored in BE format (most significant byte is 0th)
      return new BigDecimal(new BigInteger(((Binary) val).getBytesUnsafe()), scale);
    } else {
      throw new UnsupportedOperationException(String.format("Unsupported value type (%s)", val.getClass().getName()));
    }
  }
}