/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.common.util;

import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.ClosableIterator;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.io.compress.CompressionCodec;
import org.apache.hudi.io.storage.HoodieAvroHFileReaderImplBase;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieHBaseKVComparator;
import org.apache.hudi.io.storage.HoodieIOFactory;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;

import static org.apache.hudi.common.config.HoodieStorageConfig.HFILE_COMPRESSION_ALGORITHM_NAME;
import static org.apache.hudi.common.util.StringUtils.getUTF8Bytes;

/**
 * Utility functions for HFile files.
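 *
 * <p>Illustrative usage (a minimal sketch; the storage instance and file path below are
 * hypothetical placeholders, not values supplied by this class):
 * <pre>{@code
 *   HoodieStorage storage = ...; // storage backing the Hudi table
 *   StoragePath filePath = new StoragePath("/tmp/example.hfile"); // hypothetical path
 *   Schema schema = new HFileUtils().readAvroSchema(storage, filePath);
 * }</pre>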
 */
public class HFileUtils extends FileFormatUtils {
  private static final Logger LOG = LoggerFactory.getLogger(HFileUtils.class);
  private static final int DEFAULT_BLOCK_SIZE_FOR_LOG_FILE = 1024 * 1024;

  /**
   * Gets the {@link Compression.Algorithm} Enum based on the {@link CompressionCodec} name.
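   * Falls back to {@link Compression.Algorithm#GZ} when the config value is absent or empty.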
   *
   * @param paramsMap parameter map containing the compression codec config.
   * @return the {@link Compression.Algorithm} Enum.
   */
  public static Compression.Algorithm getHFileCompressionAlgorithm(Map<String, String> paramsMap) {
    String algoName = paramsMap.get(HFILE_COMPRESSION_ALGORITHM_NAME.key());
    if (StringUtils.isNullOrEmpty(algoName)) {
      return Compression.Algorithm.GZ;
    }
    return Compression.Algorithm.valueOf(algoName.toUpperCase());
  }

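  // NOTE: Several of the generic FileFormatUtils operations below are not supported for the
  // HFile format and throw UnsupportedOperationException; HFile content is read through
  // format-specific readers (e.g., HoodieAvroHFileReaderImplBase) instead.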
  @Override
  public List<GenericRecord> readAvroRecords(HoodieStorage storage, StoragePath filePath) {
    throw new UnsupportedOperationException("HFileUtils does not support readAvroRecords");
  }

  @Override
  public List<GenericRecord> readAvroRecords(HoodieStorage storage, StoragePath filePath, Schema schema) {
    throw new UnsupportedOperationException("HFileUtils does not support readAvroRecords");
  }

  @Override
  public Map<String, String> readFooter(HoodieStorage storage, boolean required, StoragePath filePath, String... footerNames) {
    throw new UnsupportedOperationException("HFileUtils does not support readFooter");
  }

  @Override
  public long getRowCount(HoodieStorage storage, StoragePath filePath) {
    throw new UnsupportedOperationException("HFileUtils does not support getRowCount");
  }

  @Override
  public Set<Pair<String, Long>> filterRowKeys(HoodieStorage storage, StoragePath filePath, Set<String> filter) {
    throw new UnsupportedOperationException("HFileUtils does not support filterRowKeys");
  }

  @Override
  public List<Pair<HoodieKey, Long>> fetchRecordKeysWithPositions(HoodieStorage storage, StoragePath filePath) {
    throw new UnsupportedOperationException("HFileUtils does not support fetchRecordKeysWithPositions");
  }

  @Override
  public ClosableIterator<HoodieKey> getHoodieKeyIterator(HoodieStorage storage, StoragePath filePath, Option<BaseKeyGenerator> keyGeneratorOpt) {
    throw new UnsupportedOperationException("HFileUtils does not support getHoodieKeyIterator");
  }

  @Override
  public ClosableIterator<HoodieKey> getHoodieKeyIterator(HoodieStorage storage, StoragePath filePath) {
    throw new UnsupportedOperationException("HFileUtils does not support getHoodieKeyIterator");
  }

  @Override
  public List<Pair<HoodieKey, Long>> fetchRecordKeysWithPositions(HoodieStorage storage, StoragePath filePath, Option<BaseKeyGenerator> keyGeneratorOpt) {
    throw new UnsupportedOperationException("HFileUtils does not support fetchRecordKeysWithPositions");
  }

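  /**
   * Reads the Avro schema stored in the HFile by opening an Avro {@link HoodieFileReader}
   * on the given path and returning its schema.
   *
   * @param storage  storage abstraction used to open the file.
   * @param filePath path of the HFile to read.
   * @return the Avro {@link Schema} of the records in the file.
   */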
  @Override
  public Schema readAvroSchema(HoodieStorage storage, StoragePath filePath) {
    LOG.info("Reading schema from {}", filePath);

    try (HoodieFileReader fileReader =
             HoodieIOFactory.getIOFactory(storage)
                 .getReaderFactory(HoodieRecord.HoodieRecordType.AVRO)
                 .getFileReader(
                     ConfigUtils.DEFAULT_HUDI_CONFIG_FOR_READER,
                     filePath)) {
      return fileReader.getSchema();
    } catch (IOException e) {
      throw new HoodieIOException("Failed to read schema from HFile", e);
    }
  }

  @Override
  public List<HoodieColumnRangeMetadata<Comparable>> readColumnStatsFromMetadata(HoodieStorage storage, StoragePath filePath, List<String> columnList) {
    throw new UnsupportedOperationException(
        "Reading column statistics from metadata is not supported for HFile format yet");
  }

  @Override
  public HoodieFileFormat getFormat() {
    return HoodieFileFormat.HFILE;
  }

  @Override
  public void writeMetaFile(HoodieStorage storage, StoragePath filePath, Properties props) throws IOException {
    throw new UnsupportedOperationException("HFileUtils does not support writeMetaFile");
  }

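  /**
   * Serializes the given records into the bytes of an in-memory HFile for use in an HFile log
   * block: records are sorted by record key (or by a padded incrementing counter when no key is
   * available), each record is written as a {@link KeyValue}, and the reader schema is stored in
   * the HFile's file info under {@link HoodieAvroHFileReaderImplBase#SCHEMA_KEY}.
   */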
  @Override
  public byte[] serializeRecordsToLogBlock(HoodieStorage storage,
                                           List<HoodieRecord> records,
                                           Schema writerSchema,
                                           Schema readerSchema,
                                           String keyFieldName,
                                           Map<String, String> paramsMap) throws IOException {
    Compression.Algorithm compressionAlgorithm = getHFileCompressionAlgorithm(paramsMap);
    HFileContext context = new HFileContextBuilder()
        .withBlockSize(DEFAULT_BLOCK_SIZE_FOR_LOG_FILE)
        .withCompression(compressionAlgorithm)
        .withCellComparator(new HoodieHBaseKVComparator())
        .build();

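    // Write the HFile to an in-memory stream so the serialized bytes can be returned for the log block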
    Configuration conf = storage.getConf().unwrapAs(Configuration.class);
    CacheConfig cacheConfig = new CacheConfig(conf);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    FSDataOutputStream ostream = new FSDataOutputStream(baos, null);

    // Use simple incrementing counter as a key
    boolean useIntegerKey = !getRecordKey(records.get(0), readerSchema, keyFieldName).isPresent();
    // This is set here to avoid re-computing this in the loop
    int keyWidth = useIntegerKey ? (int) Math.ceil(Math.log(records.size())) + 1 : -1;

    // Serialize records into bytes
    Map<String, List<byte[]>> sortedRecordsMap = new TreeMap<>();

    Iterator<HoodieRecord> itr = records.iterator();
    int id = 0;
    while (itr.hasNext()) {
      HoodieRecord record = itr.next();
      String recordKey;
      if (useIntegerKey) {
        recordKey = String.format("%" + keyWidth + "s", id++);
      } else {
        recordKey = getRecordKey(record, readerSchema, keyFieldName).get();
      }

      final byte[] recordBytes = serializeRecord(record, writerSchema, keyFieldName);
      // Append to the existing list of serialized bytes for this key, creating the list if absent
      List<byte[]> recordBytesList = sortedRecordsMap.getOrDefault(recordKey, new ArrayList<>());
      recordBytesList.add(recordBytes);
      // Put the updated list back into the map
      sortedRecordsMap.put(recordKey, recordBytesList);
    }

    HFile.Writer writer = HFile.getWriterFactory(conf, cacheConfig)
        .withOutputStream(ostream).withFileContext(context).create();

    // Write the records
    sortedRecordsMap.forEach((recordKey, recordBytesList) -> {
      for (byte[] recordBytes : recordBytesList) {
        try {
          KeyValue kv = new KeyValue(getUTF8Bytes(recordKey), null, null, recordBytes);
          writer.append(kv);
        } catch (IOException e) {
          throw new HoodieIOException("IOException serializing records", e);
        }
      }
    });

    writer.appendFileInfo(
        getUTF8Bytes(HoodieAvroHFileReaderImplBase.SCHEMA_KEY), getUTF8Bytes(readerSchema.toString()));

    writer.close();
    ostream.flush();
    ostream.close();

    return baos.toByteArray();
  }

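  /**
   * Returns the record key of the given record, or an empty {@link Option} if no key is available.
   */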
  private static Option<String> getRecordKey(HoodieRecord record, Schema readerSchema, String keyFieldName) {
    return Option.ofNullable(record.getRecordKey(readerSchema, keyFieldName));
  }

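  /**
   * Serializes a single record to Avro bytes, first clearing the key field within the record
   * (when the key field exists in the schema) to avoid duplicating the key inside the payload.
   */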
  private static byte[] serializeRecord(HoodieRecord record, Schema schema, String keyFieldName) throws IOException {
    Option<Schema.Field> keyField = Option.ofNullable(schema.getField(keyFieldName));
    // Reset key value w/in the record to avoid duplicating the key w/in payload
    if (keyField.isPresent()) {
      record.truncateRecordKey(schema, new Properties(), keyField.get().name());
    }
    return HoodieAvroUtils.recordToBytes(record, schema).get();
  }
}