All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.io.hadoop.HoodieAvroHFileWriter Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.io.hadoop;

import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieDuplicateKeyException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.hadoop.fs.HoodieWrapperFileSystem;
import org.apache.hudi.io.storage.HoodieAvroFileWriter;
import org.apache.hudi.io.storage.HoodieAvroHFileReaderImplBase;
import org.apache.hudi.metadata.MetadataPartitionType;
import org.apache.hudi.storage.StoragePath;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;

import static org.apache.hudi.common.util.StringUtils.EMPTY_STRING;
import static org.apache.hudi.common.util.StringUtils.getUTF8Bytes;

/**
 * HoodieHFileWriter writes IndexedRecords into an HFile. The record's key is used as the key and the
 * AVRO encoded record bytes are saved as the value.
 * 

* Limitations (compared to columnar formats like Parquet or ORC): * 1. Records should be added in order of keys * 2. There are no column stats */ public class HoodieAvroHFileWriter implements HoodieAvroFileWriter { private static AtomicLong recordIndex = new AtomicLong(1); private final Path file; private final HoodieHFileConfig hfileConfig; private final boolean isWrapperFileSystem; private final Option wrapperFs; private final long maxFileSize; private final String instantTime; private final TaskContextSupplier taskContextSupplier; private final boolean populateMetaFields; private final Option keyFieldSchema; private HFile.Writer writer; private String minRecordKey; private String maxRecordKey; private String prevRecordKey; // This is private in CacheConfig so have been copied here. private static final String DROP_BEHIND_CACHE_COMPACTION_KEY = "hbase.hfile.drop.behind.compaction"; public HoodieAvroHFileWriter(String instantTime, StoragePath file, HoodieHFileConfig hfileConfig, Schema schema, TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException { Configuration conf = HadoopFSUtils.registerFileSystem(file, hfileConfig.getHadoopConf()); this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, conf); FileSystem fs = this.file.getFileSystem(conf); this.isWrapperFileSystem = fs instanceof HoodieWrapperFileSystem; this.wrapperFs = this.isWrapperFileSystem ? Option.of((HoodieWrapperFileSystem) fs) : Option.empty(); this.hfileConfig = hfileConfig; this.keyFieldSchema = Option.ofNullable(schema.getField(hfileConfig.getKeyFieldName())); // TODO - compute this compression ratio dynamically by looking at the bytes written to the // stream and the actual file size reported by HDFS // this.maxFileSize = hfileConfig.getMaxFileSize() // + Math.round(hfileConfig.getMaxFileSize() * hfileConfig.getCompressionRatio()); this.maxFileSize = hfileConfig.getMaxFileSize(); this.instantTime = instantTime; this.taskContextSupplier = taskContextSupplier; this.populateMetaFields = populateMetaFields; HFileContext context = new HFileContextBuilder().withBlockSize(hfileConfig.getBlockSize()) .withCompression(hfileConfig.getCompressionAlgorithm()) .withCellComparator(hfileConfig.getHFileComparator()) .build(); conf.set(CacheConfig.PREFETCH_BLOCKS_ON_OPEN_KEY, String.valueOf(hfileConfig.shouldPrefetchBlocksOnOpen())); conf.set(HColumnDescriptor.CACHE_DATA_IN_L1, String.valueOf(hfileConfig.shouldCacheDataInL1())); conf.set(DROP_BEHIND_CACHE_COMPACTION_KEY, String.valueOf(hfileConfig.shouldDropBehindCacheCompaction())); CacheConfig cacheConfig = new CacheConfig(conf); this.writer = HFile.getWriterFactory(conf, cacheConfig) .withPath(fs, this.file) .withFileContext(context) .create(); writer.appendFileInfo(getUTF8Bytes(HoodieAvroHFileReaderImplBase.SCHEMA_KEY), getUTF8Bytes(schema.toString())); this.prevRecordKey = ""; } @Override public void writeAvroWithMetadata(HoodieKey key, IndexedRecord avroRecord) throws IOException { if (populateMetaFields) { prepRecordWithMetadata(key, avroRecord, instantTime, taskContextSupplier.getPartitionIdSupplier().get(), recordIndex.getAndIncrement(), file.getName()); writeAvro(key.getRecordKey(), avroRecord); } else { writeAvro(key.getRecordKey(), avroRecord); } } @Override public boolean canWrite() { return !isWrapperFileSystem || wrapperFs.get().getBytesWritten(file) < maxFileSize; } @Override public void writeAvro(String recordKey, IndexedRecord record) throws IOException { // do not allow duplicates for record index (primary index) // secondary index can have duplicates if (prevRecordKey.equals(recordKey) && file.getName().startsWith(MetadataPartitionType.RECORD_INDEX.getFileIdPrefix())) { throw new HoodieDuplicateKeyException("Duplicate recordKey " + recordKey + " found while writing to HFile." + "Record payload: " + record); } byte[] value = null; boolean isRecordSerialized = false; if (keyFieldSchema.isPresent()) { GenericRecord keyExcludedRecord = (GenericRecord) record; int keyFieldPos = this.keyFieldSchema.get().pos(); boolean isKeyAvailable = (record.get(keyFieldPos) != null && !(record.get(keyFieldPos).toString().isEmpty())); if (isKeyAvailable) { Object originalKey = keyExcludedRecord.get(keyFieldPos); keyExcludedRecord.put(keyFieldPos, EMPTY_STRING); value = HoodieAvroUtils.avroToBytes(keyExcludedRecord); keyExcludedRecord.put(keyFieldPos, originalKey); isRecordSerialized = true; } } if (!isRecordSerialized) { value = HoodieAvroUtils.avroToBytes((GenericRecord) record); } KeyValue kv = new KeyValue(getUTF8Bytes(recordKey), null, null, value); writer.append(kv); if (hfileConfig.useBloomFilter()) { hfileConfig.getBloomFilter().add(recordKey); if (minRecordKey == null) { minRecordKey = recordKey; } maxRecordKey = recordKey; } prevRecordKey = recordKey; } @Override public void close() throws IOException { if (hfileConfig.useBloomFilter()) { final BloomFilter bloomFilter = hfileConfig.getBloomFilter(); if (minRecordKey == null) { minRecordKey = ""; } if (maxRecordKey == null) { maxRecordKey = ""; } writer.appendFileInfo(getUTF8Bytes(HoodieAvroHFileReaderImplBase.KEY_MIN_RECORD), getUTF8Bytes(minRecordKey)); writer.appendFileInfo(getUTF8Bytes(HoodieAvroHFileReaderImplBase.KEY_MAX_RECORD), getUTF8Bytes(maxRecordKey)); writer.appendFileInfo(getUTF8Bytes(HoodieAvroHFileReaderImplBase.KEY_BLOOM_FILTER_TYPE_CODE), getUTF8Bytes(bloomFilter.getBloomFilterTypeCode().toString())); writer.appendMetaBlock(HoodieAvroHFileReaderImplBase.KEY_BLOOM_FILTER_META_BLOCK, new Writable() { @Override public void write(DataOutput out) throws IOException { out.write(getUTF8Bytes(bloomFilter.serializeToString())); } @Override public void readFields(DataInput in) throws IOException { } }); } writer.close(); writer = null; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy