org.apache.hudi.io.hadoop.HoodieAvroOrcWriter Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.io.hadoop;
import org.apache.hudi.avro.HoodieBloomFilterWriteSupport;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.AvroOrcUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.hadoop.fs.HoodieWrapperFileSystem;
import org.apache.hudi.io.storage.HoodieAvroFileWriter;
import org.apache.hudi.io.storage.HoodieOrcConfig;
import org.apache.hudi.storage.StoragePath;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import java.io.Closeable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import static org.apache.hudi.common.util.StringUtils.getUTF8Bytes;
/**
 * {@link HoodieAvroFileWriter} implementation that writes Avro records into a single ORC file.
 *
 * <p>Records are buffered column by column in a {@link VectorizedRowBatch}; whenever the batch
 * reaches its maximum size it is handed to the underlying ORC {@link Writer}. On {@link #close()}
 * any remaining rows are flushed and the Avro schema (plus, when enabled in the
 * {@link HoodieOrcConfig}, the bloom filter and the min/max record keys) is stored as ORC user
 * metadata.
 */
public class HoodieAvroOrcWriter implements HoodieAvroFileWriter, Closeable {
  // Counter shared by every writer instance in this JVM; passed to prepRecordWithMetadata.
  private static final AtomicLong RECORD_INDEX = new AtomicLong(1);

  private final long maxFileSize;
  private final Schema avroSchema;
  // Children / field names of the ORC struct schema, indexed by column position.
  private final List<TypeDescription> fieldTypes;
  private final List<String> fieldNames;
  private final VectorizedRowBatch batch;
  private final Writer writer;
  private final Path file;
  private final boolean isWrapperFileSystem;
  // Present only when the resolved file system is a HoodieWrapperFileSystem.
  private final Option<HoodieWrapperFileSystem> wrapperFs;
  private final String instantTime;
  private final TaskContextSupplier taskContextSupplier;
  private final HoodieOrcConfig orcConfig;
  // Smallest / largest record key seen so far; only tracked when the bloom filter is enabled.
  private String minRecordKey;
  private String maxRecordKey;

  /**
   * Creates the ORC writer for the given file.
   *
   * @param instantTime         instant time passed to {@code prepRecordWithMetadata} for every record
   * @param file                path of the ORC file to create
   * @param config              ORC settings (block size, stripe size, compression, max file size, bloom filter)
   * @param schema              Avro schema of the records; converted to the ORC schema of the file
   * @param taskContextSupplier supplier used to obtain the partition id when writing with metadata
   * @throws IOException if the file system or the ORC writer cannot be created
   */
  public HoodieAvroOrcWriter(String instantTime, StoragePath file, HoodieOrcConfig config, Schema schema,
                             TaskContextSupplier taskContextSupplier) throws IOException {
    Configuration conf = HadoopFSUtils.registerFileSystem(file, config.getStorageConf().unwrapAs(Configuration.class));
    this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, conf);
    FileSystem fs = this.file.getFileSystem(conf);
    this.isWrapperFileSystem = fs instanceof HoodieWrapperFileSystem;
    this.wrapperFs = this.isWrapperFileSystem ? Option.of((HoodieWrapperFileSystem) fs) : Option.empty();
    this.instantTime = instantTime;
    this.taskContextSupplier = taskContextSupplier;

    this.avroSchema = schema;
    final TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(avroSchema);
    this.fieldTypes = orcSchema.getChildren();
    this.fieldNames = orcSchema.getFieldNames();
    this.maxFileSize = config.getMaxFileSize();
    this.batch = orcSchema.createRowBatch();
    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf)
        .blockSize(config.getBlockSize())
        .stripeSize(config.getStripeSize())
        .compress(config.getCompressionKind())
        .bufferSize(config.getBlockSize())
        .fileSystem(fs)
        .setSchema(orcSchema);
    this.writer = OrcFile.createWriter(this.file, writerOptions);
    this.orcConfig = config;
  }

  /**
   * Passes the record through {@code prepRecordWithMetadata} (with the instant time, the current
   * partition id, the next value of the shared record counter and the file name) and then writes
   * it via {@link #writeAvro(String, IndexedRecord)}.
   *
   * @param key        key of the record; its record key is forwarded to {@code writeAvro}
   * @param avroRecord record to write
   * @throws IOException if the underlying ORC writer fails
   */
  @Override
  public void writeAvroWithMetadata(HoodieKey key, IndexedRecord avroRecord) throws IOException {
    prepRecordWithMetadata(key, avroRecord, instantTime,
        taskContextSupplier.getPartitionIdSupplier().get(), RECORD_INDEX.getAndIncrement(), file.getName());
    writeAvro(key.getRecordKey(), avroRecord);
  }

  /**
   * @return {@code true} when the file system is not a {@link HoodieWrapperFileSystem} (no size
   *         tracking available), or when the bytes written to this file are still below the
   *         configured maximum file size
   */
  @Override
  public boolean canWrite() {
    return !isWrapperFileSystem || wrapperFs.get().getBytesWritten(file) < maxFileSize;
  }

  /**
   * Appends one record to the current row batch, flushing the batch to the ORC writer once it is
   * full, and updates the bloom filter and min/max record keys when the bloom filter is enabled.
   *
   * @param recordKey record key added to the bloom filter and used for min/max key tracking
   * @param object    record to write; must be a {@link GenericRecord}
   * @throws IOException if the underlying ORC writer fails
   */
  @Override
  public void writeAvro(String recordKey, IndexedRecord object) throws IOException {
    final GenericRecord record = (GenericRecord) object;
    for (int col = 0; col < batch.numCols; col++) {
      ColumnVector colVector = batch.cols[col];
      final String thisField = fieldNames.get(col);
      final TypeDescription type = fieldTypes.get(col);

      Object fieldValue = record.get(thisField);
      Schema.Field avroField = avroSchema.getField(thisField);
      // batch.size is the index of the row currently being filled.
      AvroOrcUtils.addToVector(type, colVector, avroField.schema(), fieldValue, batch.size);
    }

    batch.size++;

    // Batch size corresponds to the number of written rows out of 1024 total rows (by default)
    // in the row batch, add the batch to file once all rows are filled and reset.
    if (batch.size == batch.getMaxSize()) {
      flushBatch();
    }

    if (orcConfig.useBloomFilter()) {
      orcConfig.getBloomFilter().add(recordKey);
      if (minRecordKey != null) {
        minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey;
      } else {
        minRecordKey = recordKey;
      }

      if (maxRecordKey != null) {
        maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? maxRecordKey : recordKey;
      } else {
        maxRecordKey = recordKey;
      }
    }
  }

  /**
   * Flushes any buffered rows, stores the bloom filter related entries (when enabled) and the
   * Avro schema as ORC user metadata, and closes the underlying ORC writer.
   *
   * @throws IOException if flushing, adding metadata or closing the ORC writer fails
   */
  @Override
  public void close() throws IOException {
    if (batch.size != 0) {
      flushBatch();
    }

    if (orcConfig.useBloomFilter()) {
      final BloomFilter bloomFilter = orcConfig.getBloomFilter();
      writer.addUserMetadata(HoodieBloomFilterWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, ByteBuffer.wrap(getUTF8Bytes(bloomFilter.serializeToString())));
      // Min/max keys are only set once at least one record has been written.
      if (minRecordKey != null && maxRecordKey != null) {
        writer.addUserMetadata(HoodieBloomFilterWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, ByteBuffer.wrap(getUTF8Bytes(minRecordKey)));
        writer.addUserMetadata(HoodieBloomFilterWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER, ByteBuffer.wrap(getUTF8Bytes(maxRecordKey)));
      }
      if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) {
        writer.addUserMetadata(HoodieBloomFilterWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE, ByteBuffer.wrap(getUTF8Bytes(bloomFilter.getBloomFilterTypeCode().name())));
      }
    }
    writer.addUserMetadata(HoodieOrcConfig.AVRO_SCHEMA_METADATA_KEY, ByteBuffer.wrap(getUTF8Bytes(avroSchema.toString())));

    writer.close();
  }

  /**
   * Hands the buffered rows to the ORC writer and resets the batch for reuse.
   */
  private void flushBatch() throws IOException {
    writer.addRowBatch(batch);
    batch.reset();
    // Explicitly zero the row count so the next record is written at row index 0.
    batch.size = 0;
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy