/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.orc;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.BiFunction;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.Schema;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.hadoop.HadoopOutputFile;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.orc.OrcFile;
import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** Create a file appender for ORC. */
class OrcFileAppender<D> implements FileAppender<D> {
private static final Logger LOG = LoggerFactory.getLogger(OrcFileAppender.class);
private final int batchSize;
private final OutputFile file;
private final Writer writer;
private final VectorizedRowBatch batch;
private final int avgRowByteSize;
  private final OrcRowWriter<D> valueWriter;
private boolean isClosed = false;
private final Configuration conf;
private final MetricsConfig metricsConfig;
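
  /**
   * Builds the underlying ORC writer: converts the Iceberg schema to an ORC
   * {@link TypeDescription}, estimates the average row width in bytes (used by
   * {@link #length()}), and allocates the reusable row batch that
   * {@link #add(Object)} fills.
   */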
OrcFileAppender(
Schema schema,
OutputFile file,
      BiFunction<Schema, TypeDescription, OrcRowWriter<?>> createWriterFunc,
Configuration conf,
      Map<String, byte[]> metadata,
int batchSize,
MetricsConfig metricsConfig) {
this.conf = conf;
this.file = file;
this.batchSize = batchSize;
this.metricsConfig = metricsConfig;
TypeDescription orcSchema = ORCSchemaUtil.convert(schema);
this.avgRowByteSize =
OrcSchemaVisitor.visitSchema(orcSchema, new EstimateOrcAvgWidthVisitor()).stream()
.reduce(Integer::sum)
.orElse(0);
if (avgRowByteSize == 0) {
LOG.warn("The average length of the rows appears to be zero.");
}
this.batch = orcSchema.createRowBatch(this.batchSize);
OrcFile.WriterOptions options = OrcFile.writerOptions(conf).useUTCTimestamp(true);
if (file instanceof HadoopOutputFile) {
options.fileSystem(((HadoopOutputFile) file).getFileSystem());
}
options.setSchema(orcSchema);
this.writer = ORC.newFileWriter(file, options, metadata);
this.valueWriter = newOrcRowWriter(schema, orcSchema, createWriterFunc);
}
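
  /**
   * Buffers one row into the current {@link VectorizedRowBatch} and flushes the
   * batch to the ORC writer once it reaches {@code batchSize} rows.
   */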
@Override
public void add(D datum) {
try {
valueWriter.write(datum, batch);
if (batch.size == this.batchSize) {
writer.addRowBatch(batch);
batch.reset();
}
} catch (IOException ioe) {
throw new UncheckedIOException(
String.format("Problem writing to ORC file %s", file.location()), ioe);
}
}
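
  /** Returns column-level metrics for the written data; only valid after {@link #close()}. */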
@Override
public Metrics metrics() {
Preconditions.checkState(isClosed, "Cannot return metrics while appending to an open file.");
return OrcMetrics.fromWriter(writer, valueWriter.metrics(), metricsConfig);
}
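
  /**
   * Returns the file length: the actual length once the file is closed, otherwise
   * an estimate of the bytes already flushed to stripes plus a fraction of the
   * writer's in-memory data, including rows still buffered in the current batch.
   */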
@Override
public long length() {
if (isClosed) {
return file.toInputFile().getLength();
}
long estimateMemory = writer.estimateMemory();
long dataLength = 0;
try {
      List<StripeInformation> stripes = writer.getStripes();
if (!stripes.isEmpty()) {
StripeInformation stripeInformation = stripes.get(stripes.size() - 1);
dataLength =
stripeInformation != null
? stripeInformation.getOffset() + stripeInformation.getLength()
: 0;
}
} catch (IOException e) {
throw new UncheckedIOException(
String.format(
"Can't get Stripe's length from the file writer with path: %s.", file.location()),
e);
}
// This value is estimated, not actual.
return (long)
Math.ceil(dataLength + (estimateMemory + (long) batch.size * avgRowByteSize) * 0.2);
}
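
  /** Returns the starting offset of each stripe, usable as split points by readers. */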
@Override
  public List<Long> splitOffsets() {
Preconditions.checkState(isClosed, "File is not yet closed");
try {
      List<StripeInformation> stripes = writer.getStripes();
return Collections.unmodifiableList(Lists.transform(stripes, StripeInformation::getOffset));
} catch (IOException e) {
throw new RuntimeIOException(
e, "Failed to get stripe information from writer for: %s", file.location());
}
}
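
  /** Flushes any rows still buffered in the batch, then closes the underlying ORC writer. */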
@Override
public void close() throws IOException {
if (!isClosed) {
try {
if (batch.size > 0) {
writer.addRowBatch(batch);
batch.reset();
}
} finally {
writer.close();
this.isClosed = true;
}
}
}
@SuppressWarnings("unchecked")
  private static <D> OrcRowWriter<D> newOrcRowWriter(
      Schema schema,
      TypeDescription orcSchema,
      BiFunction<Schema, TypeDescription, OrcRowWriter<?>> createWriterFunc) {
    return (OrcRowWriter<D>) createWriterFunc.apply(schema, orcSchema);
}
}
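
Callers do not normally construct OrcFileAppender directly; it is created through the
ORC.write(...) builder, which supplies the createWriterFunc seen above. A minimal usage
sketch, assuming the iceberg-data module's GenericOrcWriter is on the classpath; the
output path "/tmp/data.orc", the schema variable, and the record variable are
illustrative placeholders, not part of this file:

import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.orc.GenericOrcWriter;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.orc.ORC;

// Write generic Records to a local ORC file through the builder API.
// Rows are buffered into a VectorizedRowBatch and flushed once per batchSize.
try (FileAppender<Record> appender =
    ORC.write(Files.localOutput("/tmp/data.orc"))
        .schema(schema) // an org.apache.iceberg.Schema for the rows being written
        .createWriterFunc(GenericOrcWriter::buildWriter)
        .build()) {
  appender.add(record); // repeat for each row
}

Once the appender is closed, metrics() and splitOffsets() become available, e.g. for
populating a DataFile entry during a commit.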