/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.orc;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.function.BiFunction;
import java.util.function.Function;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.Schema;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.hadoop.HadoopInputFile;
import org.apache.iceberg.hadoop.HadoopOutputFile;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcFile.ReaderOptions;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
@SuppressWarnings("checkstyle:AbbreviationAsWordInName")
public class ORC {
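// Config property read in WriteBuilder#build() to size the VectorizedRowBatch used by the appender.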
private static final String VECTOR_ROW_BATCH_SIZE = "iceberg.orc.vectorbatch.size";
private ORC() {
}
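/**
 * Returns a {@link WriteBuilder} for configuring an ORC {@link FileAppender} that writes to the
 * given {@link OutputFile}.
 *
 * @param file the destination file
 * @return a new write builder
 */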
public static WriteBuilder write(OutputFile file) {
return new WriteBuilder(file);
}
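/**
 * Builder for ORC {@link FileAppender}s.
 *
 * <p>A minimal usage sketch. The writer function is not defined in this class; a generic data
 * writer such as {@code GenericOrcWriter::buildWriter} (from iceberg-data) is assumed here:
 *
 * <pre>{@code
 * FileAppender<Record> appender = ORC.write(outputFile)     // outputFile: any OutputFile
 *     .schema(icebergSchema)                                // required
 *     .createWriterFunc(GenericOrcWriter::buildWriter)      // assumed writer function
 *     .overwrite()                                          // optional: replace an existing file
 *     .build();
 * }</pre>
 */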
public static class WriteBuilder {
private final OutputFile file;
private final Configuration conf;
private Schema schema = null;
private BiFunction<Schema, TypeDescription, OrcRowWriter<?>> createWriterFunc;
private Map<String, byte[]> metadata = new HashMap<>();
private MetricsConfig metricsConfig;
private WriteBuilder(OutputFile file) {
this.file = file;
if (file instanceof HadoopOutputFile) {
this.conf = new Configuration(((HadoopOutputFile) file).getConf());
} else {
this.conf = new Configuration();
}
}
public WriteBuilder metadata(String property, String value) {
metadata.put(property, value.getBytes(StandardCharsets.UTF_8));
return this;
}
public WriteBuilder config(String property, String value) {
conf.set(property, value);
return this;
}
public WriteBuilder createWriterFunc(BiFunction<Schema, TypeDescription, OrcRowWriter<?>> writerFunction) {
this.createWriterFunc = writerFunction;
return this;
}
public WriteBuilder setAll(Map<String, String> properties) {
properties.forEach(conf::set);
return this;
}
public WriteBuilder schema(Schema newSchema) {
this.schema = newSchema;
return this;
}
public WriteBuilder overwrite() {
return overwrite(true);
}
public WriteBuilder overwrite(boolean enabled) {
OrcConf.OVERWRITE_OUTPUT_FILE.setBoolean(conf, enabled);
return this;
}
public WriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
this.metricsConfig = newMetricsConfig;
return this;
}
public <D> FileAppender<D> build() {
Preconditions.checkNotNull(schema, "Schema is required");
return new OrcFileAppender<>(schema,
this.file, createWriterFunc, conf, metadata,
conf.getInt(VECTOR_ROW_BATCH_SIZE, VectorizedRowBatch.DEFAULT_SIZE), metricsConfig);
}
}
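/**
 * Returns a {@link ReadBuilder} for configuring an ORC read of the given {@link InputFile}.
 *
 * @param file the source file
 * @return a new read builder
 */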
public static ReadBuilder read(InputFile file) {
return new ReadBuilder(file);
}
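/**
 * Builder for reading an ORC file as a {@link CloseableIterable} of rows.
 *
 * <p>A minimal usage sketch. The reader function is not defined in this class; a generic data
 * reader such as {@code GenericOrcReader.buildReader} (from iceberg-data) is assumed here, and
 * {@code process(record)} is a placeholder for application code:
 *
 * <pre>{@code
 * try (CloseableIterable<Record> rows = ORC.read(inputFile)   // inputFile: any InputFile
 *     .project(projection)                                    // required projection schema
 *     .filter(Expressions.greaterThan("id", 5))               // optional filter
 *     .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(projection, fileSchema))
 *     .build()) {
 *   rows.forEach(record -> process(record));
 * }
 * }</pre>
 */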
public static class ReadBuilder {
private final InputFile file;
private final Configuration conf;
private Schema schema = null;
private Long start = null;
private Long length = null;
private Expression filter = null;
private boolean caseSensitive = true;
private NameMapping nameMapping = null;
private OrcRowFilter rowFilter = null;
private Function<TypeDescription, OrcRowReader<?>> readerFunc;
private Function<TypeDescription, OrcBatchReader<?>> batchedReaderFunc;
private int recordsPerBatch = VectorizedRowBatch.DEFAULT_SIZE;
private ReadBuilder(InputFile file) {
Preconditions.checkNotNull(file, "Input file cannot be null");
this.file = file;
if (file instanceof HadoopInputFile) {
this.conf = new Configuration(((HadoopInputFile) file).getConf());
} else {
this.conf = new Configuration();
}
// Turn positional schema evolution off, since projection uses column-name-based schema evolution
this.conf.setBoolean(OrcConf.FORCE_POSITIONAL_EVOLUTION.getHiveConfName(), false);
}
/**
* Restricts the read to the given range: [start, start + length).
*
* @param newStart the start position for this read
* @param newLength the length of the range this read should scan
* @return this builder for method chaining
*/
public ReadBuilder split(long newStart, long newLength) {
this.start = newStart;
this.length = newLength;
return this;
}
public ReadBuilder project(Schema newSchema) {
this.schema = newSchema;
return this;
}
public ReadBuilder caseSensitive(boolean newCaseSensitive) {
OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(this.conf, newCaseSensitive);
this.caseSensitive = newCaseSensitive;
return this;
}
public ReadBuilder config(String property, String value) {
conf.set(property, value);
return this;
}
public ReadBuilder createReaderFunc(Function<TypeDescription, OrcRowReader<?>> readerFunction) {
Preconditions.checkArgument(this.batchedReaderFunc == null,
"Reader function cannot be set since the batched version is already set");
this.readerFunc = readerFunction;
return this;
}
public ReadBuilder filter(Expression newFilter) {
this.filter = newFilter;
return this;
}
public ReadBuilder createBatchedReaderFunc(Function<TypeDescription, OrcBatchReader<?>> batchReaderFunction) {
Preconditions.checkArgument(this.readerFunc == null,
"Batched reader function cannot be set since the non-batched version is already set");
this.batchedReaderFunc = batchReaderFunction;
return this;
}
public ReadBuilder recordsPerBatch(int numRecordsPerBatch) {
this.recordsPerBatch = numRecordsPerBatch;
return this;
}
public ReadBuilder withNameMapping(NameMapping newNameMapping) {
this.nameMapping = newNameMapping;
return this;
}
public ReadBuilder rowFilter(OrcRowFilter newRowFilter) {
this.rowFilter = newRowFilter;
return this;
}
public <D> CloseableIterable<D> build() {
Preconditions.checkNotNull(schema, "Schema is required");
return new OrcIterable<>(file, conf, schema, nameMapping, start, length, readerFunc, caseSensitive, filter,
batchedReaderFunc, recordsPerBatch, rowFilter);
}
}
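// Package-private helpers for opening an ORC Reader; any IOException is rethrown as RuntimeIOException.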
static Reader newFileReader(String location, ReaderOptions readerOptions) {
try {
return OrcFile.createReader(new Path(location), readerOptions);
} catch (IOException ioe) {
throw new RuntimeIOException(ioe, "Failed to open file: %s", location);
}
}
static Reader newFileReader(InputFile file, Configuration config) {
ReaderOptions readerOptions = OrcFile.readerOptions(config).useUTCTimestamp(true);
if (file instanceof HadoopInputFile) {
readerOptions.filesystem(((HadoopInputFile) file).getFileSystem());
}
return newFileReader(file.location(), readerOptions);
}
}