/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.orc;
import static org.apache.iceberg.TableProperties.DELETE_ORC_BLOCK_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.DELETE_ORC_COMPRESSION;
import static org.apache.iceberg.TableProperties.DELETE_ORC_COMPRESSION_STRATEGY;
import static org.apache.iceberg.TableProperties.DELETE_ORC_STRIPE_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.DELETE_ORC_WRITE_BATCH_SIZE;
import static org.apache.iceberg.TableProperties.ORC_BLOCK_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.ORC_BLOCK_SIZE_BYTES_DEFAULT;
import static org.apache.iceberg.TableProperties.ORC_BLOOM_FILTER_COLUMNS;
import static org.apache.iceberg.TableProperties.ORC_BLOOM_FILTER_COLUMNS_DEFAULT;
import static org.apache.iceberg.TableProperties.ORC_BLOOM_FILTER_FPP;
import static org.apache.iceberg.TableProperties.ORC_BLOOM_FILTER_FPP_DEFAULT;
import static org.apache.iceberg.TableProperties.ORC_COMPRESSION;
import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_DEFAULT;
import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY;
import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT;
import static org.apache.iceberg.TableProperties.ORC_STRIPE_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.ORC_STRIPE_SIZE_BYTES_DEFAULT;
import static org.apache.iceberg.TableProperties.ORC_WRITE_BATCH_SIZE;
import static org.apache.iceberg.TableProperties.ORC_WRITE_BATCH_SIZE_DEFAULT;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.data.orc.GenericOrcWriter;
import org.apache.iceberg.data.orc.GenericOrcWriters;
import org.apache.iceberg.deletes.EqualityDeleteWriter;
import org.apache.iceberg.deletes.PositionDeleteWriter;
import org.apache.iceberg.encryption.EncryptionKeyMetadata;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.hadoop.HadoopInputFile;
import org.apache.iceberg.hadoop.HadoopOutputFile;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.DeleteSchemaUtil;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.util.ArrayUtil;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcFile.CompressionStrategy;
import org.apache.orc.OrcFile.ReaderOptions;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
@SuppressWarnings("checkstyle:AbbreviationAsWordInName")
public class ORC {
/** @deprecated use {@link TableProperties#ORC_WRITE_BATCH_SIZE} instead */
@Deprecated private static final String VECTOR_ROW_BATCH_SIZE = "iceberg.orc.vectorbatch.size";
private ORC() {}
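/**
 * Returns a builder for an ORC {@link FileAppender}.
 *
 * <p>A minimal usage sketch; {@code outputFile} and {@code table} are assumed to exist, and
 * {@code GenericOrcWriter} is just one possible row writer implementation:
 *
 * <pre>{@code
 * FileAppender<Record> appender =
 *     ORC.write(outputFile)
 *         .forTable(table)
 *         .createWriterFunc(GenericOrcWriter::buildWriter)
 *         .overwrite()
 *         .build();
 * }</pre>
 */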
public static WriteBuilder write(OutputFile file) {
return new WriteBuilder(file);
}
public static class WriteBuilder {
private final OutputFile file;
private final Configuration conf;
private Schema schema = null;
private BiFunction<Schema, TypeDescription, OrcRowWriter<?>> createWriterFunc;
private Map<String, byte[]> metadata = Maps.newHashMap();
private MetricsConfig metricsConfig;
private Function<Map<String, String>, Context> createContextFunc = Context::dataContext;
private final Map<String, String> config = Maps.newLinkedHashMap();
private boolean overwrite = false;
private WriteBuilder(OutputFile file) {
this.file = file;
if (file instanceof HadoopOutputFile) {
this.conf = new Configuration(((HadoopOutputFile) file).getConf());
} else {
this.conf = new Configuration();
}
}
public WriteBuilder forTable(Table table) {
schema(table.schema());
setAll(table.properties());
metricsConfig(MetricsConfig.forTable(table));
return this;
}
public WriteBuilder metadata(String property, String value) {
metadata.put(property, value.getBytes(StandardCharsets.UTF_8));
return this;
}
public WriteBuilder set(String property, String value) {
config.put(property, value);
return this;
}
public WriteBuilder createWriterFunc(
BiFunction<Schema, TypeDescription, OrcRowWriter<?>> writerFunction) {
this.createWriterFunc = writerFunction;
return this;
}
public WriteBuilder setAll(Map<String, String> properties) {
config.putAll(properties);
return this;
}
public WriteBuilder schema(Schema newSchema) {
this.schema = newSchema;
return this;
}
public WriteBuilder overwrite() {
return overwrite(true);
}
public WriteBuilder overwrite(boolean enabled) {
this.overwrite = enabled;
return this;
}
public WriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
this.metricsConfig = newMetricsConfig;
return this;
}
// intentionally private: used only by the data and delete write builders
private WriteBuilder createContextFunc(
Function<Map<String, String>, Context> newCreateContextFunc) {
this.createContextFunc = newCreateContextFunc;
return this;
}
public <D> FileAppender<D> build() {
Preconditions.checkNotNull(schema, "Schema is required");
for (Map.Entry<String, String> entry : config.entrySet()) {
this.conf.set(entry.getKey(), entry.getValue());
}
// for compatibility
if (conf.get(VECTOR_ROW_BATCH_SIZE) != null && config.get(ORC_WRITE_BATCH_SIZE) == null) {
config.put(ORC_WRITE_BATCH_SIZE, conf.get(VECTOR_ROW_BATCH_SIZE));
}
// Map Iceberg properties to pass down to the ORC writer
Context context = createContextFunc.apply(config);
OrcConf.STRIPE_SIZE.setLong(conf, context.stripeSize());
OrcConf.BLOCK_SIZE.setLong(conf, context.blockSize());
OrcConf.COMPRESS.setString(conf, context.compressionKind().name());
OrcConf.COMPRESSION_STRATEGY.setString(conf, context.compressionStrategy().name());
OrcConf.OVERWRITE_OUTPUT_FILE.setBoolean(conf, overwrite);
OrcConf.BLOOM_FILTER_COLUMNS.setString(conf, context.bloomFilterColumns());
OrcConf.BLOOM_FILTER_FPP.setDouble(conf, context.bloomFilterFpp());
return new OrcFileAppender<>(
schema,
this.file,
createWriterFunc,
conf,
metadata,
context.vectorizedRowBatchSize(),
metricsConfig);
}
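/**
 * Resolved ORC write settings (stripe and block size, vectorized batch size, compression kind
 * and strategy, bloom filter columns and FPP) derived from Iceberg table properties and ORC
 * configuration defaults.
 */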
private static class Context {
private final long stripeSize;
private final long blockSize;
private final int vectorizedRowBatchSize;
private final CompressionKind compressionKind;
private final CompressionStrategy compressionStrategy;
private final String bloomFilterColumns;
private final double bloomFilterFpp;
public long stripeSize() {
return stripeSize;
}
public long blockSize() {
return blockSize;
}
public int vectorizedRowBatchSize() {
return vectorizedRowBatchSize;
}
public CompressionKind compressionKind() {
return compressionKind;
}
public CompressionStrategy compressionStrategy() {
return compressionStrategy;
}
public String bloomFilterColumns() {
return bloomFilterColumns;
}
public double bloomFilterFpp() {
return bloomFilterFpp;
}
private Context(
long stripeSize,
long blockSize,
int vectorizedRowBatchSize,
CompressionKind compressionKind,
CompressionStrategy compressionStrategy,
String bloomFilterColumns,
double bloomFilterFpp) {
this.stripeSize = stripeSize;
this.blockSize = blockSize;
this.vectorizedRowBatchSize = vectorizedRowBatchSize;
this.compressionKind = compressionKind;
this.compressionStrategy = compressionStrategy;
this.bloomFilterColumns = bloomFilterColumns;
this.bloomFilterFpp = bloomFilterFpp;
}
static Context dataContext(Map<String, String> config) {
long stripeSize =
PropertyUtil.propertyAsLong(
config, OrcConf.STRIPE_SIZE.getAttribute(), ORC_STRIPE_SIZE_BYTES_DEFAULT);
stripeSize = PropertyUtil.propertyAsLong(config, ORC_STRIPE_SIZE_BYTES, stripeSize);
Preconditions.checkArgument(stripeSize > 0, "Stripe size must be > 0");
long blockSize =
PropertyUtil.propertyAsLong(
config, OrcConf.BLOCK_SIZE.getAttribute(), ORC_BLOCK_SIZE_BYTES_DEFAULT);
blockSize = PropertyUtil.propertyAsLong(config, ORC_BLOCK_SIZE_BYTES, blockSize);
Preconditions.checkArgument(blockSize > 0, "Block size must be > 0");
int vectorizedRowBatchSize =
PropertyUtil.propertyAsInt(config, ORC_WRITE_BATCH_SIZE, ORC_WRITE_BATCH_SIZE_DEFAULT);
Preconditions.checkArgument(
vectorizedRowBatchSize > 0, "VectorizedRow batch size must be > 0");
String codecAsString =
PropertyUtil.propertyAsString(
config, OrcConf.COMPRESS.getAttribute(), ORC_COMPRESSION_DEFAULT);
codecAsString = PropertyUtil.propertyAsString(config, ORC_COMPRESSION, codecAsString);
CompressionKind compressionKind = toCompressionKind(codecAsString);
String strategyAsString =
PropertyUtil.propertyAsString(
config,
OrcConf.COMPRESSION_STRATEGY.getAttribute(),
ORC_COMPRESSION_STRATEGY_DEFAULT);
strategyAsString =
PropertyUtil.propertyAsString(config, ORC_COMPRESSION_STRATEGY, strategyAsString);
CompressionStrategy compressionStrategy = toCompressionStrategy(strategyAsString);
String bloomFilterColumns =
PropertyUtil.propertyAsString(
config,
OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(),
ORC_BLOOM_FILTER_COLUMNS_DEFAULT);
bloomFilterColumns =
PropertyUtil.propertyAsString(config, ORC_BLOOM_FILTER_COLUMNS, bloomFilterColumns);
double bloomFilterFpp =
PropertyUtil.propertyAsDouble(
config, OrcConf.BLOOM_FILTER_FPP.getAttribute(), ORC_BLOOM_FILTER_FPP_DEFAULT);
bloomFilterFpp =
PropertyUtil.propertyAsDouble(config, ORC_BLOOM_FILTER_FPP, bloomFilterFpp);
Preconditions.checkArgument(
bloomFilterFpp > 0.0 && bloomFilterFpp < 1.0,
"Bloom filter fpp must be > 0.0 and < 1.0");
return new Context(
stripeSize,
blockSize,
vectorizedRowBatchSize,
compressionKind,
compressionStrategy,
bloomFilterColumns,
bloomFilterFpp);
}
static Context deleteContext(Map<String, String> config) {
Context dataContext = dataContext(config);
long stripeSize =
PropertyUtil.propertyAsLong(
config, DELETE_ORC_STRIPE_SIZE_BYTES, dataContext.stripeSize());
long blockSize =
PropertyUtil.propertyAsLong(
config, DELETE_ORC_BLOCK_SIZE_BYTES, dataContext.blockSize());
int vectorizedRowBatchSize =
PropertyUtil.propertyAsInt(
config, DELETE_ORC_WRITE_BATCH_SIZE, dataContext.vectorizedRowBatchSize());
String codecAsString = config.get(DELETE_ORC_COMPRESSION);
CompressionKind compressionKind =
codecAsString != null
? toCompressionKind(codecAsString)
: dataContext.compressionKind();
String strategyAsString = config.get(DELETE_ORC_COMPRESSION_STRATEGY);
CompressionStrategy compressionStrategy =
strategyAsString != null
? toCompressionStrategy(strategyAsString)
: dataContext.compressionStrategy();
return new Context(
stripeSize,
blockSize,
vectorizedRowBatchSize,
compressionKind,
compressionStrategy,
dataContext.bloomFilterColumns(),
dataContext.bloomFilterFpp());
}
private static CompressionKind toCompressionKind(String codecAsString) {
try {
return CompressionKind.valueOf(codecAsString.toUpperCase(Locale.ENGLISH));
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException("Unsupported compression codec: " + codecAsString);
}
}
private static CompressionStrategy toCompressionStrategy(String strategyAsString) {
try {
return CompressionStrategy.valueOf(strategyAsString.toUpperCase(Locale.ENGLISH));
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(
"Unsupported compression strategy: " + strategyAsString);
}
}
}
}
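/**
 * Returns a builder for an ORC {@link DataWriter}.
 *
 * <p>A sketch of typical use; {@code outputFile}, {@code table}, and {@code partitionKey} are
 * assumed to exist, and the generic writer shown is one possible row writer:
 *
 * <pre>{@code
 * DataWriter<Record> writer =
 *     ORC.writeData(outputFile)
 *         .forTable(table)
 *         .createWriterFunc(GenericOrcWriter::buildWriter)
 *         .withPartition(partitionKey)
 *         .overwrite()
 *         .build();
 * }</pre>
 */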
public static DataWriteBuilder writeData(OutputFile file) {
return new DataWriteBuilder(file);
}
public static class DataWriteBuilder {
private final WriteBuilder appenderBuilder;
private final String location;
private PartitionSpec spec = null;
private StructLike partition = null;
private EncryptionKeyMetadata keyMetadata = null;
private SortOrder sortOrder = null;
private DataWriteBuilder(OutputFile file) {
this.appenderBuilder = write(file);
this.location = file.location();
}
public DataWriteBuilder forTable(Table table) {
schema(table.schema());
withSpec(table.spec());
setAll(table.properties());
metricsConfig(MetricsConfig.forTable(table));
return this;
}
public DataWriteBuilder schema(Schema newSchema) {
appenderBuilder.schema(newSchema);
return this;
}
public DataWriteBuilder set(String property, String value) {
appenderBuilder.set(property, value);
return this;
}
public DataWriteBuilder setAll(Map<String, String> properties) {
appenderBuilder.setAll(properties);
return this;
}
public DataWriteBuilder meta(String property, String value) {
appenderBuilder.metadata(property, value);
return this;
}
public DataWriteBuilder overwrite() {
return overwrite(true);
}
public DataWriteBuilder overwrite(boolean enabled) {
appenderBuilder.overwrite(enabled);
return this;
}
public DataWriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
appenderBuilder.metricsConfig(newMetricsConfig);
return this;
}
public DataWriteBuilder createWriterFunc(
BiFunction<Schema, TypeDescription, OrcRowWriter<?>> writerFunction) {
appenderBuilder.createWriterFunc(writerFunction);
return this;
}
public DataWriteBuilder withSpec(PartitionSpec newSpec) {
this.spec = newSpec;
return this;
}
public DataWriteBuilder withPartition(StructLike newPartition) {
this.partition = newPartition;
return this;
}
public DataWriteBuilder withKeyMetadata(EncryptionKeyMetadata metadata) {
this.keyMetadata = metadata;
return this;
}
public DataWriteBuilder withSortOrder(SortOrder newSortOrder) {
this.sortOrder = newSortOrder;
return this;
}
public <T> DataWriter<T> build() {
Preconditions.checkArgument(spec != null, "Cannot create data writer without spec");
Preconditions.checkArgument(
spec.isUnpartitioned() || partition != null,
"Partition must not be null when creating data writer for partitioned spec");
FileAppender<T> fileAppender = appenderBuilder.build();
return new DataWriter<>(
fileAppender, FileFormat.ORC, location, spec, partition, keyMetadata, sortOrder);
}
}
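/**
 * Returns a builder for ORC equality and position delete writers.
 *
 * <p>A sketch of an equality delete writer; {@code outputFile}, {@code table},
 * {@code deleteRowSchema} (a projection of the equality delete columns), {@code fieldIds}, and
 * {@code partitionKey} are assumed to exist:
 *
 * <pre>{@code
 * EqualityDeleteWriter<Record> deleteWriter =
 *     ORC.writeDeletes(outputFile)
 *         .createWriterFunc(GenericOrcWriter::buildWriter)
 *         .withSpec(table.spec())
 *         .setAll(table.properties())
 *         .rowSchema(deleteRowSchema)
 *         .equalityFieldIds(fieldIds)
 *         .withPartition(partitionKey)
 *         .buildEqualityWriter();
 * }</pre>
 */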
public static DeleteWriteBuilder writeDeletes(OutputFile file) {
return new DeleteWriteBuilder(file);
}
public static class DeleteWriteBuilder {
private final WriteBuilder appenderBuilder;
private final String location;
private BiFunction<Schema, TypeDescription, OrcRowWriter<?>> createWriterFunc = null;
private Schema rowSchema = null;
private PartitionSpec spec = null;
private StructLike partition = null;
private EncryptionKeyMetadata keyMetadata = null;
private int[] equalityFieldIds = null;
private SortOrder sortOrder;
private Function<CharSequence, ?> pathTransformFunc = Function.identity();
private DeleteWriteBuilder(OutputFile file) {
this.appenderBuilder = write(file);
this.location = file.location();
}
public DeleteWriteBuilder forTable(Table table) {
rowSchema(table.schema());
withSpec(table.spec());
setAll(table.properties());
metricsConfig(MetricsConfig.forTable(table));
return this;
}
public DeleteWriteBuilder set(String property, String value) {
appenderBuilder.set(property, value);
return this;
}
public DeleteWriteBuilder setAll(Map<String, String> properties) {
appenderBuilder.setAll(properties);
return this;
}
public DeleteWriteBuilder meta(String property, String value) {
appenderBuilder.metadata(property, value);
return this;
}
public DeleteWriteBuilder overwrite() {
return overwrite(true);
}
public DeleteWriteBuilder overwrite(boolean enabled) {
appenderBuilder.overwrite(enabled);
return this;
}
public DeleteWriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
appenderBuilder.metricsConfig(newMetricsConfig);
return this;
}
public DeleteWriteBuilder createWriterFunc(
BiFunction<Schema, TypeDescription, OrcRowWriter<?>> newWriterFunc) {
this.createWriterFunc = newWriterFunc;
return this;
}
public DeleteWriteBuilder rowSchema(Schema newSchema) {
this.rowSchema = newSchema;
return this;
}
public DeleteWriteBuilder withSpec(PartitionSpec newSpec) {
this.spec = newSpec;
return this;
}
public DeleteWriteBuilder withPartition(StructLike key) {
this.partition = key;
return this;
}
public DeleteWriteBuilder withKeyMetadata(EncryptionKeyMetadata metadata) {
this.keyMetadata = metadata;
return this;
}
public DeleteWriteBuilder equalityFieldIds(List<Integer> fieldIds) {
this.equalityFieldIds = ArrayUtil.toIntArray(fieldIds);
return this;
}
public DeleteWriteBuilder equalityFieldIds(int... fieldIds) {
this.equalityFieldIds = fieldIds;
return this;
}
public DeleteWriteBuilder transformPaths(Function<CharSequence, ?> newPathTransformFunc) {
this.pathTransformFunc = newPathTransformFunc;
return this;
}
public DeleteWriteBuilder withSortOrder(SortOrder newSortOrder) {
this.sortOrder = newSortOrder;
return this;
}
public <T> EqualityDeleteWriter<T> buildEqualityWriter() {
Preconditions.checkState(
rowSchema != null, "Cannot create equality delete file without a schema");
Preconditions.checkState(
equalityFieldIds != null, "Cannot create equality delete file without delete field ids");
Preconditions.checkState(
createWriterFunc != null,
"Cannot create equality delete file unless createWriterFunc is set");
Preconditions.checkArgument(
spec != null, "Spec must not be null when creating equality delete writer");
Preconditions.checkArgument(
spec.isUnpartitioned() || partition != null,
"Partition must not be null for partitioned writes");
meta("delete-type", "equality");
meta(
"delete-field-ids",
IntStream.of(equalityFieldIds)
.mapToObj(Objects::toString)
.collect(Collectors.joining(", ")));
// the appender uses the row schema without extra columns
appenderBuilder.schema(rowSchema);
appenderBuilder.createWriterFunc(createWriterFunc);
appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext);
return new EqualityDeleteWriter<>(
appenderBuilder.build(),
FileFormat.ORC,
location,
spec,
partition,
keyMetadata,
sortOrder,
equalityFieldIds);
}
public <T> PositionDeleteWriter<T> buildPositionWriter() {
Preconditions.checkState(
equalityFieldIds == null, "Cannot create position delete file using delete field ids");
Preconditions.checkArgument(
spec != null, "Spec must not be null when creating position delete writer");
Preconditions.checkArgument(
spec.isUnpartitioned() || partition != null,
"Partition must not be null for partitioned writes");
Preconditions.checkArgument(
rowSchema == null || createWriterFunc != null,
"Create function should be provided if we write row data");
meta("delete-type", "position");
if (rowSchema != null && createWriterFunc != null) {
Schema deleteSchema = DeleteSchemaUtil.posDeleteSchema(rowSchema);
appenderBuilder.schema(deleteSchema);
appenderBuilder.createWriterFunc(
(schema, typeDescription) ->
GenericOrcWriters.positionDelete(
createWriterFunc.apply(deleteSchema, typeDescription), pathTransformFunc));
} else {
appenderBuilder.schema(DeleteSchemaUtil.pathPosSchema());
// 'createWriterFunc' and 'rowSchema' are ignored even if provided, since no row data is
// written for path/position-only deletes
appenderBuilder.createWriterFunc(
(schema, typeDescription) ->
GenericOrcWriters.positionDelete(
GenericOrcWriter.buildWriter(schema, typeDescription), Function.identity()));
}
appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext);
return new PositionDeleteWriter<>(
appenderBuilder.build(), FileFormat.ORC, location, spec, partition, keyMetadata);
}
}
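/**
 * Returns a builder for reading an ORC file as Iceberg rows.
 *
 * <p>A minimal read sketch; {@code inputFile} and {@code projectedSchema} are assumed to exist,
 * and {@code GenericOrcReader} is one possible row reader implementation:
 *
 * <pre>{@code
 * try (CloseableIterable<Record> rows =
 *     ORC.read(inputFile)
 *         .project(projectedSchema)
 *         .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(projectedSchema, fileSchema))
 *         .filter(Expressions.equal("id", 5))
 *         .build()) {
 *   rows.forEach(record -> System.out.println(record));
 * }
 * }</pre>
 */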
public static ReadBuilder read(InputFile file) {
return new ReadBuilder(file);
}
public static class ReadBuilder {
private final InputFile file;
private final Configuration conf;
private Schema schema = null;
private Long start = null;
private Long length = null;
private Expression filter = null;
private boolean caseSensitive = true;
private NameMapping nameMapping = null;
private Function<TypeDescription, OrcRowReader<?>> readerFunc;
private Function<TypeDescription, OrcBatchReader<?>> batchedReaderFunc;
private int recordsPerBatch = VectorizedRowBatch.DEFAULT_SIZE;
private ReadBuilder(InputFile file) {
Preconditions.checkNotNull(file, "Input file cannot be null");
this.file = file;
if (file instanceof HadoopInputFile) {
this.conf = new Configuration(((HadoopInputFile) file).getConf());
} else {
this.conf = new Configuration();
}
// We need to turn positional schema evolution off since we use column name based schema
// evolution for projection
this.conf.setBoolean(OrcConf.FORCE_POSITIONAL_EVOLUTION.getHiveConfName(), false);
}
/**
* Restricts the read to the given range: [start, start + length).
*
* @param newStart the start position for this read
* @param newLength the length of the range this read should scan
* @return this builder for method chaining
*/
public ReadBuilder split(long newStart, long newLength) {
this.start = newStart;
this.length = newLength;
return this;
}
public ReadBuilder project(Schema newSchema) {
this.schema = newSchema;
return this;
}
public ReadBuilder caseSensitive(boolean newCaseSensitive) {
OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(this.conf, newCaseSensitive);
this.caseSensitive = newCaseSensitive;
return this;
}
public ReadBuilder config(String property, String value) {
conf.set(property, value);
return this;
}
public ReadBuilder createReaderFunc(Function<TypeDescription, OrcRowReader<?>> readerFunction) {
Preconditions.checkArgument(
this.batchedReaderFunc == null,
"Reader function cannot be set since the batched version is already set");
this.readerFunc = readerFunction;
return this;
}
public ReadBuilder filter(Expression newFilter) {
this.filter = newFilter;
return this;
}
public ReadBuilder createBatchedReaderFunc(
Function<TypeDescription, OrcBatchReader<?>> batchReaderFunction) {
Preconditions.checkArgument(
this.readerFunc == null,
"Batched reader function cannot be set since the non-batched version is already set");
this.batchedReaderFunc = batchReaderFunction;
return this;
}
public ReadBuilder recordsPerBatch(int numRecordsPerBatch) {
this.recordsPerBatch = numRecordsPerBatch;
return this;
}
public ReadBuilder withNameMapping(NameMapping newNameMapping) {
this.nameMapping = newNameMapping;
return this;
}
public <D> CloseableIterable<D> build() {
Preconditions.checkNotNull(schema, "Schema is required");
return new OrcIterable<>(
file,
conf,
schema,
nameMapping,
start,
length,
readerFunc,
caseSensitive,
filter,
batchedReaderFunc,
recordsPerBatch);
}
}
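/**
 * Opens an ORC {@link Reader} for the given {@link InputFile}, reusing the Hadoop FileSystem when
 * the file is a {@link HadoopInputFile} and otherwise wrapping it in a stream-only FileSystem.
 */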
static Reader newFileReader(InputFile file, Configuration config) {
ReaderOptions readerOptions = OrcFile.readerOptions(config).useUTCTimestamp(true);
if (file instanceof HadoopInputFile) {
readerOptions.filesystem(((HadoopInputFile) file).getFileSystem());
} else {
// For any other InputFile, wrap it in an InputFileSystem that only supports creating an
// InputStream. The length is supplied up front to avoid a file status call to determine it.
readerOptions.filesystem(new FileIOFSUtil.InputFileSystem(file)).maxLength(file.getLength());
}
try {
return OrcFile.createReader(new Path(file.location()), readerOptions);
} catch (IOException ioe) {
throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location());
}
}
static Writer newFileWriter(
OutputFile file, OrcFile.WriterOptions options, Map<String, byte[]> metadata) {
if (file instanceof HadoopOutputFile) {
options.fileSystem(((HadoopOutputFile) file).getFileSystem());
} else {
options.fileSystem(new FileIOFSUtil.OutputFileSystem(file));
}
final Path locPath = new Path(file.location());
final Writer writer;
try {
writer = OrcFile.createWriter(locPath, options);
} catch (IOException ioe) {
throw new RuntimeIOException(ioe, "Can't create file %s", locPath);
}
metadata.forEach((key, value) -> writer.addUserMetadata(key, ByteBuffer.wrap(value)));
return writer;
}
}