// org.apache.iceberg.orc.OrcIterable (Maven / Gradle / Ivy)
// Go to download
// Show more of this group. Show more artifacts with this name.
// Show all versions of iceberg-orc. Show documentation.
// A table format for huge analytic datasets.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.orc;
import java.io.IOException;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.function.Function;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.expressions.Binder;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableGroup;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.util.Pair;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.storage.ql.io.sarg.SearchArgument;
/**
* Iterable used to read rows from ORC.
*/
class OrcIterable extends CloseableGroup implements CloseableIterable {
private final Configuration config;
private final Schema schema;
private final InputFile file;
private final Long start;
private final Long length;
private final Function> readerFunction;
private final Expression filter;
private final boolean caseSensitive;
private final Function> batchReaderFunction;
private final int recordsPerBatch;
private NameMapping nameMapping;
private final OrcRowFilter rowFilter;
OrcIterable(InputFile file, Configuration config, Schema schema,
NameMapping nameMapping, Long start, Long length,
Function> readerFunction, boolean caseSensitive, Expression filter,
Function> batchReaderFunction, int recordsPerBatch,
OrcRowFilter rowFilter) {
this.schema = schema;
this.readerFunction = readerFunction;
this.file = file;
this.nameMapping = nameMapping;
this.start = start;
this.length = length;
this.config = config;
this.caseSensitive = caseSensitive;
this.filter = (filter == Expressions.alwaysTrue()) ? null : filter;
this.batchReaderFunction = batchReaderFunction;
this.recordsPerBatch = recordsPerBatch;
this.rowFilter = rowFilter;
}
@SuppressWarnings("unchecked")
@Override
public CloseableIterator iterator() {
Reader orcFileReader = ORC.newFileReader(file, config);
addCloseable(orcFileReader);
TypeDescription fileSchema = orcFileReader.getSchema();
final TypeDescription readOrcSchema;
final TypeDescription fileSchemaWithIds;
if (ORCSchemaUtil.hasIds(fileSchema)) {
fileSchemaWithIds = fileSchema;
} else {
if (nameMapping == null) {
nameMapping = MappingUtil.create(schema);
}
fileSchemaWithIds = ORCSchemaUtil.applyNameMapping(fileSchema, nameMapping);
}
readOrcSchema = ORCSchemaUtil.buildOrcProjection(schema, fileSchemaWithIds);
// If the projected ORC schema is an empty struct, it means we are only projecting columns
// with default values that aren't existent in previously written files, and thus we won't need
// to push down filters to ORC's SearchArgument, since we are not reading anything from files at all
boolean isEmptyStruct = readOrcSchema.getChildren().size() == 0;
SearchArgument sarg = null;
if (filter != null && !isEmptyStruct) {
Expression boundFilter = Binder.bind(schema.asStruct(), filter, caseSensitive);
sarg = ExpressionToSearchArgument.convert(boundFilter, readOrcSchema);
}
if (rowFilter == null) {
VectorizedRowBatchIterator rowBatchIterator = newOrcIterator(file, readOrcSchema, start, length, orcFileReader,
sarg, recordsPerBatch);
if (batchReaderFunction != null) {
OrcBatchReader batchReader = (OrcBatchReader) batchReaderFunction.apply(readOrcSchema);
return CloseableIterator.transform(rowBatchIterator, pair -> {
batchReader.setBatchContext(pair.second());
return batchReader.read(pair.first());
});
} else {
return new OrcRowIterator<>(rowBatchIterator, (OrcRowReader) readerFunction.apply(readOrcSchema),
null, null);
}
} else {
Preconditions.checkArgument(batchReaderFunction == null,
"Row-level filtering not supported by vectorized reader");
Set filterColumnIds = TypeUtil.getProjectedIds(rowFilter.requiredSchema());
Set filterColumnIdsNotInReadSchema = Sets.difference(filterColumnIds,
TypeUtil.getProjectedIds(schema));
Schema extraFilterColumns = TypeUtil.select(rowFilter.requiredSchema(), filterColumnIdsNotInReadSchema);
Schema finalReadSchema = TypeUtil.join(schema, extraFilterColumns);
TypeDescription finalReadOrcSchema = ORCSchemaUtil.buildOrcProjection(finalReadSchema, fileSchemaWithIds);
TypeDescription rowFilterOrcSchema = ORCSchemaUtil.buildOrcProjection(rowFilter.requiredSchema(),
fileSchemaWithIds);
RowFilterValueReader filterReader = new RowFilterValueReader(finalReadOrcSchema, rowFilterOrcSchema);
return new OrcRowIterator<>(
newOrcIterator(file, finalReadOrcSchema, start, length, orcFileReader, sarg, recordsPerBatch),
(OrcRowReader) readerFunction.apply(readOrcSchema), rowFilter, filterReader);
}
}
private static VectorizedRowBatchIterator newOrcIterator(InputFile file,
TypeDescription readerSchema,
Long start, Long length,
Reader orcFileReader, SearchArgument sarg,
int recordsPerBatch) {
final Reader.Options options = orcFileReader.options();
if (start != null) {
options.range(start, length);
}
options.schema(readerSchema);
options.searchArgument(sarg, new String[]{});
try {
return new VectorizedRowBatchIterator(file.location(), readerSchema, orcFileReader.rows(options),
recordsPerBatch);
} catch (IOException ioe) {
throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", file);
}
}
private static class OrcRowIterator implements CloseableIterator {
private int currentRow;
private VectorizedRowBatch currentBatch;
private boolean advanced = false;
private final VectorizedRowBatchIterator batchIter;
private final OrcRowReader reader;
private final OrcRowFilter filter;
private final RowFilterValueReader filterReader;
OrcRowIterator(VectorizedRowBatchIterator batchIter, OrcRowReader reader, OrcRowFilter filter,
RowFilterValueReader filterReader) {
this.batchIter = batchIter;
this.reader = reader;
this.filter = filter;
this.filterReader = filterReader;
currentBatch = null;
currentRow = 0;
}
private void advance() {
if (!advanced) {
while (true) {
currentRow++;
// if batch has been consumed, move to next batch
if (currentBatch == null || currentRow >= currentBatch.size) {
if (batchIter.hasNext()) {
Pair nextBatch = batchIter.next();
currentBatch = nextBatch.first();
currentRow = 0;
reader.setBatchContext(nextBatch.second());
if (filterReader != null) {
filterReader.setBatchContext(nextBatch.second());
}
} else {
// no more batches left to process
currentBatch = null;
currentRow = -1;
break;
}
}
if (filter == null || filter.shouldKeep(filterReader.read(currentBatch, currentRow))) {
// we have found our row
break;
}
}
advanced = true;
}
}
@Override
public boolean hasNext() {
advance();
return currentBatch != null;
}
@Override
public T next() {
advance();
// mark current row as used
advanced = false;
return this.reader.read(currentBatch, currentRow);
}
@Override
public void close() throws IOException {
batchIter.close();
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy