org.apache.iceberg.orc.OrcIterable Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-orc Show documentation
A table format for huge analytic datasets
There is a newer version: 1.7.1
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.orc;

import java.io.IOException;
import java.util.function.Function;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.expressions.Binder;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableGroup;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.util.Pair;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.storage.ql.io.sarg.SearchArgument;

/** Iterable used to read rows from ORC. */
class OrcIterable extends CloseableGroup implements CloseableIterable {
  private final Configuration config;
  private final Schema schema;
  private final InputFile file;
  private final Long start;
  private final Long length;
  private final Function> readerFunction;
  private final Expression filter;
  private final boolean caseSensitive;
  private final Function> batchReaderFunction;
  private final int recordsPerBatch;
  private NameMapping nameMapping;

  OrcIterable(
      InputFile file,
      Configuration config,
      Schema schema,
      NameMapping nameMapping,
      Long start,
      Long length,
      Function> readerFunction,
      boolean caseSensitive,
      Expression filter,
      Function> batchReaderFunction,
      int recordsPerBatch) {
    this.schema = schema;
    this.readerFunction = readerFunction;
    this.file = file;
    this.nameMapping = nameMapping;
    this.start = start;
    this.length = length;
    this.config = config;
    this.caseSensitive = caseSensitive;
    this.filter = (filter == Expressions.alwaysTrue()) ? null : filter;
    this.batchReaderFunction = batchReaderFunction;
    this.recordsPerBatch = recordsPerBatch;
  }

  @SuppressWarnings("unchecked")
  @Override
  public CloseableIterator iterator() {
    Reader orcFileReader = ORC.newFileReader(file, config);
    addCloseable(orcFileReader);

    TypeDescription fileSchema = orcFileReader.getSchema();
    final TypeDescription readOrcSchema;
    if (ORCSchemaUtil.hasIds(fileSchema)) {
      readOrcSchema = ORCSchemaUtil.buildOrcProjection(schema, fileSchema);
    } else {
      if (nameMapping == null) {
        nameMapping = MappingUtil.create(schema);
      }
      TypeDescription typeWithIds = ORCSchemaUtil.applyNameMapping(fileSchema, nameMapping);
      readOrcSchema = ORCSchemaUtil.buildOrcProjection(schema, typeWithIds);
    }

    SearchArgument sarg = null;
    if (filter != null) {
      Expression boundFilter = Binder.bind(schema.asStruct(), filter, caseSensitive);
      sarg = ExpressionToSearchArgument.convert(boundFilter, readOrcSchema);
    }

    VectorizedRowBatchIterator rowBatchIterator =
        newOrcIterator(file, readOrcSchema, start, length, orcFileReader, sarg, recordsPerBatch);
    if (batchReaderFunction != null) {
      OrcBatchReader batchReader = (OrcBatchReader) batchReaderFunction.apply(readOrcSchema);
      return CloseableIterator.transform(
          rowBatchIterator,
          pair -> {
            batchReader.setBatchContext(pair.second());
            return batchReader.read(pair.first());
          });
    } else {
      return new OrcRowIterator<>(
          rowBatchIterator, (OrcRowReader) readerFunction.apply(readOrcSchema));
    }
  }

  private static VectorizedRowBatchIterator newOrcIterator(
      InputFile file,
      TypeDescription readerSchema,
      Long start,
      Long length,
      Reader orcFileReader,
      SearchArgument sarg,
      int recordsPerBatch) {
    final Reader.Options options = orcFileReader.options();
    if (start != null) {
      options.range(start, length);
    }
    options.schema(readerSchema);
    options.searchArgument(sarg, new String[] {});

    try {
      return new VectorizedRowBatchIterator(
          file.location(), readerSchema, orcFileReader.rows(options), recordsPerBatch);
    } catch (IOException ioe) {
      throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", file);
    }
  }

  private static class OrcRowIterator implements CloseableIterator {

    private int nextRow;
    private VectorizedRowBatch current;
    private int currentBatchSize;

    private final VectorizedRowBatchIterator batchIter;
    private final OrcRowReader reader;

    OrcRowIterator(VectorizedRowBatchIterator batchIter, OrcRowReader reader) {
      this.batchIter = batchIter;
      this.reader = reader;
      current = null;
      nextRow = 0;
      currentBatchSize = 0;
    }

    @Override
    public boolean hasNext() {
      return (current != null && nextRow < currentBatchSize) || batchIter.hasNext();
    }

    @Override
    public T next() {
      if (current == null || nextRow >= currentBatchSize) {
        Pair nextBatch = batchIter.next();
        current = nextBatch.first();
        currentBatchSize = current.size;
        nextRow = 0;
        this.reader.setBatchContext(nextBatch.second());
      }
      // If selected is in use then the row ids should be determined from the selected vector
      int rowId = current.isSelectedInUse() ? current.selected[nextRow] : nextRow;
      nextRow++;
      return this.reader.read(current, rowId);
    }

    @Override
    public void close() throws IOException {
      batchIter.close();
    }
  }
}