org.apache.parquet.column.impl.SynchronizingColumnReader Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.column.impl;

import java.util.PrimitiveIterator;

import org.apache.parquet.VersionParser.ParsedVersion;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.io.api.PrimitiveConverter;

/**
 * A {@link ColumnReader} implementation for utilizing indexes. When filtering using column indexes we might skip
 * reading some pages for different columns. Because the rows are not aligned between the pages of the different columns
 * it might be required to skip some values in this {@link ColumnReader} so we provide only the required values for the
 * higher API ({@link RecordReader}) and they do not need to handle or know about the skipped pages. The values (and the
 * related rl and dl) are skipped based on the iterator of the required row indexes and the first row index of each
 * page.
 * <p>
 * For example:
 *
 * <pre>
 * rows   col1   col2   col3
 *      ┌──────┬──────┬──────┐
 *   0  │  p0  │      │      │
 *      ╞══════╡  p0  │  p0  │
 *  20  │ p1(X)│------│------│
 *      ╞══════╪══════╡      │
 *  40  │ p2(X)│      │------│
 *      ╞══════╡ p1(X)╞══════╡
 *  60  │ p3(X)│      │------│
 *      ╞══════╪══════╡      │
 *  80  │  p4  │      │  p1  │
 *      ╞══════╡  p2  │      │
 * 100  │  p5  │      │      │
 *      └──────┴──────┴──────┘
 * </pre>
 *
 * Pages 1, 2 and 3 in col1 are skipped, so we have to skip the rows [20, 79]. Because page 1 in col2 contains values
 * only for the rows [40, 79], we skip this entire page as well. To synchronize the row reading, we have to skip the
 * values (and the related rl and dl) for the rows [20, 39] at the end of page 0 for col2. Similarly, we have to
 * skip values while reading page 0 and page 1 for col3.
 */
class SynchronizingColumnReader extends ColumnReaderBase {

  /** Supplies the indexes of the rows that have to be read, one at a time. */
  private final PrimitiveIterator.OfLong rowIndexes;
  /** Index of the row the most recently counted value belongs to. */
  private long currentRow;
  /** Next row index taken from {@link #rowIndexes}; {@code Long.MAX_VALUE} once the iterator is exhausted. */
  private long targetRow;
  /** Index of the last row contained in the current page. */
  private long lastRowInPage;
  /** Number of values counted by {@link #skipRL(int)} since the current page was initialized. */
  private int valuesReadFromPage;

  /**
   * Creates the reader and immediately calls {@code consume()}.
   *
   * @param path          descriptor of the column, passed on to the base class
   * @param pageReader    source of the pages, passed on to the base class
   * @param converter     converter receiving the values, passed on to the base class
   * @param writerVersion version of the writer, passed on to the base class
   * @param rowIndexes    iterator over the indexes of the rows to be read
   */
  SynchronizingColumnReader(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter,
      ParsedVersion writerVersion, PrimitiveIterator.OfLong rowIndexes) {
    super(path, pageReader, converter, writerVersion);
    this.rowIndexes = rowIndexes;
    // Smaller than any row index, so the first row start seen by skipRL fetches the first target row.
    this.targetRow = Long.MIN_VALUE;
    consume();
  }

  /**
   * @return {@code true} if all the values of the current page have been counted or the next target row lies
   *         beyond the last row of the current page
   */
  @Override
  boolean isPageFullyConsumed() {
    final int pageValueCount = getPageValueCount();
    if (pageValueCount <= valuesReadFromPage) {
      return true;
    }
    return targetRow > lastRowInPage;
  }

  /**
   * @return {@code true} if the row index iterator has no more elements
   */
  @Override
  boolean isFullyConsumed() {
    final boolean moreRowsRequested = rowIndexes.hasNext();
    return !moreRowsRequested;
  }

  /**
   * Counts one more value of the current page and keeps the row bookkeeping up to date. A repetition level of
   * {@code 0} is treated as the start of the next row; when that row is past the current target row, the next
   * target row is fetched.
   *
   * @param rl the repetition level of the value
   * @return {@code true} if the current row is before the target row, {@code false} otherwise
   */
  @Override
  boolean skipRL(int rl) {
    valuesReadFromPage++;
    final boolean startsNewRow = rl == 0;
    if (startsNewRow) {
      currentRow++;
      if (targetRow < currentRow) {
        moveToNextTargetRow();
      }
    }
    return targetRow > currentRow;
  }

  /**
   * Advances {@link #targetRow} to the next requested row index, or to {@code Long.MAX_VALUE} if the iterator has
   * no more elements.
   */
  private void moveToNextTargetRow() {
    if (rowIndexes.hasNext()) {
      targetRow = rowIndexes.nextLong();
    } else {
      targetRow = Long.MAX_VALUE;
    }
  }

  /**
   * Resets the per-page state from the row range of the new page.
   *
   * @param page the page that has just been initialized
   * @throws IllegalArgumentException if the page provides no first row index or no row count
   */
  @Override
  protected void newPageInitialized(DataPage page) {
    // Resolve both values before touching any field so a failure leaves the previous state intact.
    final long firstRowIndex = page.getFirstRowIndex()
        .orElseThrow(() -> new IllegalArgumentException("Missing firstRowIndex for synchronizing values"));
    final int rowCount = page.getIndexRowCount()
        .orElseThrow(() -> new IllegalArgumentException("Missing rowCount for synchronizing values"));

    // One before the first row of the page: skipRL increments currentRow when a row starts.
    final long rowBeforePage = firstRowIndex - 1;
    valuesReadFromPage = 0;
    currentRow = rowBeforePage;
    lastRowInPage = rowBeforePage + rowCount;
  }

}