All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.parquet.column.impl.SynchronizingColumnReader Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.column.impl;

import java.util.PrimitiveIterator;
import org.apache.parquet.VersionParser.ParsedVersion;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.io.api.PrimitiveConverter;

/**
 * A {@link ColumnReader} implementation for utilizing indexes. When filtering using column indexes we might skip
 * reading some pages for different columns. Because the rows are not aligned between the pages of the different columns
 * it might be required to skip some values in this {@link ColumnReader} so we provide only the required values for the
 * higher API ({@link RecordReader}) and they do not need to handle or know about the skipped pages. The values (and the
 * related rl and dl) are skipped based on the iterator of the required row indexes and the first row index of each
 * page.
 * <p>
 * For example:
 *
 * <pre>
 * rows   col1   col2   col3
 *      ┌──────┬──────┬──────┐
 *   0  │  p0  │      │      │
 *      ╞══════╡  p0  │  p0  │
 *  20  │ p1(X)│------│------│
 *      ╞══════╪══════╡      │
 *  40  │ p2(X)│      │------│
 *      ╞══════╡ p1(X)╞══════╡
 *  60  │ p3(X)│      │------│
 *      ╞══════╪══════╡      │
 *  80  │  p4  │      │  p1  │
 *      ╞══════╡  p2  │      │
 * 100  │  p5  │      │      │
 *      └──────┴──────┴──────┘
 * </pre>
 *
* The pages 1, 2, 3 in col1 are skipped so we have to skip the rows [20, 79]. Because page 1 in col2 contains values
 * only for the rows [40, 79] we skip this entire page as well. To synchronize the row reading we have to skip the
 * values (and the related rl and dl) for the rows [20, 39] in the end of the page 0 for col2. Similarly, we have to
 * skip values while reading page0 and page1 for col3.
 */
class SynchronizingColumnReader extends ColumnReaderBase {

  /** Iterator over the indexes of the required rows; each call to {@code nextLong()} yields the next target. */
  private final PrimitiveIterator.OfLong rowIndexes;

  /** Index of the row the reader is currently positioned on; advanced whenever an rl of 0 is seen. */
  private long currentRow;

  /** Index of the next required row, or {@code Long.MAX_VALUE} once {@link #rowIndexes} is exhausted. */
  private long targetRow;

  /** Index of the last row of the current page, derived from the page's first row index and row count. */
  private long lastRowInPage;

  /** Number of values seen (via {@link #skipRL(int)}) since the current page was initialized. */
  private int valuesReadFromPage;

  /**
   * Creates the reader and immediately starts consuming via the inherited {@code consume()}.
   *
   * @param path          descriptor of the column, forwarded to the superclass
   * @param pageReader    page source, forwarded to the superclass
   * @param converter     value sink, forwarded to the superclass
   * @param writerVersion version of the writer, forwarded to the superclass
   * @param rowIndexes    iterator of the row indexes that have to be provided
   */
  SynchronizingColumnReader(
      ColumnDescriptor path,
      PageReader pageReader,
      PrimitiveConverter converter,
      ParsedVersion writerVersion,
      PrimitiveIterator.OfLong rowIndexes) {
    super(path, pageReader, converter, writerVersion);
    this.rowIndexes = rowIndexes;
    // Smaller than any row index, so the first row start seen in skipRL pulls the first real target.
    this.targetRow = Long.MIN_VALUE;
    // Must stay last: the state above has to be in place before reading begins.
    consume();
  }

  /**
   * @return {@code true} when every value of the current page has been seen, or when the next target row lies
   *         beyond the last row of the current page
   */
  @Override
  boolean isPageFullyConsumed() {
    if (getPageValueCount() <= valuesReadFromPage) {
      return true;
    }
    return lastRowInPage < targetRow;
  }

  /**
   * @return {@code true} when the row index iterator has no more rows to offer
   */
  @Override
  boolean isFullyConsumed() {
    final boolean moreRowsRequested = rowIndexes.hasNext();
    return !moreRowsRequested;
  }

  /**
   * Accounts for one more value of the current page and updates the row tracking.
   *
   * @param rl the rl of the value; 0 is treated as the start of the next row
   * @return {@code true} if the current row is before the target row, {@code false} otherwise
   */
  @Override
  boolean skipRL(int rl) {
    valuesReadFromPage++;
    final boolean startsNewRow = rl == 0;
    if (startsNewRow) {
      currentRow++;
      // The previous target has been passed: fetch the next required row, or park the target at
      // Long.MAX_VALUE when no further rows are requested.
      if (targetRow < currentRow) {
        if (rowIndexes.hasNext()) {
          targetRow = rowIndexes.nextLong();
        } else {
          targetRow = Long.MAX_VALUE;
        }
      }
    }
    return targetRow > currentRow;
  }

  /**
   * Resets the per-page state from the row range of the newly initialized page.
   *
   * @param page the page that has just been initialized
   * @throws IllegalArgumentException if the page does not carry its first row index or its row count
   */
  @Override
  protected void newPageInitialized(DataPage page) {
    final long firstRow = page.getFirstRowIndex()
        .orElseThrow(() -> new IllegalArgumentException("Missing firstRowIndex for synchronizing values"));
    final int rowsInPage = page.getIndexRowCount()
        .orElseThrow(() -> new IllegalArgumentException("Missing rowCount for synchronizing values"));
    valuesReadFromPage = 0;
    // One before the first row, so the first rl == 0 in skipRL moves currentRow onto the first row.
    currentRow = firstRow - 1;
    lastRowInPage = firstRow + rowsInPage - 1;
  }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy