// uk.ac.starlink.parquet.SequentialParquetStarTable

package uk.ac.starlink.parquet;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.function.LongSupplier;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.ColumnReadStore;
import org.apache.parquet.column.impl.ColumnReaderImpl;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.schema.MessageType;
import uk.ac.starlink.table.RowSequence;
import uk.ac.starlink.table.RowSplittable;
import uk.ac.starlink.util.IOSupplier;

/**
 * ParquetStarTable concrete subclass that provides sequential access only.
 *
 * @author   Mark Taylor
 * @since    25 Feb 2021
 */
public class SequentialParquetStarTable extends ParquetStarTable {

    /**
     * Constructor.
     *
     * <p>The supplier is handed to the superclass, which uses it to
     * (re-)open readers on the underlying parquet file as required.
     *
     * @param  pfrSupplier  access to parquet data file
     * @throws IOException  if the file cannot be opened or read
     */
    public SequentialParquetStarTable( IOSupplier<ParquetFileReader>
                                       pfrSupplier )
            throws IOException {
        super( pfrSupplier );
    }

    /**
     * Indicates whether random access is provided;
     * always false for this sequential-only implementation.
     */
    public boolean isRandom() {
        return false;
    }

    /**
     * Returns a sequence over the table rows.
     * The returned object is a ParquetRowSplittable, which implements
     * RowSequence as well as RowSplittable.
     */
    public RowSequence getRowSequence() throws IOException {
        return new ParquetRowSplittable();
    }

    /**
     * Returns a splittable row sequence over the table rows,
     * suitable for parallel processing; splits are made on
     * parquet row-group boundaries.
     */
    public RowSplittable getRowSplittable() throws IOException {
        return new ParquetRowSplittable();
    }

    /**
     * Returns an array giving the number of rows in each row block
     * for a parquet file.
     *
     * @param   pfr  file reader
     * @return   array giving row counts per row group
     */
    private static long[] getBlockSizes( ParquetFileReader pfr ) {
        List<BlockMetaData> rowGroups = pfr.getRowGroups();
        int nblock = rowGroups.size();
        long[] sizes = new long[ nblock ];
        for ( int ib = 0; ib < nblock; ib++ ) {
            sizes[ ib ] = rowGroups.get( ib ).getRowCount();
        }
        return sizes;
    }

    /**
     * RowSplittable implementation for ParquetStarTable;
     * it also serves as a plain RowSequence.
     *
     * <p>Rows are read strictly sequentially within each row group.
     * Column data is decoded lazily in {@link #getCell}, so columns
     * that are never queried within a given row group are never read.
     * Splitting is performed on row-group boundaries, and is only
     * possible before any row group has been opened.
     */
    private class ParquetRowSplittable implements RowSplittable {

        final ParquetFileReader pfr_;    // reader owned by this splittable
        final int ncol_;                 // number of table columns
        final MessageType schema_;       // parquet file schema
        final long[] irows_;             // per-column index of last row read
        final long[] blockSizes_;        // row count for each row group
        int iblock_;                     // index of current row group
        int iblockEnd_;                  // index after last row group to read
        ColAccess[] colAccesses_;        // column readers for current group;
                                         // null until first row group opened
        long irow_;                      // current row index (table-wide)
        long irGroupEnd_;                // row index after current row group

        /**
         * Constructs a splittable over the whole table.
         */
        public ParquetRowSplittable() throws IOException {
            this( -1, -1 );
        }

        /**
         * Constructor for internal use, constructs a sub-splittable
         * covering a contiguous range of row groups.
         *
         * @param  iblock  index before first block to be processed
         * @param  iblockEnd  index after last block to be processed,
         *                    or -1 for last block in table
         */
        private ParquetRowSplittable( int iblock, int iblockEnd )
                throws IOException {
            pfr_ = getParquetFileReader();
            ncol_ = getColumnCount();
            schema_ = getSchema();
            blockSizes_ = getBlockSizes( pfr_ );
            irows_ = new long[ ncol_ ];
            iblock_ = -1;
            irow_ = -1;
            iblockEnd_ = iblockEnd >= 0 ? iblockEnd : blockSizes_.length;

            /* Advance past the blocks preceding our range; this also
             * updates irow_ so that rowIndex() reports table-wide values. */
            skipBlocks( iblock + 1 );
        }

        public LongSupplier rowIndex() {
            return () -> irow_;
        }

        public ParquetRowSplittable split() {

            /* Splitting is only possible before any row group has been
             * opened (colAccesses_ still null), and only worthwhile if
             * more than two row groups remain. */
            if ( colAccesses_ == null && iblockEnd_ - iblock_ > 2 ) {
                int mid = ( 1 + iblock_ + iblockEnd_ ) / 2;
                ParquetRowSplittable split;
                try {

                    /* The new splittable takes the first half of the
                     * remaining blocks; this one skips forward so that it
                     * will continue from the midpoint. */
                    split = new ParquetRowSplittable( iblock_, mid );
                    skipBlocks( mid - 1 - iblock_ );
                }
                catch ( IOException e ) {
                    return null;
                }
                return split;
            }
            else {
                return null;
            }
        }

        public long splittableSize() {

            /* Accumulate in a long: block sizes are longs, and an int
             * accumulator could overflow for very large tables. */
            long nr = 0;
            for ( int ib = iblock_ + 1; ib < iblockEnd_; ib++ ) {
                nr += blockSizes_[ ib ];
            }
            return nr;
        }

        public boolean next() throws IOException {
            assert irow_ < irGroupEnd_;
            if ( irow_ + 1 == irGroupEnd_ ) {
                assert iblock_ < iblockEnd_;
                if ( iblock_ + 1 == iblockEnd_ ) {
                    return false;
                }
                nextReadStore();

                /* Mark every column as positioned at the row preceding
                 * the new row group, so lazy reads know how far to skip. */
                Arrays.fill( irows_, irow_ );
            }
            irow_++;
            return true;
        }

        public Object getCell( int icol ) {

            /* colAccesses_ is only populated once next() has opened the
             * first row group; an explicit check is clearer than catching
             * the NullPointerException that array access would throw. */
            if ( colAccesses_ == null ) {
                throw new IllegalStateException( "next() not called" );
            }
            ColAccess colAccess = colAccesses_[ icol ];

            /* Make sure that we have read or skipped to the current
             * position before reading the value of the cell.
             * By doing it on demand here rather than during the next call,
             * we can avoid reading any data at all for those columns that
             * are never read in a given row group. */
            long nadv = irow_ - irows_[ icol ];
            if ( nadv > 0 ) {
                if ( nadv > 1 ) {
                    colAccess.skip( nadv - 1 );
                }
                colAccess.clear();
            }
            irows_[ icol ] = irow_;
            return colAccess.read();
        }

        public Object[] getRow() {
            Object[] row = new Object[ ncol_ ];
            for ( int ic = 0; ic < ncol_; ic++ ) {
                row[ ic ] = getCell( ic );
            }
            return row;
        }

        public void close() throws IOException {
            pfr_.close();
        }

        /**
         * Reads the next row group and prepares column handlers
         * ready to supply its content.
         *
         * @return  true if a row group was read,
         *          false if there are no more row groups
         */
        private boolean nextReadStore() throws IOException {
            PageReadStore pageStore = pfr_.readNextRowGroup();
            if ( pageStore == null ) {
                return false;
            }
            iblock_++;
            assert pageStore.getRowCount() == blockSizes_[ iblock_ ];
            irGroupEnd_ += pageStore.getRowCount();
            ColumnReadStore crstore = getColumnReadStore( pageStore, schema_ );
            colAccesses_ = new ColAccess[ ncol_ ];
            for ( int ic = 0; ic < ncol_; ic++ ) {
                colAccesses_[ ic ] =
                    createColAccess( crstore, getInputColumn( ic ) );
            }
            return true;
        }

        /**
         * Skips over a given number of row blocks from the current position
         * without reading the data.
         *
         * @param  nb  number of blocks to skip
         */
        private void skipBlocks( int nb ) throws IOException {
            for ( int ib = 0; ib < nb; ib++ ) {
                if ( ! pfr_.skipNextRowGroup() ) {
                    throw new IOException( "Failed to skip row group" );
                }
                iblock_++;
                long nr = blockSizes_[ iblock_ ];
                irow_ += nr;
                irGroupEnd_ += nr;
            }
        }
    }

    /**
     * Creates a ColAccess for accessing a given column.
     */
    private static  ColAccess
            createColAccess( ColumnReadStore crstore,
                             final InputColumn incol ) {

        /* 

This implementation was written with reference to * org.apache.parquet.tools.command.DumpCommand. * There is a certain amount of guesswork involved in use * of the generally under-documented parquet-mr data access API. */ /* Prepare to read. */ ColumnDescriptor cdesc = incol.getColumnDescriptor(); final int cdefmax = cdesc.getMaxDefinitionLevel(); final Decoder decoder = incol.createDecoder(); final ColumnReader crdr = crstore.getColumnReader( cdesc ); /* The readValue method is cleaner and a bit faster, so use that * where possible, but we don't have compile-time guarantees that * it's available, so fall back to advancing by hand if required. */ final Runnable readValue = crdr instanceof ColumnReaderImpl ? ((ColumnReaderImpl) crdr)::readValue : () -> { decoder.clearValue(); decoder.readItem( crdr ); }; return new ColAccess() { private boolean hasValue_; private T value_; public void clear() { hasValue_ = false; } public void skip( long n ) { for ( long i = 0; i < n; i++ ) { /* From the scant parquet-mr API documentation I would have * thought that just a consume call here would be the * right thing to do, but it seems you need to read * the value as well. */ do { if ( crdr.getCurrentDefinitionLevel() == cdefmax ) { readValue.run(); } crdr.consume(); } while ( crdr.getCurrentRepetitionLevel() > 0 ); } } public T read() { if ( ! hasValue_ ) { decoder.clearValue(); do { if ( crdr.getCurrentDefinitionLevel() == cdefmax ) { decoder.readItem( crdr ); } // I thought that I ought to be passing a null along // in this case. However, it does the wrong thing // at least for some arrays. Hmm. // else { // decoder.readNull(); // } crdr.consume(); } while ( crdr.getCurrentRepetitionLevel() > 0 ); value_ = decoder.getValue(); hasValue_ = true; } return value_; } }; } /** * Manages reading, caching and skipping values from a column. */ private static interface ColAccess { /** * Indicates that the most recently-read value will no longer be * required. 
*/ void clear(); /** * Skips over a given number of column entries. * * @param n number of entries to skip. */ void skip( long n ); /** * Returns the current value, reading it from the column data * if it has not already been read. * * @return entry value */ T read(); } }





// © 2015 - 2024 Weber Informatics LLC | Privacy Policy