package uk.ac.starlink.parquet;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.function.LongSupplier;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.ColumnReadStore;
import org.apache.parquet.column.impl.ColumnReaderImpl;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.schema.MessageType;
import uk.ac.starlink.table.RowSequence;
import uk.ac.starlink.table.RowSplittable;
import uk.ac.starlink.util.IOSupplier;
/**
* ParquetStarTable concrete subclass that provides sequential access only.
*
* @author Mark Taylor
* @since 25 Feb 2021
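 *
 * <p>Illustrative usage sketch; <code>inFile</code> stands for a
 * caller-supplied <code>org.apache.parquet.io.InputFile</code>:
 * <pre>{@code
 *    IOSupplier<ParquetFileReader> pfrSupplier =
 *        () -> ParquetFileReader.open( inFile );
 *    SequentialParquetStarTable table =
 *        new SequentialParquetStarTable( pfrSupplier );
 *    RowSequence rseq = table.getRowSequence();
 *    try {
 *        while ( rseq.next() ) {
 *            Object[] row = rseq.getRow();
 *        }
 *    }
 *    finally {
 *        rseq.close();
 *    }
 * }</pre>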
*/
public class SequentialParquetStarTable extends ParquetStarTable {
/**
* Constructor.
*
* @param pfrSupplier access to parquet data file
*/
public SequentialParquetStarTable( IOSupplier<ParquetFileReader>
pfrSupplier )
throws IOException {
super( pfrSupplier );
}
public boolean isRandom() {
return false;
}
public RowSequence getRowSequence() throws IOException {
return new ParquetRowSplittable();
}
public RowSplittable getRowSplittable() throws IOException {
return new ParquetRowSplittable();
}
/**
* Returns an array giving the number of rows in each row group
* of a parquet file.
*
* @param pfr file reader
* @return array giving row counts per row group
*/
private static long[] getBlockSizes( ParquetFileReader pfr ) {
return pfr.getRowGroups().stream()
.mapToLong( blk -> blk.getRowCount() )
.toArray();
}
/**
* RowSplittable implementation for ParquetStarTable.
* Since RowSplittable extends RowSequence, instances serve for
* both sequential and splittable row access.
*/
private class ParquetRowSplittable implements RowSplittable {
final ParquetFileReader pfr_;
final int ncol_;
final MessageType schema_;
final long[] irows_;        // row index each column reader has reached
final long[] blockSizes_;   // row count for each row group
int iblock_;                // index of current row group
int iblockEnd_;             // index after last row group to process
ColAccess<?>[] colAccesses_; // per-column readers for current row group
long irow_;                 // index of current row in whole table
long irGroupEnd_;           // index after last row of current row group
/**
* Constructs a splittable over the whole table.
*/
public ParquetRowSplittable() throws IOException {
this( -1, -1 );
}
/**
* Constructor for internal use, constructs a sub-splittable.
*
* @param iblock index before first block to be processed
* @param iblockEnd index after last block to be processed,
* or -1 for last block in table
*/
private ParquetRowSplittable( int iblock, int iblockEnd )
throws IOException {
pfr_ = getParquetFileReader();
ncol_ = getColumnCount();
schema_ = getSchema();
blockSizes_ = getBlockSizes( pfr_ );
irows_ = new long[ ncol_ ];
iblock_ = -1;
irow_ = -1;
iblockEnd_ = iblockEnd >= 0 ? iblockEnd : blockSizes_.length;
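/* Advance past any row groups before the requested start block,
 * updating row and block counters without reading any data. */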
skipBlocks( iblock + 1 );
}
public LongSupplier rowIndex() {
return () -> irow_;
}
public ParquetRowSplittable split() {
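/* Splitting is only possible before any data has been read from
 * this splittable (colAccesses_ still null) and while at least two
 * whole row groups remain to be processed.  In that case bisect:
 * hand the first half of the remaining groups to a new splittable
 * and skip this one forward to the midpoint. */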
if ( colAccesses_ == null && iblockEnd_ - iblock_ > 2 ) {
int mid = ( 1 + iblock_ + iblockEnd_ ) / 2;
ParquetRowSplittable split;
try {
split = new ParquetRowSplittable( iblock_, mid );
skipBlocks( mid - 1 - iblock_ );
}
catch ( IOException e ) {
return null;
}
return split;
}
else {
return null;
}
}
public long splittableSize() {
long nr = 0;
for ( int ib = iblock_ + 1; ib < iblockEnd_; ib++ ) {
nr += blockSizes_[ ib ];
}
return nr;
}
public boolean next() throws IOException {
assert irow_ < irGroupEnd_;
if ( irow_ + 1 == irGroupEnd_ ) {
assert iblock_ < iblockEnd_;
if ( iblock_ + 1 == iblockEnd_ ) {
return false;
}
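/* Crossing a row group boundary: read in the next group and
 * reset each column's position to the start of the group. */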
nextReadStore();
Arrays.fill( irows_, irow_ );
}
irow_++;
return true;
}
public Object getCell( int icol ) {
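/* colAccesses_ is null until the first row group has been read,
 * so an NPE here means next() has not yet been called. */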
ColAccess<?> colAccess;
try {
colAccess = colAccesses_[ icol ];
}
catch ( NullPointerException e ) {
throw new IllegalStateException( "next() not called" );
}
/* Make sure that we have read or skipped to the current
* position before reading the value of the cell.
* By doing it on demand here rather than during the next call,
* we can avoid reading any data at all for those columns that
* are never read in a given row group. */
long nadv = irow_ - irows_[ icol ];
if ( nadv > 0 ) {
if ( nadv > 1 ) {
colAccess.skip( nadv - 1 );
}
colAccess.clear();
}
irows_[ icol ] = irow_;
return colAccess.read();
}
public Object[] getRow() {
Object[] row = new Object[ ncol_ ];
for ( int ic = 0; ic < ncol_; ic++ ) {
row[ ic ] = getCell( ic );
}
return row;
}
public void close() throws IOException {
pfr_.close();
}
/**
 * Reads the next row group and prepares column handlers
 * ready to supply its content.
 *
 * @return true if a row group was read, false if no row groups remain
 */
private boolean nextReadStore() throws IOException {
PageReadStore pageStore = pfr_.readNextRowGroup();
if ( pageStore == null ) {
return false;
}
iblock_++;
assert pageStore.getRowCount() == blockSizes_[ iblock_ ];
irGroupEnd_ += pageStore.getRowCount();
ColumnReadStore crstore = getColumnReadStore( pageStore, schema_ );
colAccesses_ = new ColAccess<?>[ ncol_ ];
for ( int ic = 0; ic < ncol_; ic++ ) {
colAccesses_[ ic ] =
createColAccess( crstore, getInputColumn( ic ) );
}
return true;
}
/**
* Skips over a given number of row blocks from the current position
* without reading the data.
*
* @param nb number of blocks to skip
*/
private void skipBlocks( int nb ) throws IOException {
for ( int ib = 0; ib < nb; ib++ ) {
if ( ! pfr_.skipNextRowGroup() ) {
throw new IOException( "Failed to skip row group" );
}
iblock_++;
long nr = blockSizes_[ iblock_ ];
irow_ += nr;
irGroupEnd_ += nr;
}
}
}
/**
* Creates a ColAccess for accessing a given column.
*/
private static <T> ColAccess<T>
createColAccess( ColumnReadStore crstore,
final InputColumn<T> incol ) {
/* This implementation was written with reference to
* org.apache.parquet.tools.command.DumpCommand.
* There is a certain amount of guesswork involved in the use
* of the generally under-documented parquet-mr data access API. */
/* Prepare to read. */
ColumnDescriptor cdesc = incol.getColumnDescriptor();
final int cdefmax = cdesc.getMaxDefinitionLevel();
final Decoder<T> decoder = incol.createDecoder();
final ColumnReader crdr = crstore.getColumnReader( cdesc );
/* The readValue method is cleaner and a bit faster, so use that
* where possible, but we don't have compile-time guarantees that
* it's available, so fall back to advancing by hand if required. */
final Runnable readValue = crdr instanceof ColumnReaderImpl
? ((ColumnReaderImpl) crdr)::readValue
: () -> {
decoder.clearValue();
decoder.readItem( crdr );
};
return new ColAccess<T>() {
private boolean hasValue_;
private T value_;
public void clear() {
hasValue_ = false;
}
public void skip( long n ) {
for ( long i = 0; i < n; i++ ) {
/* From the scant parquet-mr API documentation I would have
* thought that just a consume call here would be the
* right thing to do, but it seems you need to read
* the value as well. */
do {
if ( crdr.getCurrentDefinitionLevel() == cdefmax ) {
readValue.run();
}
crdr.consume();
} while ( crdr.getCurrentRepetitionLevel() > 0 );
}
}
public T read() {
if ( ! hasValue_ ) {
decoder.clearValue();
do {
if ( crdr.getCurrentDefinitionLevel() == cdefmax ) {
decoder.readItem( crdr );
}
// I thought that I ought to be passing a null along
// in this case. However, it does the wrong thing
// at least for some arrays. Hmm.
// else {
// decoder.readNull();
// }
crdr.consume();
} while ( crdr.getCurrentRepetitionLevel() > 0 );
value_ = decoder.getValue();
hasValue_ = true;
}
return value_;
}
};
}
/**
* Manages reading, caching and skipping values from a column.
*/
private static interface ColAccess<T> {
/**
* Indicates that the most recently-read value will no longer be
* required.
*/
void clear();
/**
* Skips over a given number of column entries.
*
* @param n number of entries to skip.
*/
void skip( long n );
/**
* Returns the current value, reading it from the column data
* if it has not already been read.
*
* @return entry value
*/
T read();
}
}