All downloads are free. The search and download functionality uses the official Maven repository.

uk.ac.starlink.parquet.InputColumns Maven / Gradle / Ivy

There is a newer version: 4.3
Show newest version
package uk.ac.starlink.parquet;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Supplier;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import uk.ac.starlink.util.ByteList;
import uk.ac.starlink.util.DoubleList;
import uk.ac.starlink.util.FloatList;
import uk.ac.starlink.util.IntList;
import uk.ac.starlink.util.LongList;
import uk.ac.starlink.util.PrimitiveList;
import uk.ac.starlink.util.ShortList;

/**
 * Provides InputColumn instances that know how to read parquet column data.
 *
 * <p>Scalar columns are read into wrapper objects
 * (Boolean, Byte, Short, Integer, Long, Float, Double, String),
 * and array-valued columns into primitive arrays or String arrays.
 * Column types for which no reader is defined yield null
 * from the factory methods.
 *
 * @author   Mark Taylor
 * @since    24 Feb 2021
 */
public class InputColumns {

    /** Value stored in a byte[] array in place of a null element. */
    public static final byte BAD_BYTE = Byte.MIN_VALUE;

    /** Value stored in a short[] array in place of a null element. */
    public static final short BAD_SHORT = Short.MIN_VALUE;

    /** Value stored in an int[] array in place of a null element. */
    public static final int BAD_INT = Integer.MIN_VALUE;

    /** Value stored in a long[] array in place of a null element. */
    public static final long BAD_LONG = Long.MIN_VALUE;

    /**
     * Private constructor prevents instantiation.
     */
    private InputColumns() {
    }

    /**
     * Returns an InputColumn for reading a given column from a parquet file.
     *
     * <p>The nullability of the returned column is determined from the
     * repetition of the top-level field <code>path[0]</code>:
     * the column is reported nullable unless that field is REQUIRED.
     *
     * @param  schema  table schema
     * @param  path   table column identifier
     * @return   input column reader, or null if the column type
     *           is not supported
     */
    public static InputColumn<?> createInputColumn( MessageType schema,
                                                    String[] path ) {
        final Col<?> col = createCol( schema, path );
        if ( col == null ) {
            return null;
        }
        else {
            boolean isNullable = ! schema.getType( path[ 0 ] )
                                  .isRepetition( Type.Repetition.REQUIRED );
            ColumnDescriptor cdesc = schema.getColumnDescription( path );
            return createInputColumn( col, cdesc, isNullable );
        }
    }

    /**
     * Packages internally-generated column handling components
     * into an InputColumn for external use.
     *
     * @param  col  basic column reading object
     * @param  cdesc   column descriptor
     * @param  isNullable   false if column is known to contain no nulls
     * @return   input column backed by the supplied components
     */
    private static <T> InputColumn<T>
            createInputColumn( final Col<T> col,
                               final ColumnDescriptor cdesc,
                               final boolean isNullable ) {
        final Class<T> clazz = col.getContentClass();
        return new InputColumn<T>() {
            public Class<T> getContentClass() {
                return clazz;
            }
            public Decoder<T> createDecoder() {
                return col.createDecoder();
            }
            public ColumnDescriptor getColumnDescriptor() {
                return cdesc;
            }
            public boolean isNullable() {
                return isNullable;
            }
        };
    }

    /**
     * Creates a column reading object for a column in a table.
     * A scalar interpretation of the column is tried first,
     * then an array interpretation.
     *
     * @param  schema  table schema
     * @param  path   table column identifier
     * @return   column reader, or null if the column is neither
     *           a supported scalar nor a supported array
     */
    private static Col<?> createCol( MessageType schema, String[] path ) {
        PrimitiveType scalarType = getScalarType( schema, path );
        if ( scalarType != null ) {
            return createScalarCol( scalarType );
        }
        PrimitiveType elType = getArrayElementType( schema, path );
        if ( elType != null ) {
            return createArrayCol( elType );
        }
        return null;
    }

    /**
     * Returns the primitive type associated with a scalar column, or null.
     * A column counts as scalar here only if its path has a single element
     * naming a primitive, non-REPEATED field.
     *
     * @param  schema  table schema
     * @param  path   table column identifier
     * @return  primitive scalar type of column,
     *          or null if it's not a scalar column of supported type
     */
    private static PrimitiveType getScalarType( MessageType schema,
                                                String[] path ) {
        if ( path.length != 1 ) {
            return null;
        }
        Type t = schema.getType( path );
        return t.isPrimitive() && ! t.isRepetition( Type.Repetition.REPEATED )
             ? t.asPrimitiveType()
             : null;
    }

    /**
     * Returns the primitive type associated with the elements of
     * an array-valued column, or null.
     *
     * <p>Two layouts are recognised: a three-element path
     * (group / repeated group / primitive), and a single-element path
     * naming a REPEATED primitive field.
     *
     * @param  schema  table schema
     * @param  path   table column identifier
     * @return   primitive type of column array elements,
     *           or null if it's not an array column of supported type
     */
    private static PrimitiveType getArrayElementType( MessageType schema,
                                                      String[] path ) {

        /* This may not be the only way to do array-valued columns,
         * and it doesn't look like the most obvious way to me,
         * but I've seen it in example parquet files (from pandas?). */
        if ( path.length == 3 ) {
            Type t0 = schema.getType( path[ 0 ] );
            Type t1 = schema.getType( path[ 0 ], path[ 1 ] );
            Type t2 = schema.getType( path );
            if ( ! t0.isPrimitive() &&
                 ! t1.isPrimitive() &&
                 t1.isRepetition( Type.Repetition.REPEATED ) &&
                 t2.isPrimitive() ) {
                return t2.asPrimitiveType();
            }
            else {
                return null;
            }
        }

        /* This looks more obvious, but I haven't so far seen examples. */
        else if ( path.length == 1 ) {
            Type t = schema.getType( path );
            if ( t.isPrimitive() &&
                 t.isRepetition( Type.Repetition.REPEATED ) ) {
                return t.asPrimitiveType();
            }
            else {
                return null;
            }
        }

        /* There are probably other ways to do it, but don't try to
         * enumerate them without some evidence. */
        else {
            return null;
        }
    }

    /**
     * Java storage type used for the values of an INT32 parquet column.
     */
    private enum Int32Storage {

        /** Signed 8-bit: fits a byte. */
        BYTE,

        /** Signed 16-bit or unsigned 8-bit: fits a short. */
        SHORT,

        /** Signed 32-bit or unsigned 16-bit: fits an int. */
        INT,

        /** Stored int must be reinterpreted as unsigned: needs a long. */
        UNSIGNED_LONG,

        /** No suitable storage. */
        UNSUPPORTED;
    }

    /**
     * Decides how the values of an INT32 column should be stored,
     * given its logical type annotation.
     * Both the scalar and array readers use this single classification
     * so that they cannot disagree about the type chosen for
     * a given width and signedness.
     *
     * @param  logType  logical type annotation of the column, may be null;
     *                  anything other than an integer annotation is
     *                  treated as signed 32-bit
     * @return  storage type, not null
     */
    private static Int32Storage
            getInt32Storage( LogicalTypeAnnotation logType ) {
        final int nbit;
        final boolean isSigned;
        if ( logType instanceof LogicalTypeAnnotation
                               .IntLogicalTypeAnnotation ) {
            LogicalTypeAnnotation.IntLogicalTypeAnnotation intType =
                (LogicalTypeAnnotation.IntLogicalTypeAnnotation) logType;
            nbit = intType.getBitWidth();
            isSigned = intType.isSigned();
        }
        else {
            nbit = 32;
            isSigned = true;
        }
        if ( nbit == 8 && isSigned ) {
            return Int32Storage.BYTE;
        }
        else if ( nbit == 16 && isSigned ||
                  nbit == 8 && ! isSigned ) {
            return Int32Storage.SHORT;
        }
        else if ( nbit == 32 && isSigned ||
                  nbit == 16 && ! isSigned ) {
            return Int32Storage.INT;
        }
        else if ( nbit == 32 || ! isSigned ) {
            return Int32Storage.UNSIGNED_LONG;
        }
        else {
            return Int32Storage.UNSUPPORTED;
        }
    }

    /**
     * Returns a column reader for scalar data.
     *
     * @param  ptype   primitive type of scalar column
     * @return   column reader, or null if type not supported
     */
    private static Col<?> createScalarCol( PrimitiveType ptype ) {
        LogicalTypeAnnotation logType = ptype.getLogicalTypeAnnotation();
        PrimitiveType.PrimitiveTypeName ptName = ptype.getPrimitiveTypeName();

        /* See
         * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
         */
        switch ( ptName ) {
            case BOOLEAN:
                return new ScalarCol<Boolean>(
                           Boolean.class,
                           rdr -> Boolean.valueOf( rdr.getBoolean() ) );
            case INT32:
                switch ( getInt32Storage( logType ) ) {
                    case BYTE:
                        return new ScalarCol<Byte>(
                                   Byte.class,
                                   rdr -> Byte.valueOf( (byte)
                                                        rdr.getInteger() ) );
                    case SHORT:
                        return new ScalarCol<Short>(
                                   Short.class,
                                   rdr -> Short.valueOf( (short)
                                                         rdr.getInteger() ) );
                    case INT:
                        return new ScalarCol<Integer>(
                                   Integer.class,
                                   rdr -> Integer.valueOf( rdr.getInteger() ) );
                    case UNSIGNED_LONG:
                        return new ScalarCol<Long>(
                                   Long.class,
                                   rdr -> Long.valueOf(
                                              Integer.toUnsignedLong(
                                                  rdr.getInteger() ) ) );
                    default:
                        return null;
                }
            case INT64:
                return new ScalarCol<Long>(
                           Long.class,
                           rdr -> Long.valueOf( rdr.getLong() ) );
            case FLOAT:
                return new ScalarCol<Float>(
                           Float.class,
                           rdr -> Float.valueOf( rdr.getFloat() ) );
            case DOUBLE:
                return new ScalarCol<Double>(
                           Double.class,
                           rdr -> Double.valueOf( rdr.getDouble() ) );
            case BINARY:

                /* Only binary data annotated as a string is handled. */
                if ( logType instanceof LogicalTypeAnnotation
                                       .StringLogicalTypeAnnotation ) {
                    return new ScalarCol<String>(
                               String.class,
                               rdr -> rdr.getBinary().toStringUsingUTF8() );
                }
                else {
                    return null;
                }
            case FIXED_LEN_BYTE_ARRAY:
                // to-do: timestamps etc.
            case INT96:
            default:
                return null;
        }
    }

    /**
     * Returns a column reader for array data.
     * Null elements of integer arrays are represented by the
     * <code>BAD_*</code> constants of this class,
     * and null elements of floating point arrays by NaN.
     *
     * @param  elType   primitive type of array column elements
     * @return   column reader, or null if type not supported
     */
    private static Col<?> createArrayCol( PrimitiveType elType ) {
        LogicalTypeAnnotation logType = elType.getLogicalTypeAnnotation();
        PrimitiveType.PrimitiveTypeName ptName = elType.getPrimitiveTypeName();
        switch ( ptName ) {
            case INT32:
                switch ( getInt32Storage( logType ) ) {
                    case BYTE:
                        return new PrimitiveArrayCol<byte[],ByteList>(
                                   byte[].class, ByteList::new,
                                   (rdr, list) -> list.add( (byte)
                                                            rdr.getInteger() ),
                                   list -> list.add( BAD_BYTE ) );
                    case SHORT:
                        return new PrimitiveArrayCol<short[],ShortList>(
                                   short[].class, ShortList::new,
                                   (rdr, list) -> list.add( (short)
                                                            rdr.getInteger() ),
                                   list -> list.add( BAD_SHORT ) );
                    case INT:
                        return new PrimitiveArrayCol<int[],IntList>(
                                   int[].class, IntList::new,
                                   (rdr, list) -> list.add( rdr.getInteger() ),
                                   list -> list.add( BAD_INT ) );
                    case UNSIGNED_LONG:
                        return new PrimitiveArrayCol<long[],LongList>(
                                   long[].class, LongList::new,
                                   (rdr, list) ->
                                       list.add( Integer.toUnsignedLong(
                                                     rdr.getInteger() ) ),
                                   list -> list.add( BAD_LONG ) );
                    default:
                        return null;
                }
            case INT64:
                return new PrimitiveArrayCol<long[],LongList>(
                           long[].class, LongList::new,
                           (rdr, list) -> list.add( rdr.getLong() ),
                           list -> list.add( BAD_LONG ) );
            case FLOAT:
                return new PrimitiveArrayCol<float[],FloatList>(
                           float[].class, FloatList::new,
                           (rdr, list) -> list.add( rdr.getFloat() ),
                           list -> list.add( Float.NaN ) );
            case DOUBLE:
                return new PrimitiveArrayCol<double[],DoubleList>(
                           double[].class, DoubleList::new,
                           (rdr, list) -> list.add( rdr.getDouble() ),
                           list -> list.add( Double.NaN ) );
            case BINARY:

                /* Only binary data annotated as a string is handled. */
                if ( logType instanceof LogicalTypeAnnotation
                                       .StringLogicalTypeAnnotation ) {
                    return createStringArrayCol();
                }
                else {
                    return null;
                }
            case BOOLEAN:
                return createBooleanArrayCol();
            case FIXED_LEN_BYTE_ARRAY:
            case INT96:
            default:
                return null;
        }
    }

    /**
     * Returns a reader for boolean-array-valued columns.
     * Null elements are read as false, and a row with no elements
     * yields a zero-length array.
     *
     * @return   boolean array reader
     */
    private static Col<boolean[]> createBooleanArrayCol() {
        final boolean[] array0 = new boolean[ 0 ];
        return new Col<boolean[]>() {
            @Override
            public Class<boolean[]> getContentClass() {
                return boolean[].class;
            }
            @Override
            public Decoder<boolean[]> createDecoder() {
                return new Decoder<boolean[]>() {
                    final BitSet bits_ = new BitSet();
                    boolean[] value_;
                    int n_;
                    public Class<boolean[]> getContentClass() {
                        return boolean[].class;
                    }
                    public void clearValue() {

                        /* The bit set itself need not be cleared:
                         * every index below n_ is explicitly written
                         * by readItem or readNull before it is read. */
                        value_ = null;
                        n_ = 0;
                    }
                    public void readItem( ColumnReader crdr ) {
                        bits_.set( n_++, crdr.getBoolean() );
                    }
                    public void readNull() {
                        bits_.clear( n_++ );
                    }
                    public boolean[] getValue() {
                        if ( value_ == null ) {
                            if ( n_ == 0 ) {
                                value_ = array0;
                            }
                            else {
                                value_ = new boolean[ n_ ];
                                for ( int i = 0; i < n_; i++ ) {
                                    value_[ i ] = bits_.get( i );
                                }
                            }
                        }
                        return value_;
                    }
                };
            }
        };
    }

    /**
     * Returns a reader for String-array-valued columns.
     * Null elements are kept as null array entries, and a row with
     * no elements yields a zero-length array.
     *
     * @return  string array reader
     */
    private static Col<String[]> createStringArrayCol() {
        final String[] array0 = new String[ 0 ];
        return new Col<String[]>() {
            @Override
            public Class<String[]> getContentClass() {
                return String[].class;
            }
            @Override
            public Decoder<String[]> createDecoder() {
                return new Decoder<String[]>() {
                    final List<String> list_ = new ArrayList<String>();
                    String[] value_;
                    public Class<String[]> getContentClass() {
                        return String[].class;
                    }
                    public void clearValue() {
                        value_ = null;
                        list_.clear();
                    }
                    public void readItem( ColumnReader crdr ) {
                        list_.add( crdr.getBinary().toStringUsingUTF8() );
                    }
                    public void readNull() {
                        list_.add( null );
                    }
                    public String[] getValue() {
                        if ( value_ == null ) {
                            int n = list_.size();
                            value_ = n == 0 ? array0
                                            : list_.toArray( new String[ n ] );
                        }
                        return value_;
                    }
                };
            }
        };
    }

    /**
     * Factory for decoders.
     *
     * @param  <T>  type of value produced by the decoders
     */
    private static interface Col<T> {

        /**
         * Returns the content class of produced decoders.
         *
         * @return   decoder output class
         */
        Class<T> getContentClass();

        /**
         * Creates a decoder.
         *
         * @return   new decoder
         */
        Decoder<T> createDecoder();
    }

    /**
     * Col implementation for scalar valued columns.
     *
     * @param  <T>  scalar value type
     */
    private static class ScalarCol<T> implements Col<T> {
        final Class<T> clazz_;
        final Function<ColumnReader,T> readFunc_;

        /**
         * Constructor.
         *
         * @param   clazz  content class
         * @param   readFunc   takes a typed primitive value from a ColumnReader
         */
        ScalarCol( Class<T> clazz, Function<ColumnReader,T> readFunc ) {
            clazz_ = clazz;
            readFunc_ = readFunc;
        }

        @Override
        public Class<T> getContentClass() {
            return clazz_;
        }

        @Override
        public Decoder<T> createDecoder() {
            return new Decoder<T>() {
                T value_;
                public Class<T> getContentClass() {
                    return clazz_;
                }
                public void clearValue() {
                    value_ = null;
                }
                public void readItem( ColumnReader crdr ) {
                    value_ = readFunc_.apply( crdr );
                }
                public void readNull() {
                    assert value_ == null; // clearValue should have been called
                    value_ = null;
                }
                public T getValue() {
                    return value_;
                }
            };
        }
    }

    /**
     * Col implementation for array valued columns.
     * Elements are accumulated in a primitive list, which is converted
     * to an array when the value is requested.
     * A row for which no elements were read yields a null value,
     * not a zero-length array.
     *
     * @param  <T>  array type of the column values
     * @param  <L>  primitive list type used to accumulate elements
     */
    private static class PrimitiveArrayCol<T,L extends PrimitiveList>
            implements Col<T> {

        final Class<T> clazz_;
        final Supplier<L> listSupplier_;
        final BiConsumer<ColumnReader,L> append_;
        final Consumer<L> appendNull_;

        /**
         * Constructor.
         *
         * @param  clazz  array value class of column
         * @param  listSupplier  supplier of suitable primitive list instance
         * @param  append  copies a value from a column reader to a list
         * @param  appendNull  adds a null value to a list
         */
        PrimitiveArrayCol( Class<T> clazz, Supplier<L> listSupplier,
                           BiConsumer<ColumnReader,L> append,
                           Consumer<L> appendNull ) {
            clazz_ = clazz;
            listSupplier_ = listSupplier;
            append_ = append;
            appendNull_ = appendNull;
        }

        @Override
        public Class<T> getContentClass() {
            return clazz_;
        }

        @Override
        public Decoder<T> createDecoder() {
            return new Decoder<T>() {
                final L plist_ = listSupplier_.get();

                /* A separate flag is needed to mark the cached value
                 * as valid, since null is itself a legitimate value. */
                boolean hasValue_;
                T value_;
                public Class<T> getContentClass() {
                    return clazz_;
                }
                public void clearValue() {
                    hasValue_ = false;
                    plist_.clear();
                }
                public void readItem( ColumnReader crdr ) {
                    append_.accept( crdr, plist_ );
                }
                public void readNull() {
                    appendNull_.accept( plist_ );
                }
                public T getValue() {
                    if ( ! hasValue_ ) {
                        hasValue_ = true;
                        value_ = plist_.size() == 0
                               ? null
                               : clazz_.cast( plist_.toArray() );
                    }
                    return value_;
                }
            };
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy