All downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.iceberg.parquet.ParquetAvroValueReaders Maven / Gradle / Ivy

There is a newer version: 1.7.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.parquet;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData.Fixed;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.avro.AvroSchemaUtil;
import org.apache.iceberg.parquet.ParquetValueReaders.BytesReader;
import org.apache.iceberg.parquet.ParquetValueReaders.FloatAsDoubleReader;
import org.apache.iceberg.parquet.ParquetValueReaders.IntAsLongReader;
import org.apache.iceberg.parquet.ParquetValueReaders.IntegerAsDecimalReader;
import org.apache.iceberg.parquet.ParquetValueReaders.ListReader;
import org.apache.iceberg.parquet.ParquetValueReaders.LongAsDecimalReader;
import org.apache.iceberg.parquet.ParquetValueReaders.MapReader;
import org.apache.iceberg.parquet.ParquetValueReaders.StructReader;
import org.apache.iceberg.parquet.ParquetValueReaders.UnboxedReader;
import org.apache.iceberg.types.Type.TypeID;
import org.apache.iceberg.types.Types;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.DecimalMetadata;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;

public class ParquetAvroValueReaders {
  // Private constructor: this class only exposes static members and nested classes,
  // so there is nothing to instantiate.
  private ParquetAvroValueReaders() {
  }

  /**
   * Builds a value reader that produces Avro generic {@link Record}s from a Parquet file.
   *
   * @param expectedSchema the Iceberg schema describing the records to produce
   * @param fileSchema the Parquet schema of the file being read
   * @return a reader for whole records, with fields laid out as in {@code expectedSchema}
   */
  @SuppressWarnings("unchecked")
  public static ParquetValueReader<Record> buildReader(org.apache.iceberg.Schema expectedSchema,
                                                       MessageType fileSchema) {
    // The root of the visit is produced by ReadBuilder.message, which delegates to
    // ReadBuilder.struct and therefore yields a RecordReader; narrowing to Record is safe.
    return (ParquetValueReader<Record>)
        TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema,
            new ReadBuilder(expectedSchema, fileSchema));
  }

  private static class ReadBuilder extends TypeWithSchemaVisitor> {
    private final org.apache.iceberg.Schema schema;
    private final Map avroSchemas;
    private final MessageType type;

    ReadBuilder(org.apache.iceberg.Schema schema, MessageType type) {
      this.schema = schema;
      this.avroSchemas = AvroSchemaUtil.convertTypes(schema.asStruct(), type.getName());
      this.type = type;
    }

    @Override
    public ParquetValueReader message(Types.StructType expected, MessageType message,
                                         List> fieldReaders) {
      return struct(expected, message.asGroupType(), fieldReaders);
    }

    @Override
    public ParquetValueReader struct(Types.StructType expected, GroupType struct,
                                        List> fieldReaders) {
      Schema avroSchema = avroSchemas.get(expected);

      // match the expected struct's order
      Map> readersById = Maps.newHashMap();
      Map typesById = Maps.newHashMap();
      List fields = struct.getFields();
      for (int i = 0; i < fields.size(); i += 1) {
        Type fieldType = fields.get(i);
        int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1;
        int id = fieldType.getId().intValue();
        readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i)));
        typesById.put(id, fieldType);
      }

      List expectedFields = expected != null ?
          expected.fields() : ImmutableList.of();
      List> reorderedFields = Lists.newArrayListWithExpectedSize(
          expectedFields.size());
      List types = Lists.newArrayListWithExpectedSize(expectedFields.size());
      for (Types.NestedField field : expectedFields) {
        int id = field.fieldId();
        ParquetValueReader reader = readersById.get(id);
        if (reader != null) {
          reorderedFields.add(reader);
          types.add(typesById.get(id));
        } else {
          reorderedFields.add(ParquetValueReaders.nulls());
          types.add(null);
        }
      }

      return new RecordReader(types, reorderedFields, avroSchema);
    }

    @Override
    public ParquetValueReader list(Types.ListType expectedList, GroupType array,
                                      ParquetValueReader elementReader) {
      GroupType repeated = array.getFields().get(0).asGroupType();
      String[] repeatedPath = currentPath();

      int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1;
      int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1;

      Type elementType = repeated.getType(0);
      int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1;

      return new ListReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader));
    }

    @Override
    public ParquetValueReader map(Types.MapType expectedMap, GroupType map,
                                     ParquetValueReader keyReader,
                                     ParquetValueReader valueReader) {
      GroupType repeatedKeyValue = map.getFields().get(0).asGroupType();
      String[] repeatedPath = currentPath();

      int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1;
      int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1;

      Type keyType = repeatedKeyValue.getType(0);
      int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1;
      Type valueType = repeatedKeyValue.getType(1);
      int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1;

      return new MapReader<>(repeatedD, repeatedR,
          ParquetValueReaders.option(keyType, keyD, keyReader),
          ParquetValueReaders.option(valueType, valueD, valueReader));
    }

    @Override
    public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected,
                                           PrimitiveType primitive) {
      ColumnDescriptor desc = type.getColumnDescription(currentPath());

      boolean isMapKey = fieldNames.contains("key");

      if (primitive.getOriginalType() != null) {
        switch (primitive.getOriginalType()) {
          case ENUM:
          case JSON:
          case UTF8:
            if (isMapKey) {
              return new StringReader(desc);
            }
            return new Utf8Reader(desc);
          case DATE:
          case INT_8:
          case INT_16:
          case INT_32:
          case INT_64:
          case TIME_MICROS:
          case TIMESTAMP_MICROS:
            return new UnboxedReader<>(desc);
          case TIME_MILLIS:
            return new TimeMillisReader(desc);
          case TIMESTAMP_MILLIS:
            return new TimestampMillisReader(desc);
          case DECIMAL:
            DecimalMetadata decimal = primitive.getDecimalMetadata();
            switch (primitive.getPrimitiveTypeName()) {
              case BINARY:
              case FIXED_LEN_BYTE_ARRAY:
                return new DecimalReader(desc, decimal.getScale());
              case INT64:
                return new IntegerAsDecimalReader(desc, decimal.getScale());
              case INT32:
                return new LongAsDecimalReader(desc, decimal.getScale());
              default:
                throw new UnsupportedOperationException(
                    "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName());
            }
          case BSON:
            return new BytesReader(desc);
          default:
            throw new UnsupportedOperationException(
                "Unsupported logical type: " + primitive.getOriginalType());
        }
      }

      switch (primitive.getPrimitiveTypeName()) {
        case FIXED_LEN_BYTE_ARRAY:
          int fieldId = primitive.getId().intValue();
          Schema avroSchema = AvroSchemaUtil.convert(schema.findType(fieldId));
          return new FixedReader(desc, avroSchema);
        case BINARY:
          return new BytesReader(desc);
        case INT32:
          if (expected != null && expected.typeId() == TypeID.LONG) {
            return new IntAsLongReader(desc);
          } else {
            return new UnboxedReader<>(desc);
          }
        case FLOAT:
          if (expected != null && expected.typeId() == TypeID.DOUBLE) {
            return new FloatAsDoubleReader(desc);
          } else {
            return new UnboxedReader<>(desc);
          }
        case BOOLEAN:
        case INT64:
        case DOUBLE:
          return new UnboxedReader<>(desc);
        default:
          throw new UnsupportedOperationException("Unsupported type: " + primitive);
      }
    }
  }

  /**
   * Reads a binary column as {@link BigDecimal}, interpreting the bytes as a two's-complement
   * big-endian unscaled value (the {@link BigInteger#BigInteger(byte[])} encoding).
   */
  static class DecimalReader extends ParquetValueReaders.PrimitiveReader<BigDecimal> {
    private final int scale;

    /**
     * @param desc descriptor of the column to read
     * @param scale scale applied to every unscaled value read from the column
     */
    DecimalReader(ColumnDescriptor desc, int scale) {
      super(desc);
      this.scale = scale;
    }

    @Override
    public BigDecimal read(BigDecimal ignored) {
      BigInteger unscaled = new BigInteger(column.nextBinary().getBytesUnsafe());
      return new BigDecimal(unscaled, scale);
    }
  }

  /** Reads a binary column as {@link String}, decoding the bytes as UTF-8. */
  static class StringReader extends ParquetValueReaders.PrimitiveReader<String> {
    StringReader(ColumnDescriptor desc) {
      super(desc);
    }

    @Override
    public String read(String ignored) {
      // strings are immutable, so the reuse argument cannot be used
      return column.nextBinary().toStringUsingUTF8();
    }
  }

  /** Reads a binary column into Avro {@link Utf8} values, reusing the passed instance if any. */
  static class Utf8Reader extends ParquetValueReaders.PrimitiveReader<Utf8> {
    Utf8Reader(ColumnDescriptor desc) {
      super(desc);
    }

    /**
     * Reads the next value.
     *
     * @param reuse an instance to overwrite and return, or null to allocate a new one
     * @return {@code reuse} when it is non-null, otherwise a new {@link Utf8}
     */
    @Override
    public Utf8 read(Utf8 reuse) {
      Utf8 utf8 = reuse != null ? reuse : new Utf8();

      // use a byte buffer because it never results in a copy
      ByteBuffer buffer = column.nextBinary().toByteBuffer();
      int length = buffer.remaining();

      // always copy the bytes into the Utf8. for constant binary data backed by an array starting
      // at 0, it is possible to wrap the bytes in a Utf8, but reusing that Utf8 could corrupt the
      // constant binary if its backing buffer is copied to.
      utf8.setByteLength(length);
      buffer.get(utf8.getBytes(), 0, length);

      return utf8;
    }
  }

  /**
   * Reads a binary column as {@link UUID}: the first eight bytes are the most significant bits
   * and the next eight the least significant bits, both big-endian.
   */
  static class UUIDReader extends ParquetValueReaders.PrimitiveReader<UUID> {
    UUIDReader(ColumnDescriptor desc) {
      super(desc);
    }

    @Override
    public UUID read(UUID ignored) {
      ByteBuffer buffer = column.nextBinary().toByteBuffer();
      buffer.order(ByteOrder.BIG_ENDIAN);

      // the two halves must be read in this order; each getLong advances the buffer
      long mostSigBits = buffer.getLong();
      long leastSigBits = buffer.getLong();

      return new UUID(mostSigBits, leastSigBits);
    }
  }

  /** Reads a binary column into Avro {@link Fixed} values, reusing the passed instance if any. */
  static class FixedReader extends ParquetValueReaders.PrimitiveReader<Fixed> {
    private final Schema schema;

    /**
     * @param desc descriptor of the column to read
     * @param schema Avro schema used when a new {@link Fixed} has to be allocated
     */
    FixedReader(ColumnDescriptor desc, Schema schema) {
      super(desc);
      this.schema = schema;
    }

    /**
     * Reads the next value.
     *
     * @param reuse an instance to overwrite and return, or null to allocate a new one
     * @return {@code reuse} when it is non-null, otherwise a new {@link Fixed}
     */
    @Override
    public Fixed read(Fixed reuse) {
      Fixed fixed = reuse != null ? reuse : new Fixed(schema);

      // copy the column bytes into the fixed's own array
      column.nextBinary().toByteBuffer().get(fixed.bytes());

      return fixed;
    }
  }

  /**
   * Reads Parquet {@code TIME_MILLIS} values and scales them by 1000 (milliseconds to
   * microseconds).
   *
   * <p>Parquet's {@code TIME_MILLIS} annotates a 32-bit integer column, so the raw value is read
   * as an int and widened to long before scaling.
   */
  public static class TimeMillisReader extends UnboxedReader<Long> {
    TimeMillisReader(ColumnDescriptor desc) {
      super(desc);
    }

    @Override
    public long readLong() {
      // NOTE(review): only readLong() is overridden here; confirm that the boxed read path of
      // UnboxedReader also goes through this method so the scaling is always applied.
      return 1000L * column.nextInteger();
    }
  }

  /**
   * Reads Parquet {@code TIMESTAMP_MILLIS} values and scales them by 1000 (milliseconds to
   * microseconds).
   */
  public static class TimestampMillisReader extends UnboxedReader<Long> {
    TimestampMillisReader(ColumnDescriptor desc) {
      super(desc);
    }

    @Override
    public long readLong() {
      // NOTE(review): only readLong() is overridden here; confirm that the boxed read path of
      // UnboxedReader also goes through this method so the scaling is always applied.
      return 1000L * column.nextLong();
    }
  }

  static class RecordReader extends StructReader {
    private final Schema schema;

    RecordReader(List types,
                 List> readers,
                 Schema schema) {
      super(types, readers);
      this.schema = schema;
    }

    @Override
    protected Record newStructData(Record reuse) {
      if (reuse != null) {
        return reuse;
      } else {
        return new Record(schema);
      }
    }

    @Override
    @SuppressWarnings("unchecked")
    protected Object getField(Record intermediate, int pos) {
      return intermediate.get(pos);
    }

    @Override
    protected Record buildStruct(Record struct) {
      return struct;
    }

    @Override
    protected void set(Record struct, int pos, Object value) {
      struct.put(pos, value);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy