All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.iceberg.BaseFile Maven / Gradle / Ivy

There is a newer version: 1.6.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg;

import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.specific.SpecificData;
import org.apache.iceberg.avro.AvroSchemaUtil;
import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.ByteBuffers;

/**
 * Base class for both {@link DataFile} and {@link DeleteFile}.
 *
 * <p>Instances hold the metadata of a single content file (path, format, partition, record count,
 * size and optional column-level statistics) and expose it three ways: through the typed accessors
 * of {@link ContentFile}, positionally through Avro's {@link IndexedRecord}, and positionally
 * through {@link StructLike}. Positional access uses the ordinals handled in {@link #put(int, Object)}
 * and {@link #get(int)}; when the instance was created from a projected Avro schema, incoming
 * positions are first translated through {@code fromProjectionPos}.
 *
 * @param <F> the concrete file type, passed through to {@link ContentFile}
 */
abstract class BaseFile<F>
    implements ContentFile<F>, IndexedRecord, StructLike, SpecificData.SchemaConstructable, Serializable {
  /** Struct type with no fields, used as the partition type of unpartitioned files. */
  static final Types.StructType EMPTY_STRUCT_TYPE = Types.StructType.of();

  /** Shared partition tuple for unpartitioned files; {@code copy()} returns the same instance. */
  static final PartitionData EMPTY_PARTITION_DATA = new PartitionData(EMPTY_STRUCT_TYPE) {
    @Override
    public PartitionData copy() {
      return this; // this does not change
    }
  };

  // maps a position in the projected schema to the position used by put/get; null when not projected
  private int[] fromProjectionPos;
  private Types.StructType partitionType;

  private FileContent content = FileContent.DATA;
  private String filePath = null;
  private FileFormat format = null;
  private PartitionData partitionData = null;
  // boxed so that "not set" can be represented; recordCount() unboxes it
  private Long recordCount = null;
  private long fileSizeInBytes = -1L;

  // optional fields
  private Map<Integer, Long> columnSizes = null;
  private Map<Integer, Long> valueCounts = null;
  private Map<Integer, Long> nullValueCounts = null;
  private Map<Integer, ByteBuffer> lowerBounds = null;
  private Map<Integer, ByteBuffer> upperBounds = null;
  private List<Long> splitOffsets = null;
  // kept as a byte[] rather than a ByteBuffer; accessors wrap it on demand
  private byte[] keyMetadata = null;

  // cached schema
  private transient Schema avroSchema = null;

  /**
   * Used by Avro reflection to instantiate this class when reading manifest files.
   *
   * <p>Converts the Avro schema to a struct type, takes the partition type from its
   * {@code "partition"} field (falling back to {@link #EMPTY_STRUCT_TYPE} when that field is
   * absent), and records for every field of the given schema the position of the field with the
   * same id in {@code DataFile.getType(partitionType)}.
   *
   * @param avroSchema the Avro schema records will be read with; it may be a projection
   * @throws IllegalArgumentException if a field of the schema has no field with a matching id in
   *     the full type
   */
  BaseFile(Schema avroSchema) {
    this.avroSchema = avroSchema;

    Types.StructType schema = AvroSchemaUtil.convert(avroSchema).asNestedType().asStructType();

    // partition type may be null if the field was not projected
    Type partType = schema.fieldType("partition");
    if (partType != null) {
      this.partitionType = partType.asNestedType().asStructType();
    } else {
      this.partitionType = EMPTY_STRUCT_TYPE;
    }

    List<Types.NestedField> fields = schema.fields();
    List<Types.NestedField> allFields = DataFile.getType(partitionType).fields();
    this.fromProjectionPos = new int[fields.size()];
    for (int i = 0; i < fromProjectionPos.length; i += 1) {
      boolean found = false;
      // match projected fields to full-schema fields by id, not by name or position
      for (int j = 0; j < allFields.size(); j += 1) {
        if (fields.get(i).fieldId() == allFields.get(j).fieldId()) {
          found = true;
          fromProjectionPos[i] = j;
        }
      }

      if (!found) {
        throw new IllegalArgumentException("Cannot find projected field: " + fields.get(i));
      }
    }

    this.partitionData = new PartitionData(partitionType);
  }

  /**
   * Creates a file from explicit values.
   *
   * @param content the kind of content stored in the file
   * @param filePath location of the file
   * @param format format of the file
   * @param partition partition tuple, or null for unpartitioned data
   * @param fileSizeInBytes size of the file in bytes
   * @param recordCount number of records in the file
   * @param columnSizes optional map of column sizes, stored as passed
   * @param valueCounts optional map of value counts, stored as passed
   * @param nullValueCounts optional map of null value counts, stored as passed
   * @param lowerBounds optional map of lower bounds, wrapped with {@code SerializableByteBufferMap}
   * @param upperBounds optional map of upper bounds, wrapped with {@code SerializableByteBufferMap}
   * @param splitOffsets optional list of split offsets, copied into an unmodifiable list
   * @param keyMetadata optional key metadata, converted to a byte array
   */
  BaseFile(FileContent content, String filePath, FileFormat format,
           PartitionData partition, long fileSizeInBytes, long recordCount,
           Map<Integer, Long> columnSizes, Map<Integer, Long> valueCounts, Map<Integer, Long> nullValueCounts,
           Map<Integer, ByteBuffer> lowerBounds, Map<Integer, ByteBuffer> upperBounds, List<Long> splitOffsets,
           ByteBuffer keyMetadata) {
    this.content = content;
    this.filePath = filePath;
    this.format = format;

    // this constructor is used by DataFiles.Builder, which passes null for unpartitioned data
    if (partition == null) {
      this.partitionData = EMPTY_PARTITION_DATA;
      this.partitionType = EMPTY_PARTITION_DATA.getPartitionType();
    } else {
      this.partitionData = partition;
      this.partitionType = partition.getPartitionType();
    }

    // recordCount is a primitive here: a caller that unboxes a null Long into this parameter
    // fails with an NPE at its own call site, before this constructor runs
    this.recordCount = recordCount;
    this.fileSizeInBytes = fileSizeInBytes;
    this.columnSizes = columnSizes;
    this.valueCounts = valueCounts;
    this.nullValueCounts = nullValueCounts;
    this.lowerBounds = SerializableByteBufferMap.wrap(lowerBounds);
    this.upperBounds = SerializableByteBufferMap.wrap(upperBounds);
    this.splitOffsets = copy(splitOffsets);
    this.keyMetadata = ByteBuffers.toByteArray(keyMetadata);
  }

  /**
   * Copy constructor.
   *
   * <p>The partition tuple, key metadata and split offsets are always copied. The cached Avro
   * schema is not copied; the projection mapping is shared with the source.
   *
   * @param toCopy a generic data file to copy.
   * @param fullCopy whether to copy all fields or to drop column-level stats
   */
  BaseFile(BaseFile<F> toCopy, boolean fullCopy) {
    this.content = toCopy.content;
    this.filePath = toCopy.filePath;
    this.format = toCopy.format;
    this.partitionData = toCopy.partitionData.copy();
    this.partitionType = toCopy.partitionType;
    this.recordCount = toCopy.recordCount;
    this.fileSizeInBytes = toCopy.fileSizeInBytes;
    if (fullCopy) {
      // TODO: support lazy conversion to/from map
      this.columnSizes = copy(toCopy.columnSizes);
      this.valueCounts = copy(toCopy.valueCounts);
      this.nullValueCounts = copy(toCopy.nullValueCounts);
      this.lowerBounds = SerializableByteBufferMap.wrap(copy(toCopy.lowerBounds));
      this.upperBounds = SerializableByteBufferMap.wrap(copy(toCopy.upperBounds));
    } else {
      this.columnSizes = null;
      this.valueCounts = null;
      this.nullValueCounts = null;
      this.lowerBounds = null;
      this.upperBounds = null;
    }
    this.fromProjectionPos = toCopy.fromProjectionPos;
    this.keyMetadata = toCopy.keyMetadata == null ? null : Arrays.copyOf(toCopy.keyMetadata, toCopy.keyMetadata.length);
    this.splitOffsets = copy(toCopy.splitOffsets);
  }

  /**
   * Constructor for Java serialization.
   */
  BaseFile() {
  }

  /**
   * Produces the Avro schema for this file given its partition struct type.
   *
   * @param partitionStruct the partition type of this file
   * @return the Avro schema returned by {@link #getSchema()} when none has been cached yet
   */
  protected abstract Schema getAvroSchema(Types.StructType partitionStruct);

  /**
   * Returns the Avro schema of this record, creating and caching it on first use.
   *
   * @return the cached schema, or the result of {@link #getAvroSchema(Types.StructType)} for the
   *     current partition type if nothing was cached
   */
  @Override
  public Schema getSchema() {
    if (avroSchema == null) {
      this.avroSchema = getAvroSchema(partitionType);
    }
    return avroSchema;
  }

  /**
   * Sets the field at the given position.
   *
   * <p>Ordinals after projection mapping: 0 content, 1 file path, 2 format, 3 partition,
   * 4 record count, 5 file size, 6 column sizes, 7 value counts, 8 null value counts,
   * 9 lower bounds, 10 upper bounds, 11 key metadata, 12 split offsets. Any other ordinal is
   * silently ignored.
   *
   * @param i position of the field in the schema this instance was created with
   * @param value the value to store; a null content value is stored as {@code FileContent.DATA}
   */
  @Override
  @SuppressWarnings("unchecked")
  public void put(int i, Object value) {
    int pos = i;
    // if the schema was projected, map the incoming ordinal to the expected one
    if (fromProjectionPos != null) {
      pos = fromProjectionPos[i];
    }
    switch (pos) {
      case 0:
        this.content = value != null ? FileContent.values()[(Integer) value] : FileContent.DATA;
        return;
      case 1:
        // always coerce to String for Serializable
        this.filePath = value.toString();
        return;
      case 2:
        this.format = FileFormat.valueOf(value.toString());
        return;
      case 3:
        this.partitionData = (PartitionData) value;
        return;
      case 4:
        this.recordCount = (Long) value;
        return;
      case 5:
        this.fileSizeInBytes = (Long) value;
        return;
      case 6:
        this.columnSizes = (Map<Integer, Long>) value;
        return;
      case 7:
        this.valueCounts = (Map<Integer, Long>) value;
        return;
      case 8:
        this.nullValueCounts = (Map<Integer, Long>) value;
        return;
      case 9:
        this.lowerBounds = SerializableByteBufferMap.wrap((Map<Integer, ByteBuffer>) value);
        return;
      case 10:
        this.upperBounds = SerializableByteBufferMap.wrap((Map<Integer, ByteBuffer>) value);
        return;
      case 11:
        this.keyMetadata = ByteBuffers.toByteArray((ByteBuffer) value);
        return;
      case 12:
        this.splitOffsets = (List<Long>) value;
        return;
      default:
        // ignore the object, it must be from a newer version of the format
    }
  }

  /**
   * {@link StructLike} setter; delegates to {@link #put(int, Object)}.
   *
   * @param pos position of the field
   * @param value the value to store
   * @param <T> type of the value
   */
  @Override
  public <T> void set(int pos, T value) {
    put(pos, value);
  }

  /**
   * Returns the field at the given position, using the same ordinals as {@link #put(int, Object)}.
   *
   * <p>The content is returned as its id, the format as its string form (or null when unset), and
   * the key metadata as a {@link ByteBuffer} wrapping the stored bytes (or null when unset).
   *
   * @param i position of the field in the schema this instance was created with
   * @return the stored value for that field
   * @throws UnsupportedOperationException if the mapped ordinal is not a known field
   */
  @Override
  public Object get(int i) {
    int pos = i;
    // if the schema was projected, map the incoming ordinal to the expected one
    if (fromProjectionPos != null) {
      pos = fromProjectionPos[i];
    }
    switch (pos) {
      case 0:
        return content.id();
      case 1:
        return filePath;
      case 2:
        return format != null ? format.toString() : null;
      case 3:
        return partitionData;
      case 4:
        return recordCount;
      case 5:
        return fileSizeInBytes;
      case 6:
        return columnSizes;
      case 7:
        return valueCounts;
      case 8:
        return nullValueCounts;
      case 9:
        return lowerBounds;
      case 10:
        return upperBounds;
      case 11:
        return keyMetadata != null ? ByteBuffer.wrap(keyMetadata) : null;
      case 12:
        return splitOffsets;
      default:
        throw new UnsupportedOperationException("Unknown field ordinal: " + pos);
    }
  }

  /**
   * {@link StructLike} getter; reads the value with {@link #get(int)} and casts it.
   *
   * @param pos position of the field
   * @param javaClass the class to cast the value to
   * @param <T> type of the returned value
   * @return the value at {@code pos} cast to {@code javaClass}
   * @throws ClassCastException if the value is not an instance of {@code javaClass}
   */
  @Override
  public <T> T get(int pos, Class<T> javaClass) {
    return javaClass.cast(get(pos));
  }

  /**
   * Returns the number of fields in {@code DataFile.getType(EMPTY_STRUCT_TYPE)}.
   */
  @Override
  public int size() {
    return DataFile.getType(EMPTY_STRUCT_TYPE).fields().size();
  }

  /** Returns the kind of content stored in this file. */
  public FileContent content() {
    return content;
  }

  /** Returns the file path. */
  public CharSequence path() {
    return filePath;
  }

  /** Returns the file format, or null if it has not been set. */
  public FileFormat format() {
    return format;
  }

  /** Returns the partition tuple of this file. */
  public StructLike partition() {
    return partitionData;
  }

  /**
   * Returns the record count.
   *
   * @throws NullPointerException if no record count has been set, because the boxed value is
   *     unboxed on return
   */
  public long recordCount() {
    return recordCount;
  }

  /** Returns the file size in bytes; the field is initialized to -1. */
  public long fileSizeInBytes() {
    return fileSizeInBytes;
  }

  /** Returns the column sizes map, or null if not present. */
  public Map<Integer, Long> columnSizes() {
    return columnSizes;
  }

  /** Returns the value counts map, or null if not present. */
  public Map<Integer, Long> valueCounts() {
    return valueCounts;
  }

  /** Returns the null value counts map, or null if not present. */
  public Map<Integer, Long> nullValueCounts() {
    return nullValueCounts;
  }

  /** Returns the lower bounds map, or null if not present. */
  public Map<Integer, ByteBuffer> lowerBounds() {
    return lowerBounds;
  }

  /** Returns the upper bounds map, or null if not present. */
  public Map<Integer, ByteBuffer> upperBounds() {
    return upperBounds;
  }

  /** Returns a new buffer wrapping the stored key metadata bytes, or null if there are none. */
  public ByteBuffer keyMetadata() {
    return keyMetadata != null ? ByteBuffer.wrap(keyMetadata) : null;
  }

  /** Returns the split offsets, or null if not present. */
  public List<Long> splitOffsets() {
    return splitOffsets;
  }

  /**
   * Copies a map into a new unmodifiable map.
   *
   * @param map the map to copy, may be null
   * @return an unmodifiable copy holding the same entries, or null if {@code map} is null
   */
  private static <K, V> Map<K, V> copy(Map<K, V> map) {
    if (map != null) {
      Map<K, V> copy = Maps.newHashMapWithExpectedSize(map.size());
      copy.putAll(map);
      return Collections.unmodifiableMap(copy);
    }
    return null;
  }

  /**
   * Copies a list into a new unmodifiable list.
   *
   * @param list the list to copy, may be null
   * @return an unmodifiable copy holding the same elements, or null if {@code list} is null
   */
  private static <E> List<E> copy(List<E> list) {
    if (list != null) {
      List<E> copy = Lists.newArrayListWithExpectedSize(list.size());
      copy.addAll(list);
      return Collections.unmodifiableList(copy);
    }
    return null;
  }

  /**
   * Renders all fields for debugging. Key metadata is never printed: it appears as
   * {@code "(redacted)"} when present and {@code "null"} otherwise.
   */
  @Override
  public String toString() {
    return MoreObjects.toStringHelper(this)
        .add("content", content.toString().toLowerCase(Locale.ROOT))
        .add("file_path", filePath)
        .add("file_format", format)
        .add("partition", partitionData)
        .add("record_count", recordCount)
        .add("file_size_in_bytes", fileSizeInBytes)
        .add("column_sizes", columnSizes)
        .add("value_counts", valueCounts)
        .add("null_value_counts", nullValueCounts)
        .add("lower_bounds", lowerBounds)
        .add("upper_bounds", upperBounds)
        .add("key_metadata", keyMetadata == null ? "null" : "(redacted)")
        .add("split_offsets", splitOffsets == null ? "null" : splitOffsets)
        .toString();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy