All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.iceberg.BaseFile Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.specific.SpecificData;
import org.apache.iceberg.avro.AvroSchemaUtil;
import org.apache.iceberg.avro.SupportsIndexProjection;
import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.ArrayUtil;
import org.apache.iceberg.util.ByteBuffers;
import org.apache.iceberg.util.SerializableMap;

/** Base class for both {@link DataFile} and {@link DeleteFile}. */
abstract class BaseFile extends SupportsIndexProjection
    implements ContentFile,
        IndexedRecord,
        StructLike,
        SpecificData.SchemaConstructable,
        Serializable {
  private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values();
  static final Types.StructType EMPTY_STRUCT_TYPE = Types.StructType.of();
  static final PartitionData EMPTY_PARTITION_DATA =
      new PartitionData(EMPTY_STRUCT_TYPE) {
        @Override
        public PartitionData copy() {
          return this; // this does not change
        }
      };

  private Types.StructType partitionType;

  private Long fileOrdinal = null;
  private String manifestLocation = null;
  private int partitionSpecId = -1;
  private FileContent content = FileContent.DATA;
  private String filePath = null;
  private FileFormat format = null;
  private PartitionData partitionData = null;
  private Long recordCount = null;
  private long fileSizeInBytes = -1L;
  private Long dataSequenceNumber = null;
  private Long fileSequenceNumber = null;

  // optional fields
  private Map columnSizes = null;
  private Map valueCounts = null;
  private Map nullValueCounts = null;
  private Map nanValueCounts = null;
  private Map lowerBounds = null;
  private Map upperBounds = null;
  private long[] splitOffsets = null;
  private int[] equalityIds = null;
  private byte[] keyMetadata = null;
  private Integer sortOrderId;

  // cached schema
  private transient Schema avroSchema = null;

  // struct type that corresponds to the positions used for internalGet and internalSet
  private static final Types.StructType BASE_TYPE =
      Types.StructType.of(
          DataFile.CONTENT,
          DataFile.FILE_PATH,
          DataFile.FILE_FORMAT,
          DataFile.SPEC_ID,
          Types.NestedField.required(
              DataFile.PARTITION_ID,
              DataFile.PARTITION_NAME,
              EMPTY_STRUCT_TYPE,
              DataFile.PARTITION_DOC),
          DataFile.RECORD_COUNT,
          DataFile.FILE_SIZE,
          DataFile.COLUMN_SIZES,
          DataFile.VALUE_COUNTS,
          DataFile.NULL_VALUE_COUNTS,
          DataFile.NAN_VALUE_COUNTS,
          DataFile.LOWER_BOUNDS,
          DataFile.UPPER_BOUNDS,
          DataFile.KEY_METADATA,
          DataFile.SPLIT_OFFSETS,
          DataFile.EQUALITY_IDS,
          DataFile.SORT_ORDER_ID,
          MetadataColumns.ROW_POSITION);

  /** Used by Avro reflection to instantiate this class when reading manifest files. */
  BaseFile(Schema avroSchema) {
    this(AvroSchemaUtil.convert(avroSchema).asStructType());
    this.avroSchema = avroSchema;
  }

  /** Used by internal readers to instantiate this class with a projection schema. */
  BaseFile(Types.StructType projection) {
    super(BASE_TYPE, projection);
    this.avroSchema = AvroSchemaUtil.convert(projection, "data_file");

    // partition type may be null if the field was not projected
    Type partType = projection.fieldType("partition");
    if (partType != null) {
      this.partitionType = partType.asNestedType().asStructType();
    } else {
      this.partitionType = EMPTY_STRUCT_TYPE;
    }

    this.partitionData = new PartitionData(partitionType);
  }

  BaseFile(
      int specId,
      FileContent content,
      String filePath,
      FileFormat format,
      PartitionData partition,
      long fileSizeInBytes,
      long recordCount,
      Map columnSizes,
      Map valueCounts,
      Map nullValueCounts,
      Map nanValueCounts,
      Map lowerBounds,
      Map upperBounds,
      List splitOffsets,
      int[] equalityFieldIds,
      Integer sortOrderId,
      ByteBuffer keyMetadata) {
    super(BASE_TYPE.fields().size());
    this.partitionSpecId = specId;
    this.content = content;
    this.filePath = filePath;
    this.format = format;

    // this constructor is used by DataFiles.Builder, which passes null for unpartitioned data
    if (partition == null) {
      this.partitionData = EMPTY_PARTITION_DATA;
      this.partitionType = EMPTY_PARTITION_DATA.getPartitionType();
    } else {
      this.partitionData = partition;
      this.partitionType = partition.getPartitionType();
    }

    // this will throw NPE if metrics.recordCount is null
    this.recordCount = recordCount;
    this.fileSizeInBytes = fileSizeInBytes;
    this.columnSizes = columnSizes;
    this.valueCounts = valueCounts;
    this.nullValueCounts = nullValueCounts;
    this.nanValueCounts = nanValueCounts;
    this.lowerBounds = SerializableByteBufferMap.wrap(lowerBounds);
    this.upperBounds = SerializableByteBufferMap.wrap(upperBounds);
    this.splitOffsets = ArrayUtil.toLongArray(splitOffsets);
    this.equalityIds = equalityFieldIds;
    this.sortOrderId = sortOrderId;
    this.keyMetadata = ByteBuffers.toByteArray(keyMetadata);
  }

  /**
   * Copy constructor.
   *
   * @param toCopy a generic data file to copy.
   * @param copyStats whether to copy all fields or to drop column-level stats
   * @param requestedColumnIds column ids for which to keep stats. If null then every
   *     column stat is kept.
   */
  BaseFile(BaseFile toCopy, boolean copyStats, Set requestedColumnIds) {
    super(toCopy);
    this.fileOrdinal = toCopy.fileOrdinal;
    this.manifestLocation = toCopy.manifestLocation;
    this.partitionSpecId = toCopy.partitionSpecId;
    this.content = toCopy.content;
    this.filePath = toCopy.filePath;
    this.format = toCopy.format;
    this.partitionData = toCopy.partitionData.copy();
    this.partitionType = toCopy.partitionType;
    this.recordCount = toCopy.recordCount;
    this.fileSizeInBytes = toCopy.fileSizeInBytes;
    if (copyStats) {
      this.columnSizes = copyMap(toCopy.columnSizes, requestedColumnIds);
      this.valueCounts = copyMap(toCopy.valueCounts, requestedColumnIds);
      this.nullValueCounts = copyMap(toCopy.nullValueCounts, requestedColumnIds);
      this.nanValueCounts = copyMap(toCopy.nanValueCounts, requestedColumnIds);
      this.lowerBounds = copyByteBufferMap(toCopy.lowerBounds, requestedColumnIds);
      this.upperBounds = copyByteBufferMap(toCopy.upperBounds, requestedColumnIds);
    } else {
      this.columnSizes = null;
      this.valueCounts = null;
      this.nullValueCounts = null;
      this.nanValueCounts = null;
      this.lowerBounds = null;
      this.upperBounds = null;
    }
    this.keyMetadata =
        toCopy.keyMetadata == null
            ? null
            : Arrays.copyOf(toCopy.keyMetadata, toCopy.keyMetadata.length);
    this.splitOffsets =
        toCopy.splitOffsets == null
            ? null
            : Arrays.copyOf(toCopy.splitOffsets, toCopy.splitOffsets.length);
    this.equalityIds =
        toCopy.equalityIds != null
            ? Arrays.copyOf(toCopy.equalityIds, toCopy.equalityIds.length)
            : null;
    this.sortOrderId = toCopy.sortOrderId;
    this.dataSequenceNumber = toCopy.dataSequenceNumber;
    this.fileSequenceNumber = toCopy.fileSequenceNumber;
  }

  /** Constructor for Java serialization. */
  BaseFile() {
    super(BASE_TYPE.fields().size());
  }

  @Override
  public int specId() {
    return partitionSpecId;
  }

  void setSpecId(int specId) {
    this.partitionSpecId = specId;
  }

  @Override
  public Long dataSequenceNumber() {
    return dataSequenceNumber;
  }

  public void setDataSequenceNumber(Long dataSequenceNumber) {
    this.dataSequenceNumber = dataSequenceNumber;
  }

  void setManifestLocation(String manifestLocation) {
    this.manifestLocation = manifestLocation;
  }

  @Override
  public Long fileSequenceNumber() {
    return fileSequenceNumber;
  }

  public void setFileSequenceNumber(Long fileSequenceNumber) {
    this.fileSequenceNumber = fileSequenceNumber;
  }

  protected abstract Schema getAvroSchema(Types.StructType partitionStruct);

  @Override
  public Schema getSchema() {
    if (avroSchema == null) {
      this.avroSchema = getAvroSchema(partitionType);
    }
    return avroSchema;
  }

  @Override
  public void put(int i, Object value) {
    set(i, value);
  }

  @Override
  protected  void internalSet(int pos, T value) {
    switch (pos) {
      case 0:
        this.content = value != null ? FILE_CONTENT_VALUES[(Integer) value] : FileContent.DATA;
        return;
      case 1:
        // always coerce to String for Serializable
        this.filePath = value.toString();
        return;
      case 2:
        this.format = FileFormat.fromString(value.toString());
        return;
      case 3:
        this.partitionSpecId = (value != null) ? (Integer) value : -1;
        return;
      case 4:
        this.partitionData = (PartitionData) value;
        return;
      case 5:
        this.recordCount = (Long) value;
        return;
      case 6:
        this.fileSizeInBytes = (Long) value;
        return;
      case 7:
        this.columnSizes = (Map) value;
        return;
      case 8:
        this.valueCounts = (Map) value;
        return;
      case 9:
        this.nullValueCounts = (Map) value;
        return;
      case 10:
        this.nanValueCounts = (Map) value;
        return;
      case 11:
        this.lowerBounds = SerializableByteBufferMap.wrap((Map) value);
        return;
      case 12:
        this.upperBounds = SerializableByteBufferMap.wrap((Map) value);
        return;
      case 13:
        this.keyMetadata = ByteBuffers.toByteArray((ByteBuffer) value);
        return;
      case 14:
        this.splitOffsets = ArrayUtil.toLongArray((List) value);
        return;
      case 15:
        this.equalityIds = ArrayUtil.toIntArray((List) value);
        return;
      case 16:
        this.sortOrderId = (Integer) value;
        return;
      case 17:
        this.fileOrdinal = (long) value;
        return;
      default:
        // ignore the object, it must be from a newer version of the format
    }
  }

  @Override
  protected  T internalGet(int pos, Class javaClass) {
    return javaClass.cast(getByPos(pos));
  }

  private Object getByPos(int basePos) {
    switch (basePos) {
      case 0:
        return content.id();
      case 1:
        return filePath;
      case 2:
        return format != null ? format.toString() : null;
      case 3:
        return partitionSpecId;
      case 4:
        return partitionData;
      case 5:
        return recordCount;
      case 6:
        return fileSizeInBytes;
      case 7:
        return columnSizes;
      case 8:
        return valueCounts;
      case 9:
        return nullValueCounts;
      case 10:
        return nanValueCounts;
      case 11:
        return lowerBounds;
      case 12:
        return upperBounds;
      case 13:
        return keyMetadata();
      case 14:
        return splitOffsets();
      case 15:
        return equalityFieldIds();
      case 16:
        return sortOrderId;
      case 17:
        return fileOrdinal;
      default:
        throw new UnsupportedOperationException("Unknown field ordinal: " + basePos);
    }
  }

  @Override
  public Object get(int pos) {
    return get(pos, Object.class);
  }

  @Override
  public int size() {
    return DataFile.getType(EMPTY_STRUCT_TYPE).fields().size();
  }

  @Override
  public Long pos() {
    return fileOrdinal;
  }

  @Override
  public String manifestLocation() {
    return manifestLocation;
  }

  @Override
  public FileContent content() {
    return content;
  }

  @Override
  public CharSequence path() {
    return filePath;
  }

  @Override
  public FileFormat format() {
    return format;
  }

  @Override
  public StructLike partition() {
    return partitionData;
  }

  @Override
  public long recordCount() {
    return recordCount;
  }

  @Override
  public long fileSizeInBytes() {
    return fileSizeInBytes;
  }

  @Override
  public Map columnSizes() {
    return toReadableMap(columnSizes);
  }

  @Override
  public Map valueCounts() {
    return toReadableMap(valueCounts);
  }

  @Override
  public Map nullValueCounts() {
    return toReadableMap(nullValueCounts);
  }

  @Override
  public Map nanValueCounts() {
    return toReadableMap(nanValueCounts);
  }

  @Override
  public Map lowerBounds() {
    return toReadableByteBufferMap(lowerBounds);
  }

  @Override
  public Map upperBounds() {
    return toReadableByteBufferMap(upperBounds);
  }

  @Override
  public ByteBuffer keyMetadata() {
    return keyMetadata != null ? ByteBuffer.wrap(keyMetadata) : null;
  }

  @Override
  public List splitOffsets() {
    if (hasWellDefinedOffsets()) {
      return ArrayUtil.toUnmodifiableLongList(splitOffsets);
    }

    return null;
  }

  long[] splitOffsetArray() {
    if (hasWellDefinedOffsets()) {
      return splitOffsets;
    }

    return null;
  }

  private boolean hasWellDefinedOffsets() {
    // If the last split offset is past the file size this means the split offsets are corrupted and
    // should not be used
    return splitOffsets != null
        && splitOffsets.length != 0
        && splitOffsets[splitOffsets.length - 1] < fileSizeInBytes;
  }

  @Override
  public List equalityFieldIds() {
    return ArrayUtil.toIntList(equalityIds);
  }

  @Override
  public Integer sortOrderId() {
    return sortOrderId;
  }

  private static  Map copyMap(Map map, Set keys) {
    return keys == null ? SerializableMap.copyOf(map) : SerializableMap.filteredCopyOf(map, keys);
  }

  private static Map copyByteBufferMap(
      Map map, Set keys) {
    return SerializableByteBufferMap.wrap(copyMap(map, keys));
  }

  private static  Map toReadableMap(Map map) {
    if (map == null) {
      return null;
    } else if (map instanceof SerializableMap) {
      return ((SerializableMap) map).immutableMap();
    } else {
      return Collections.unmodifiableMap(map);
    }
  }

  private static Map toReadableByteBufferMap(Map map) {
    if (map == null) {
      return null;
    } else if (map instanceof SerializableByteBufferMap) {
      return ((SerializableByteBufferMap) map).immutableMap();
    } else {
      return Collections.unmodifiableMap(map);
    }
  }

  @Override
  public String toString() {
    return MoreObjects.toStringHelper(this)
        .add("content", content.toString().toLowerCase(Locale.ROOT))
        .add("file_path", filePath)
        .add("file_format", format)
        .add("spec_id", specId())
        .add("partition", partitionData)
        .add("record_count", recordCount)
        .add("file_size_in_bytes", fileSizeInBytes)
        .add("column_sizes", columnSizes)
        .add("value_counts", valueCounts)
        .add("null_value_counts", nullValueCounts)
        .add("nan_value_counts", nanValueCounts)
        .add("lower_bounds", lowerBounds)
        .add("upper_bounds", upperBounds)
        .add("key_metadata", keyMetadata == null ? "null" : "(redacted)")
        .add("split_offsets", splitOffsets == null ? "null" : splitOffsets())
        .add("equality_ids", equalityIds == null ? "null" : equalityFieldIds())
        .add("sort_order_id", sortOrderId)
        .add("data_sequence_number", dataSequenceNumber == null ? "null" : dataSequenceNumber)
        .add("file_sequence_number", fileSequenceNumber == null ? "null" : fileSequenceNumber)
        .toString();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy