All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.iceberg.DataFiles Maven / Gradle / Ivy

There is a newer version: 1.7.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.FileStatus;
import org.apache.iceberg.encryption.EncryptedOutputFile;
import org.apache.iceberg.encryption.EncryptionKeyMetadata;
import org.apache.iceberg.hadoop.HadoopInputFile;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.util.ByteBuffers;

public class DataFiles {

  private DataFiles() {}

  static PartitionData newPartitionData(PartitionSpec spec) {
    return new PartitionData(spec.partitionType());
  }

  static PartitionData copyPartitionData(
      PartitionSpec spec, StructLike partitionData, PartitionData reuse) {
    Preconditions.checkArgument(
        spec.isPartitioned(), "Can't copy partition data to a unpartitioned table");
    PartitionData data = reuse;
    if (data == null) {
      data = newPartitionData(spec);
    }

    Class[] javaClasses = spec.javaClasses();
    List fields = spec.fields();
    for (int i = 0; i < fields.size(); i += 1) {
      data.set(i, partitionData.get(i, javaClasses[i]));
    }

    return data;
  }

  static PartitionData fillFromPath(PartitionSpec spec, String partitionPath, PartitionData reuse) {
    PartitionData data = reuse;
    if (data == null) {
      data = newPartitionData(spec);
    }

    String[] partitions = partitionPath.split("/", -1);
    Preconditions.checkArgument(
        partitions.length <= spec.fields().size(),
        "Invalid partition data, too many fields (expecting %s): %s",
        spec.fields().size(),
        partitionPath);
    Preconditions.checkArgument(
        partitions.length >= spec.fields().size(),
        "Invalid partition data, not enough fields (expecting %s): %s",
        spec.fields().size(),
        partitionPath);

    for (int i = 0; i < partitions.length; i += 1) {
      PartitionField field = spec.fields().get(i);
      String[] parts = partitions[i].split("=", 2);
      Preconditions.checkArgument(
          parts.length == 2 && parts[0] != null && field.name().equals(parts[0]),
          "Invalid partition: %s",
          partitions[i]);

      data.set(i, Conversions.fromPartitionString(data.getType(i), parts[1]));
    }

    return data;
  }

  static PartitionData fillFromValues(
      PartitionSpec spec, List partitionValues, PartitionData reuse) {
    PartitionData data = reuse;
    if (data == null) {
      data = newPartitionData(spec);
    }

    Preconditions.checkArgument(
        partitionValues.size() == spec.fields().size(),
        "Invalid partition data, expecting %s fields, found %s",
        spec.fields().size(),
        partitionValues.size());

    for (int i = 0; i < partitionValues.size(); i += 1) {
      data.set(i, Conversions.fromPartitionString(data.getType(i), partitionValues.get(i)));
    }

    return data;
  }

  public static PartitionData data(PartitionSpec spec, String partitionPath) {
    return fillFromPath(spec, partitionPath, null);
  }

  public static PartitionData copy(PartitionSpec spec, StructLike partition) {
    return copyPartitionData(spec, partition, null);
  }

  public static DataFile fromManifest(ManifestFile manifest) {
    Preconditions.checkArgument(
        manifest.addedFilesCount() != null && manifest.existingFilesCount() != null,
        "Cannot create data file from manifest: data file counts are missing.");

    return DataFiles.builder(PartitionSpec.unpartitioned())
        .withPath(manifest.path())
        .withFormat(FileFormat.AVRO)
        .withRecordCount(manifest.addedFilesCount() + manifest.existingFilesCount())
        .withFileSizeInBytes(manifest.length())
        .build();
  }

  public static Builder builder(PartitionSpec spec) {
    return new Builder(spec);
  }

  public static class Builder {
    private final PartitionSpec spec;
    private final boolean isPartitioned;
    private final int specId;
    private PartitionData partitionData;
    private String filePath = null;
    private FileFormat format = null;
    private long recordCount = -1L;
    private long fileSizeInBytes = -1L;

    // optional fields
    private Map columnSizes = null;
    private Map valueCounts = null;
    private Map nullValueCounts = null;
    private Map nanValueCounts = null;
    private Map lowerBounds = null;
    private Map upperBounds = null;
    private ByteBuffer keyMetadata = null;
    private List splitOffsets = null;
    private Integer sortOrderId = SortOrder.unsorted().orderId();

    public Builder(PartitionSpec spec) {
      this.spec = spec;
      this.specId = spec.specId();
      this.isPartitioned = spec.isPartitioned();
      this.partitionData = isPartitioned ? newPartitionData(spec) : null;
    }

    public void clear() {
      if (isPartitioned) {
        partitionData.clear();
      }
      this.filePath = null;
      this.format = null;
      this.recordCount = -1L;
      this.fileSizeInBytes = -1L;
      this.columnSizes = null;
      this.valueCounts = null;
      this.nullValueCounts = null;
      this.nanValueCounts = null;
      this.lowerBounds = null;
      this.upperBounds = null;
      this.splitOffsets = null;
      this.sortOrderId = SortOrder.unsorted().orderId();
    }

    public Builder copy(DataFile toCopy) {
      if (isPartitioned) {
        Preconditions.checkState(
            specId == toCopy.specId(), "Cannot copy a DataFile with a different spec");
        this.partitionData = copyPartitionData(spec, toCopy.partition(), partitionData);
      }
      this.filePath = toCopy.path().toString();
      this.format = toCopy.format();
      this.recordCount = toCopy.recordCount();
      this.fileSizeInBytes = toCopy.fileSizeInBytes();
      this.columnSizes = toCopy.columnSizes();
      this.valueCounts = toCopy.valueCounts();
      this.nullValueCounts = toCopy.nullValueCounts();
      this.nanValueCounts = toCopy.nanValueCounts();
      this.lowerBounds = toCopy.lowerBounds();
      this.upperBounds = toCopy.upperBounds();
      this.keyMetadata =
          toCopy.keyMetadata() == null ? null : ByteBuffers.copy(toCopy.keyMetadata());
      this.splitOffsets =
          toCopy.splitOffsets() == null ? null : ImmutableList.copyOf(toCopy.splitOffsets());
      this.sortOrderId = toCopy.sortOrderId();
      return this;
    }

    public Builder withStatus(FileStatus stat) {
      this.filePath = stat.getPath().toString();
      this.fileSizeInBytes = stat.getLen();
      return this;
    }

    public Builder withInputFile(InputFile file) {
      if (file instanceof HadoopInputFile) {
        return withStatus(((HadoopInputFile) file).getStat());
      }

      this.filePath = file.location();
      this.fileSizeInBytes = file.getLength();
      return this;
    }

    public Builder withEncryptedOutputFile(EncryptedOutputFile newEncryptedFile) {
      withInputFile(newEncryptedFile.encryptingOutputFile().toInputFile());
      withEncryptionKeyMetadata(newEncryptedFile.keyMetadata());
      return this;
    }

    public Builder withPath(String newFilePath) {
      this.filePath = newFilePath;
      return this;
    }

    public Builder withFormat(String newFormat) {
      this.format = FileFormat.fromString(newFormat);
      return this;
    }

    public Builder withFormat(FileFormat newFormat) {
      this.format = newFormat;
      return this;
    }

    public Builder withPartition(StructLike newPartition) {
      if (isPartitioned) {
        this.partitionData = copyPartitionData(spec, newPartition, partitionData);
      }
      return this;
    }

    public Builder withRecordCount(long newRecordCount) {
      this.recordCount = newRecordCount;
      return this;
    }

    public Builder withFileSizeInBytes(long newFileSizeInBytes) {
      this.fileSizeInBytes = newFileSizeInBytes;
      return this;
    }

    public Builder withPartitionPath(String newPartitionPath) {
      Preconditions.checkArgument(
          isPartitioned || newPartitionPath.isEmpty(),
          "Cannot add partition data for an unpartitioned table");
      if (!newPartitionPath.isEmpty()) {
        this.partitionData = fillFromPath(spec, newPartitionPath, partitionData);
      }
      return this;
    }

    public Builder withPartitionValues(List partitionValues) {
      Preconditions.checkArgument(
          isPartitioned ^ partitionValues.isEmpty(),
          "Table must be partitioned or partition values must be empty");
      if (!partitionValues.isEmpty()) {
        this.partitionData = fillFromValues(spec, partitionValues, partitionData);
      }
      return this;
    }

    public Builder withMetrics(Metrics metrics) {
      // check for null to avoid NPE when unboxing
      this.recordCount = metrics.recordCount() == null ? -1 : metrics.recordCount();
      this.columnSizes = metrics.columnSizes();
      this.valueCounts = metrics.valueCounts();
      this.nullValueCounts = metrics.nullValueCounts();
      this.nanValueCounts = metrics.nanValueCounts();
      this.lowerBounds = metrics.lowerBounds();
      this.upperBounds = metrics.upperBounds();
      return this;
    }

    public Builder withSplitOffsets(List offsets) {
      if (offsets != null) {
        this.splitOffsets = ImmutableList.copyOf(offsets);
      } else {
        this.splitOffsets = null;
      }
      return this;
    }

    /** @deprecated since 1.5.0, will be removed in 1.6.0; must not be set for data files. */
    @Deprecated
    public Builder withEqualityFieldIds(List equalityIds) {
      throw new UnsupportedOperationException("Equality field IDs must not be set for data files");
    }

    public Builder withEncryptionKeyMetadata(ByteBuffer newKeyMetadata) {
      this.keyMetadata = newKeyMetadata;
      return this;
    }

    public Builder withEncryptionKeyMetadata(EncryptionKeyMetadata newKeyMetadata) {
      return withEncryptionKeyMetadata(newKeyMetadata.buffer());
    }

    public Builder withSortOrder(SortOrder newSortOrder) {
      if (newSortOrder != null) {
        this.sortOrderId = newSortOrder.orderId();
      }
      return this;
    }

    public DataFile build() {
      Preconditions.checkArgument(filePath != null, "File path is required");
      if (format == null) {
        this.format = FileFormat.fromFileName(filePath);
      }
      Preconditions.checkArgument(format != null, "File format is required");
      Preconditions.checkArgument(fileSizeInBytes >= 0, "File size is required");
      Preconditions.checkArgument(recordCount >= 0, "Record count is required");

      return new GenericDataFile(
          specId,
          filePath,
          format,
          isPartitioned ? partitionData.copy() : null,
          fileSizeInBytes,
          new Metrics(
              recordCount,
              columnSizes,
              valueCounts,
              nullValueCounts,
              nanValueCounts,
              lowerBounds,
              upperBounds),
          keyMetadata,
          splitOffsets,
          sortOrderId);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy