/*
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *
 *  http://aws.amazon.com/apache2.0
 *
 * or in the "license" file accompanying this file. This file is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
package software.amazon.glue.operations;

import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.JsonNode;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import org.apache.commons.compress.utils.Lists;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.PartitionData;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.SingleValueParser;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.io.CharStreams;
import org.apache.iceberg.util.JsonUtil;

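/**
 * Serializes Iceberg {@link DataFile} entries to and from a standalone JSON "data manifest"
 * document. The document is a single JSON object whose "content-files" array holds one entry
 * per data file, mirroring the manifest fields: spec id, content type, file path, file format,
 * partition tuple, file size, metrics, key metadata, split offsets, equality ids, and sort
 * order id.
 */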
public class IcebergDataManifestParser {
  private static final String SPEC_ID = "spec-id";
  private static final String CONTENT = "content";
  private static final String FILE_PATH = "file-path";
  private static final String FILE_FORMAT = "file-format";
  private static final String PARTITION = "partition";
  private static final String RECORD_COUNT = "record-count";
  private static final String FILE_SIZE = "file-size-in-bytes";
  private static final String COLUMN_SIZES = "column-sizes";
  private static final String VALUE_COUNTS = "value-counts";
  private static final String NULL_VALUE_COUNTS = "null-value-counts";
  private static final String NAN_VALUE_COUNTS = "nan-value-counts";
  private static final String LOWER_BOUNDS = "lower-bounds";
  private static final String UPPER_BOUNDS = "upper-bounds";
  private static final String KEY_METADATA = "key-metadata";
  private static final String SPLIT_OFFSETS = "split-offsets";
  private static final String EQUALITY_IDS = "equality-ids";
  private static final String SORT_ORDER_ID = "sort-order-id";
  private static final String CONTENT_FILES = "content-files";

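  /**
   * Writes {@code dataFiles} to {@code outputFile} as a single JSON document. Every file must
   * reference a partition spec present in {@code specs}, keyed by spec id.
   */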
  public static void writeDataFilesToJson(
      OutputFile outputFile, List<DataFile> dataFiles, Map<Integer, PartitionSpec> specs) {
    try (JsonGenerator generator = JsonUtil.factory().createGenerator(outputFile.create())) {
      generator.writeStartObject();
      generator.writeArrayFieldStart(CONTENT_FILES);
      for (DataFile dataFile : dataFiles) {
        Preconditions.checkArgument(
            specs.containsKey(dataFile.specId()), "Datafile referencing invalid partition spec");
        writeDataFileToJson(generator, dataFile, specs.get(dataFile.specId()));
      }
      generator.writeEndArray();
      generator.writeEndObject();
    } catch (IOException e) {
      throw new RuntimeException("Failed to generate data manifest file", e);
    }
  }

  private static void writeDataFileToJson(
      JsonGenerator generator, DataFile dataFile, PartitionSpec spec) throws IOException {
    Preconditions.checkArgument(dataFile != null, "Invalid content file: null");
    Preconditions.checkArgument(spec != null, "Invalid partition spec: null");
    Preconditions.checkArgument(generator != null, "Invalid JSON generator: null");
    Preconditions.checkArgument(
        dataFile.specId() == spec.specId(),
        "Invalid partition spec id from content file: expected = %s, actual = %s",
        spec.specId(),
        dataFile.specId());
    Preconditions.checkArgument(
        spec.isPartitioned() == hasPartitionData(dataFile.partition()),
        "Invalid partition data from content file: expected = %s, actual = %s",
        spec.isPartitioned() ? "partitioned" : "unpartitioned",
        hasPartitionData(dataFile.partition()) ? "partitioned" : "unpartitioned");
    generator.writeStartObject();
    generator.writeNumberField(SPEC_ID, dataFile.specId());
    generator.writeStringField(CONTENT, dataFile.content().name());
    generator.writeStringField(FILE_PATH, dataFile.path().toString());
    generator.writeStringField(FILE_FORMAT, dataFile.format().name());
    if (dataFile.partition() != null) {
      generator.writeFieldName(PARTITION);
      SingleValueParser.toJson(spec.partitionType(), dataFile.partition(), generator);
    }
    generator.writeNumberField(FILE_SIZE, dataFile.fileSizeInBytes());
    metricsToJson(dataFile, generator);
    if (dataFile.keyMetadata() != null) {
      generator.writeFieldName(KEY_METADATA);
      SingleValueParser.toJson(DataFile.KEY_METADATA.type(), dataFile.keyMetadata(), generator);
    }

    if (dataFile.splitOffsets() != null) {
      JsonUtil.writeLongArray(SPLIT_OFFSETS, dataFile.splitOffsets(), generator);
    }

    if (dataFile.equalityFieldIds() != null) {
      JsonUtil.writeIntegerArray(EQUALITY_IDS, dataFile.equalityFieldIds(), generator);
    }

    if (dataFile.sortOrderId() != null) {
      generator.writeNumberField(SORT_ORDER_ID, dataFile.sortOrderId());
    }
    generator.writeEndObject();
  }

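  /**
   * Reads data files from a JSON document previously produced by
   * {@link #writeDataFilesToJson(OutputFile, List, Map)}, returning an empty list when the
   * "content-files" array is absent.
   */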
  public static List<DataFile> readDataFilesFromJson(
      InputFile inputFile, Map<Integer, PartitionSpec> specs) throws IOException {
    List<DataFile> dataFiles = Lists.newArrayList();
    try (InputStream inputStream = inputFile.newStream()) {
      String value =
          CharStreams.toString(new InputStreamReader(inputStream, StandardCharsets.UTF_8));
      JsonNode rootNode = JsonUtil.mapper().readTree(value);
      JsonNode contentFilesNode = rootNode.get(CONTENT_FILES);
      if (contentFilesNode != null && contentFilesNode.isArray()) {
        for (JsonNode dataFileNode : contentFilesNode) {
          dataFiles.add(parseDataFileFromJson(dataFileNode, specs));
        }
      }
    }
    return dataFiles;
  }

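  /**
   * Parses one data file entry from its JSON representation, resolving its partition spec
   * from {@code specs} by the entry's spec id.
   */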
  public static DataFile parseDataFileFromJson(
      JsonNode jsonNode, Map<Integer, PartitionSpec> specs) {
    Preconditions.checkArgument(jsonNode != null, "Invalid JSON node for content file: null");
    Preconditions.checkArgument(
        jsonNode.isObject(), "Invalid JSON node for content file: non-object (%s)", jsonNode);

    int specId = JsonUtil.getInt(SPEC_ID, jsonNode);
    PartitionSpec spec = specs.get(specId);
    Preconditions.checkArgument(
        spec != null, "Cannot find partition spec with id: %s", specId);
    String filePath = JsonUtil.getString(FILE_PATH, jsonNode);
    FileFormat fileFormat = FileFormat.fromString(JsonUtil.getString(FILE_FORMAT, jsonNode));

    PartitionData partitionData = null;
    if (jsonNode.has(PARTITION)) {
      partitionData = new PartitionData(spec.partitionType());
      StructLike structLike =
          (StructLike) SingleValueParser.fromJson(spec.partitionType(), jsonNode.get(PARTITION));
      Preconditions.checkState(
          partitionData.size() == structLike.size(),
          "Invalid partition data size: expected = %s, actual = %s",
          partitionData.size(),
          structLike.size());
      for (int pos = 0; pos < partitionData.size(); ++pos) {
        Class<?> javaClass = spec.partitionType().fields().get(pos).type().typeId().javaClass();
        partitionData.set(pos, structLike.get(pos, javaClass));
      }
    }

    long fileSizeInBytes = JsonUtil.getLong(FILE_SIZE, jsonNode);
    Metrics metrics = metricsFromJson(jsonNode);
    ByteBuffer keyMetadata = JsonUtil.getByteBufferOrNull(KEY_METADATA, jsonNode);
    List<Long> splitOffsets = JsonUtil.getLongListOrNull(SPLIT_OFFSETS, jsonNode);
    return DataFiles.builder(spec)
        .withPath(filePath)
        .withFormat(fileFormat)
        .withPartition(partitionData)
        .withFileSizeInBytes(fileSizeInBytes)
        .withMetrics(metrics)
        .withSplitOffsets(splitOffsets)
        .withEncryptionKeyMetadata(keyMetadata)
        .build();
  }

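  /** Writes the metrics fields of {@code contentFile} into the JSON object being generated. */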
  private static void metricsToJson(ContentFile<?> contentFile, JsonGenerator generator)
      throws IOException {
    generator.writeNumberField(RECORD_COUNT, contentFile.recordCount());

    if (contentFile.columnSizes() != null) {
      generator.writeFieldName(COLUMN_SIZES);
      SingleValueParser.toJson(DataFile.COLUMN_SIZES.type(), contentFile.columnSizes(), generator);
    }

    if (contentFile.valueCounts() != null) {
      generator.writeFieldName(VALUE_COUNTS);
      SingleValueParser.toJson(DataFile.VALUE_COUNTS.type(), contentFile.valueCounts(), generator);
    }

    if (contentFile.nullValueCounts() != null) {
      generator.writeFieldName(NULL_VALUE_COUNTS);
      SingleValueParser.toJson(
          DataFile.NULL_VALUE_COUNTS.type(), contentFile.nullValueCounts(), generator);
    }

    if (contentFile.nanValueCounts() != null) {
      generator.writeFieldName(NAN_VALUE_COUNTS);
      SingleValueParser.toJson(
          DataFile.NAN_VALUE_COUNTS.type(), contentFile.nanValueCounts(), generator);
    }

    if (contentFile.lowerBounds() != null) {
      generator.writeFieldName(LOWER_BOUNDS);
      SingleValueParser.toJson(DataFile.LOWER_BOUNDS.type(), contentFile.lowerBounds(), generator);
    }

    if (contentFile.upperBounds() != null) {
      generator.writeFieldName(UPPER_BOUNDS);
      SingleValueParser.toJson(DataFile.UPPER_BOUNDS.type(), contentFile.upperBounds(), generator);
    }
  }

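  /** Rebuilds {@link Metrics} from the metrics fields of a data file JSON entry. */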
  @SuppressWarnings("unchecked")
  private static Metrics metricsFromJson(JsonNode jsonNode) {
    long recordCount = JsonUtil.getLong(RECORD_COUNT, jsonNode);

    Map<Integer, Long> columnSizes = null;
    if (jsonNode.has(COLUMN_SIZES)) {
      columnSizes =
          (Map<Integer, Long>)
              SingleValueParser.fromJson(DataFile.COLUMN_SIZES.type(), jsonNode.get(COLUMN_SIZES));
    }

    Map<Integer, Long> valueCounts = null;
    if (jsonNode.has(VALUE_COUNTS)) {
      valueCounts =
          (Map<Integer, Long>)
              SingleValueParser.fromJson(DataFile.VALUE_COUNTS.type(), jsonNode.get(VALUE_COUNTS));
    }

    Map<Integer, Long> nullValueCounts = null;
    if (jsonNode.has(NULL_VALUE_COUNTS)) {
      nullValueCounts =
          (Map<Integer, Long>)
              SingleValueParser.fromJson(
                  DataFile.NULL_VALUE_COUNTS.type(), jsonNode.get(NULL_VALUE_COUNTS));
    }

    Map<Integer, Long> nanValueCounts = null;
    if (jsonNode.has(NAN_VALUE_COUNTS)) {
      nanValueCounts =
          (Map<Integer, Long>)
              SingleValueParser.fromJson(
                  DataFile.NAN_VALUE_COUNTS.type(), jsonNode.get(NAN_VALUE_COUNTS));
    }

    Map<Integer, ByteBuffer> lowerBounds = null;
    if (jsonNode.has(LOWER_BOUNDS)) {
      lowerBounds =
          (Map<Integer, ByteBuffer>)
              SingleValueParser.fromJson(DataFile.LOWER_BOUNDS.type(), jsonNode.get(LOWER_BOUNDS));
    }

    Map<Integer, ByteBuffer> upperBounds = null;
    if (jsonNode.has(UPPER_BOUNDS)) {
      upperBounds =
          (Map<Integer, ByteBuffer>)
              SingleValueParser.fromJson(DataFile.UPPER_BOUNDS.type(), jsonNode.get(UPPER_BOUNDS));
    }
    }

    return new Metrics(
        recordCount,
        columnSizes,
        valueCounts,
        nullValueCounts,
        nanValueCounts,
        lowerBounds,
        upperBounds);
  }

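  /** Returns true when a file carries a non-empty partition tuple. */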
  private static boolean hasPartitionData(StructLike partitionData) {
    return partitionData != null && partitionData.size() > 0;
  }
}
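
A minimal round-trip sketch, not part of the class above: it builds one unpartitioned data file entry, writes it to a local JSON manifest, and parses it back. The /tmp/manifest.json location and the s3:// data-file path are hypothetical placeholders; any Iceberg OutputFile/InputFile pair would work.

import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Files;
import org.apache.iceberg.PartitionSpec;
import software.amazon.glue.operations.IcebergDataManifestParser;

public class ManifestRoundTrip {
  public static void main(String[] args) throws Exception {
    // Unpartitioned spec; real tables would supply their own spec map keyed by spec id.
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<Integer, PartitionSpec> specs = Collections.singletonMap(spec.specId(), spec);

    DataFile dataFile =
        DataFiles.builder(spec)
            .withPath("s3://bucket/data/file-00000.parquet") // hypothetical data file
            .withFormat(FileFormat.PARQUET)
            .withFileSizeInBytes(1024L)
            .withRecordCount(100L)
            .build();

    // Write the manifest document, then parse it back with the same spec map.
    File manifest = new File("/tmp/manifest.json"); // hypothetical location
    IcebergDataManifestParser.writeDataFilesToJson(
        Files.localOutput(manifest), Collections.singletonList(dataFile), specs);
    List<DataFile> restored =
        IcebergDataManifestParser.readDataFilesFromJson(Files.localInput(manifest), specs);
    System.out.println(restored.get(0).path()); // s3://bucket/data/file-00000.parquet
  }
}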