// Source artifact: com.netease.arctic.utils.ArcticDataFiles (Maven / Gradle / Ivy, newest version).
package com.netease.arctic.utils;

import com.netease.arctic.shade.org.apache.iceberg.PartitionField;
import com.netease.arctic.shade.org.apache.iceberg.PartitionSpec;
import com.netease.arctic.shade.org.apache.iceberg.data.GenericRecord;
import com.netease.arctic.shade.org.apache.iceberg.expressions.Literal;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import com.netease.arctic.shade.org.apache.iceberg.types.Type;
import com.netease.arctic.shade.org.apache.iceberg.types.Types;

import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.DateTimeException;
import java.time.Instant;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.temporal.ChronoUnit;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.TimeZone;
import java.util.UUID;
import java.util.stream.Collectors;

/**
 * Helpers for converting Hive-style partition path strings (e.g. {@code "name=value/..."}) back
 * into Iceberg partition data records.
 */
public class ArcticDataFiles {
  /** The Unix epoch as a UTC {@link OffsetDateTime}; origin for month/hour transform ordinals. */
  public static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC);

  /**
   * @deprecated no longer used internally: {@link SimpleDateFormat} is not thread-safe, and the
   *     {@code hh} pattern is a 12-hour field while hour partition values are 24-hour. Kept only
   *     for backward compatibility with external callers.
   */
  @Deprecated
  public static final SimpleDateFormat SDF = new SimpleDateFormat("yyyy-MM-dd-hh");

  private static final int EPOCH_YEAR = EPOCH.getYear();
  // Hive's sentinel string for a null partition value.
  private static final String HIVE_NULL = "__HIVE_DEFAULT_PARTITION__";
  private static final String MONTH_TYPE = "month";
  private static final String HOUR_TYPE = "hour";

  /**
   * Returns the number of months between the epoch and a {@code yyyy-MM} partition string,
   * reverse of {@link TransformUtil#humanMonth}.
   *
   * @param dateStr partition value formatted as {@code yyyy-MM}
   * @return months since 1970-01 (January 1970 is 0)
   */
  public static Integer readMonthData(String dateStr) {
    String[] dateParts = dateStr.split("-", -1);
    int year = Integer.parseInt(dateParts[0]);
    int month = Integer.parseInt(dateParts[1]);
    return Math.multiplyExact((year - EPOCH_YEAR), 12) + month - 1;
  }

  /**
   * Returns the number of hours between the epoch and a {@code yyyy-MM-dd-HH} partition string,
   * reverse of {@link TransformUtil#humanHour}.
   *
   * <p>Parses the fields directly instead of going through the previously shared
   * {@link SimpleDateFormat}, which was not thread-safe and used the 12-hour {@code hh} pattern
   * for a 24-hour value.
   *
   * @param asString partition value formatted as {@code yyyy-MM-dd-HH} (24-hour, UTC)
   * @return hours since 1970-01-01T00:00 UTC
   * @throws UnsupportedOperationException if the string is not a valid hour partition value
   */
  private static Integer readHoursData(String asString) {
    try {
      String[] parts = asString.split("-", -1);
      Preconditions.checkArgument(parts.length == 4, "Invalid hour partition value: %s", asString);
      OffsetDateTime dateTime = OffsetDateTime.of(
          Integer.parseInt(parts[0]),
          Integer.parseInt(parts[1]),
          Integer.parseInt(parts[2]),
          Integer.parseInt(parts[3]),
          0, 0, 0, ZoneOffset.UTC);
      return Math.toIntExact(ChronoUnit.HOURS.between(EPOCH, dateTime));
    } catch (IllegalArgumentException | DateTimeException e) {
      // NumberFormatException (subclass of IllegalArgumentException) covers non-numeric parts;
      // DateTimeException covers out-of-range field values. Preserve the cause for diagnostics.
      throw new UnsupportedOperationException("Failed to parse date string: " + asString, e);
    }
  }

  /**
   * Converts a single partition value from its path-string form into the in-memory representation
   * matching {@code type}.
   *
   * @param field partition field the value belongs to (used to detect month/hour transforms)
   * @param type expected Iceberg type of the value
   * @param asString raw string from the partition path; {@code null} or the Hive null sentinel
   *                 yields {@code null}
   * @return parsed value, or {@code null} for null partitions
   * @throws UnsupportedOperationException if {@code type} is not supported
   */
  public static Object fromPartitionString(PartitionField field, Type type, String asString) {
    if (asString == null || HIVE_NULL.equals(asString)) {
      return null;
    }

    switch (type.typeId()) {
      case BOOLEAN:
        return Boolean.valueOf(asString);
      case INTEGER:
        // month/hour transforms render their ordinal as a human-readable date, not a raw int
        String transform = field.transform().toString();
        if (MONTH_TYPE.equals(transform)) {
          return readMonthData(asString);
        } else if (HOUR_TYPE.equals(transform)) {
          return readHoursData(asString);
        }
        return Integer.valueOf(asString);
      case STRING:
        return asString;
      case LONG:
        return Long.valueOf(asString);
      case FLOAT:
        return Float.valueOf(asString);
      case DOUBLE:
        return Double.valueOf(asString);
      case UUID:
        return UUID.fromString(asString);
      case FIXED:
        // pad or truncate the UTF-8 bytes to the declared fixed length
        Types.FixedType fixed = (Types.FixedType) type;
        return Arrays.copyOf(
            asString.getBytes(StandardCharsets.UTF_8), fixed.length());
      case BINARY:
        return asString.getBytes(StandardCharsets.UTF_8);
      case DECIMAL:
        return new BigDecimal(asString);
      case DATE:
        return Literal.of(asString).to(Types.DateType.get()).value();
      default:
        throw new UnsupportedOperationException(
            "Unsupported type for fromPartitionString: " + type);
    }
  }

  /**
   * Parses a partition path like {@code "a=1/b=2"} into a {@link GenericRecord} for {@code spec}.
   * Segments are matched positionally: the i-th segment's name must equal the i-th partition
   * field's name.
   *
   * @param spec partition spec describing the expected fields, in order
   * @param partitionPath slash-separated {@code name=value} segments
   * @return record with one parsed value per partition field
   * @throws IllegalArgumentException if the segment count or a segment name does not match
   */
  public static GenericRecord data(PartitionSpec spec, String partitionPath) {
    GenericRecord data = genericRecord(spec);
    String[] partitions = partitionPath.split("/", -1);
    Preconditions.checkArgument(partitions.length <= spec.fields().size(),
        "Invalid partition data, too many fields (expecting %s): %s",
        spec.fields().size(), partitionPath);
    Preconditions.checkArgument(partitions.length >= spec.fields().size(),
        "Invalid partition data, not enough fields (expecting %s): %s",
        spec.fields().size(), partitionPath);

    for (int i = 0; i < partitions.length; i += 1) {
      PartitionField field = spec.fields().get(i);
      String[] parts = partitions[i].split("=", 2);
      // each segment must be "name=value" and the name must match the field at this position
      Preconditions.checkArgument(parts.length == 2 &&
              parts[0] != null &&
              field.name().equals(parts[0]),
          "Invalid partition: %s", partitions[i]);

      data.set(i, ArcticDataFiles.fromPartitionString(field, spec.partitionType().fieldType(parts[0]), parts[1]));
    }

    return data;
  }

  /**
   * Builds an empty {@link GenericRecord} whose schema selects the source columns behind the
   * spec's partition fields. Derived partition-field names (e.g. {@code ts_month},
   * {@code id_bucket}, {@code s_trunc}) are mapped back to their source column names.
   *
   * <p>NOTE(review): assumes bucket/truncate field names carry the conventional {@code _bucket} /
   * {@code _trunc} suffix; a name without it would make {@code lastIndexOf} return -1 and
   * {@code substring} throw — same as the original behavior.
   */
  private static GenericRecord genericRecord(PartitionSpec spec) {
    List<String> sourceColumns = spec.fields().stream().map(s -> {
      String transform = s.transform().toString();
      if (transform.equals("identity")) {
        return s.name();
      } else if (s.name().endsWith("_" + transform)) {
        return s.name().substring(0, s.name().lastIndexOf("_" + transform));
      } else if (transform.contains("bucket")) {
        // e.g. transform "bucket[16]" with field name "<col>_bucket"
        return s.name().substring(0, s.name().lastIndexOf("_bucket"));
      } else if (transform.contains("truncate")) {
        // e.g. transform "truncate[4]" with field name "<col>_trunc"
        return s.name().substring(0, s.name().lastIndexOf("_trunc"));
      } else {
        return s.name();
      }
    }).collect(Collectors.toList());
    return GenericRecord.create(spec.schema().select(sourceColumns));
  }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy