All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.iceberg.flink.sink.shuffle.DataStatisticsSerializer Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.flink.sink.shuffle;

import java.io.IOException;
import java.util.Map;
import java.util.Objects;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.sampling.ReservoirItemsSketch;
import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot;
import org.apache.flink.api.common.typeutils.base.EnumSerializer;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.api.common.typeutils.base.MapSerializer;
import org.apache.flink.core.memory.DataInputView;
import org.apache.flink.core.memory.DataOutputView;
import org.apache.iceberg.SortKey;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;

@Internal
class DataStatisticsSerializer extends TypeSerializer {
  private final TypeSerializer sortKeySerializer;
  private final EnumSerializer statisticsTypeSerializer;
  private final MapSerializer mapSerializer;
  private final SortKeySketchSerializer sketchSerializer;

  DataStatisticsSerializer(TypeSerializer sortKeySerializer) {
    this.sortKeySerializer = sortKeySerializer;
    this.statisticsTypeSerializer = new EnumSerializer<>(StatisticsType.class);
    this.mapSerializer = new MapSerializer<>(sortKeySerializer, LongSerializer.INSTANCE);
    this.sketchSerializer = new SortKeySketchSerializer(sortKeySerializer);
  }

  @Override
  public boolean isImmutableType() {
    return false;
  }

  @SuppressWarnings("ReferenceEquality")
  @Override
  public TypeSerializer duplicate() {
    TypeSerializer duplicateSortKeySerializer = sortKeySerializer.duplicate();
    return (duplicateSortKeySerializer == sortKeySerializer)
        ? this
        : new DataStatisticsSerializer(duplicateSortKeySerializer);
  }

  @Override
  public DataStatistics createInstance() {
    return new MapDataStatistics();
  }

  @SuppressWarnings("unchecked")
  @Override
  public DataStatistics copy(DataStatistics obj) {
    StatisticsType statisticsType = obj.type();
    if (statisticsType == StatisticsType.Map) {
      MapDataStatistics from = (MapDataStatistics) obj;
      Map fromStats = (Map) from.result();
      Map toStats = Maps.newHashMap(fromStats);
      return new MapDataStatistics(toStats);
    } else if (statisticsType == StatisticsType.Sketch) {
      // because ReservoirItemsSketch doesn't expose enough public methods for cloning,
      // this implementation adopted the less efficient serialization and deserialization.
      SketchDataStatistics from = (SketchDataStatistics) obj;
      ReservoirItemsSketch fromStats = (ReservoirItemsSketch) from.result();
      byte[] bytes = fromStats.toByteArray(sketchSerializer);
      Memory memory = Memory.wrap(bytes);
      ReservoirItemsSketch toStats =
          ReservoirItemsSketch.heapify(memory, sketchSerializer);
      return new SketchDataStatistics(toStats);
    } else {
      throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType);
    }
  }

  @Override
  public DataStatistics copy(DataStatistics from, DataStatistics reuse) {
    // not much benefit to reuse
    return copy(from);
  }

  @Override
  public int getLength() {
    return -1;
  }

  @SuppressWarnings("unchecked")
  @Override
  public void serialize(DataStatistics obj, DataOutputView target) throws IOException {
    StatisticsType statisticsType = obj.type();
    statisticsTypeSerializer.serialize(obj.type(), target);
    if (statisticsType == StatisticsType.Map) {
      Map mapStatistics = (Map) obj.result();
      mapSerializer.serialize(mapStatistics, target);
    } else if (statisticsType == StatisticsType.Sketch) {
      ReservoirItemsSketch sketch = (ReservoirItemsSketch) obj.result();
      byte[] sketchBytes = sketch.toByteArray(sketchSerializer);
      target.writeInt(sketchBytes.length);
      target.write(sketchBytes);
    } else {
      throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType);
    }
  }

  @Override
  public DataStatistics deserialize(DataInputView source) throws IOException {
    StatisticsType statisticsType = statisticsTypeSerializer.deserialize(source);
    if (statisticsType == StatisticsType.Map) {
      Map mapStatistics = mapSerializer.deserialize(source);
      return new MapDataStatistics(mapStatistics);
    } else if (statisticsType == StatisticsType.Sketch) {
      int numBytes = source.readInt();
      byte[] sketchBytes = new byte[numBytes];
      source.read(sketchBytes);
      Memory sketchMemory = Memory.wrap(sketchBytes);
      ReservoirItemsSketch sketch =
          ReservoirItemsSketch.heapify(sketchMemory, sketchSerializer);
      return new SketchDataStatistics(sketch);
    } else {
      throw new IllegalArgumentException("Unsupported data statistics type: " + statisticsType);
    }
  }

  @Override
  public DataStatistics deserialize(DataStatistics reuse, DataInputView source) throws IOException {
    // not much benefit to reuse
    return deserialize(source);
  }

  @Override
  public void copy(DataInputView source, DataOutputView target) throws IOException {
    serialize(deserialize(source), target);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof DataStatisticsSerializer)) {
      return false;
    }

    DataStatisticsSerializer other = (DataStatisticsSerializer) obj;
    return Objects.equals(sortKeySerializer, other.sortKeySerializer);
  }

  @Override
  public int hashCode() {
    return sortKeySerializer.hashCode();
  }

  @Override
  public TypeSerializerSnapshot snapshotConfiguration() {
    return new DataStatisticsSerializerSnapshot(this);
  }

  public static class DataStatisticsSerializerSnapshot
      extends CompositeTypeSerializerSnapshot {
    private static final int CURRENT_VERSION = 1;

    /** Constructor for read instantiation. */
    @SuppressWarnings({"unused", "checkstyle:RedundantModifier"})
    public DataStatisticsSerializerSnapshot() {}

    @SuppressWarnings("checkstyle:RedundantModifier")
    public DataStatisticsSerializerSnapshot(DataStatisticsSerializer serializer) {
      super(serializer);
    }

    @Override
    protected int getCurrentOuterSnapshotVersion() {
      return CURRENT_VERSION;
    }

    @Override
    protected TypeSerializer[] getNestedSerializers(DataStatisticsSerializer outerSerializer) {
      return new TypeSerializer[] {outerSerializer.sortKeySerializer};
    }

    @Override
    protected DataStatisticsSerializer createOuterSerializerWithNestedSerializers(
        TypeSerializer[] nestedSerializers) {
      SortKeySerializer sortKeySerializer = (SortKeySerializer) nestedSerializers[0];
      return new DataStatisticsSerializer(sortKeySerializer);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy