All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.beam.sdk.extensions.smb.BucketMetadataUtil Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2019 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.beam.sdk.extensions.smb;

import static com.google.common.base.Verify.verify;

import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.BiFunction;
import org.apache.beam.sdk.extensions.smb.SMBFilenamePolicy.FileAssignment;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions;

public class BucketMetadataUtil {
  private static final int BATCH_SIZE = 100;
  private static final BucketMetadataUtil INSTANCE = new BucketMetadataUtil(BATCH_SIZE);
  private final int batchSize;

  public static BucketMetadataUtil get() {
    return INSTANCE;
  }

  public static class SourceMetadataValue {
    public final BucketMetadata metadata;
    public final FileAssignment fileAssignment;

    SourceMetadataValue(BucketMetadata metadata, FileAssignment fileAssignment) {
      this.metadata = metadata;
      this.fileAssignment = fileAssignment;
    }
  }

  // just a wrapper class for clarity
  public static class SourceMetadata {
    public final Map> mapping;

    SourceMetadata(Map> mapping) {
      verify(!mapping.isEmpty());
      this.mapping = mapping;
    }

    /** @return smallest number of buckets for this set of inputs. */
    int leastNumBuckets() {
      return mapping.values().stream().mapToInt(v -> v.metadata.getNumBuckets()).min().getAsInt();
    }
  }

  //////////////////////////////////////////////////////////////////////////////

  @VisibleForTesting
  BucketMetadataUtil(int batchSize) {
    this.batchSize = batchSize;
  }

  private  Map> fetchMetadata(List directories) {
    final int total = directories.size();
    final Map> metadata = new ConcurrentHashMap<>();
    int start = 0;
    while (start < total) {
      directories.stream()
          .skip(start)
          .limit(batchSize)
          .parallel()
          .forEach(dir -> metadata.put(dir, BucketMetadata.get(dir)));
      start += batchSize;
    }
    return metadata;
  }

  private  SourceMetadata getSourceMetadata(
      Map>> directories,
      BiFunction, BucketMetadata, Boolean>
          compatibilityCompareFn) {
    final Map> bucketMetadatas =
        fetchMetadata(new ArrayList<>(directories.keySet()));
    Preconditions.checkState(!bucketMetadatas.isEmpty(), "Failed to find metadata");

    Map> mapping = new HashMap<>();
    Map.Entry> first =
        bucketMetadatas.entrySet().stream().findAny().get();
    bucketMetadatas.forEach(
        (dir, metadata) -> {
          Preconditions.checkState(
              metadata.isCompatibleWith(first.getValue())
                  && compatibilityCompareFn.apply(metadata, first.getValue()),
              "Incompatible partitions. Metadata %s is incompatible with metadata %s. %s != %s",
              dir,
              first.getKey(),
              metadata,
              first.getValue());
          final FileAssignment fileAssignment =
              new SMBFilenamePolicy(
                      dir, metadata.getFilenamePrefix(), directories.get(dir).getKey())
                  .forDestination();
          mapping.put(dir, new SourceMetadataValue<>(metadata, fileAssignment));
        });
    return new SourceMetadata<>(mapping);
  }

  public  SourceMetadata getPrimaryKeyedSourceMetadata(
      Map>> directories) {
    return getSourceMetadata(directories, BucketMetadata::isPartitionCompatibleForPrimaryKey);
  }

  public  SourceMetadata getPrimaryAndSecondaryKeyedSourceMetadata(
      Map>> directories) {
    return getSourceMetadata(
        directories, BucketMetadata::isPartitionCompatibleForPrimaryAndSecondaryKey);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy