// Source listing of org.apache.beam.sdk.extensions.smb.BucketMetadataUtil (artifact: scio-smb_2.13).
// Artifact description: Sort Merge Bucket source/sink implementations for Apache Beam.
/*
* Copyright 2019 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.beam.sdk.extensions.smb;
import static com.google.common.base.Verify.verify;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.BiFunction;
import org.apache.beam.sdk.extensions.smb.SMBFilenamePolicy.FileAssignment;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions;
public class BucketMetadataUtil {
private static final int BATCH_SIZE = 100;
private static final BucketMetadataUtil INSTANCE = new BucketMetadataUtil(BATCH_SIZE);
private final int batchSize;
public static BucketMetadataUtil get() {
return INSTANCE;
}
public static class SourceMetadataValue {
public final BucketMetadata, ?, V> metadata;
public final FileAssignment fileAssignment;
SourceMetadataValue(BucketMetadata, ?, V> metadata, FileAssignment fileAssignment) {
this.metadata = metadata;
this.fileAssignment = fileAssignment;
}
}
// just a wrapper class for clarity
public static class SourceMetadata {
public final Map> mapping;
SourceMetadata(Map> mapping) {
verify(!mapping.isEmpty());
this.mapping = mapping;
}
/** @return smallest number of buckets for this set of inputs. */
int leastNumBuckets() {
return mapping.values().stream().mapToInt(v -> v.metadata.getNumBuckets()).min().getAsInt();
}
}
//////////////////////////////////////////////////////////////////////////////
@VisibleForTesting
BucketMetadataUtil(int batchSize) {
this.batchSize = batchSize;
}
private Map> fetchMetadata(List directories) {
final int total = directories.size();
final Map> metadata = new ConcurrentHashMap<>();
int start = 0;
while (start < total) {
directories.stream()
.skip(start)
.limit(batchSize)
.parallel()
.forEach(dir -> metadata.put(dir, BucketMetadata.get(dir)));
start += batchSize;
}
return metadata;
}
private SourceMetadata getSourceMetadata(
Map>> directories,
BiFunction, BucketMetadata, ?, V>, Boolean>
compatibilityCompareFn) {
final Map> bucketMetadatas =
fetchMetadata(new ArrayList<>(directories.keySet()));
Preconditions.checkState(!bucketMetadatas.isEmpty(), "Failed to find metadata");
Map> mapping = new HashMap<>();
Map.Entry> first =
bucketMetadatas.entrySet().stream().findAny().get();
bucketMetadatas.forEach(
(dir, metadata) -> {
Preconditions.checkState(
metadata.isCompatibleWith(first.getValue())
&& compatibilityCompareFn.apply(metadata, first.getValue()),
"Incompatible partitions. Metadata %s is incompatible with metadata %s. %s != %s",
dir,
first.getKey(),
metadata,
first.getValue());
final FileAssignment fileAssignment =
new SMBFilenamePolicy(
dir, metadata.getFilenamePrefix(), directories.get(dir).getKey())
.forDestination();
mapping.put(dir, new SourceMetadataValue<>(metadata, fileAssignment));
});
return new SourceMetadata<>(mapping);
}
public SourceMetadata getPrimaryKeyedSourceMetadata(
Map>> directories) {
return getSourceMetadata(directories, BucketMetadata::isPartitionCompatibleForPrimaryKey);
}
public SourceMetadata getPrimaryAndSecondaryKeyedSourceMetadata(
Map>> directories) {
return getSourceMetadata(
directories, BucketMetadata::isPartitionCompatibleForPrimaryAndSecondaryKey);
}
}
// Hosting-site footer (not part of the source file): © 2015 - 2025 Weber Informatics LLC | Privacy Policy