/*
* Copyright 2021 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.cloud.bigtable.beam.validation;

import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;

import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.ListCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.values.KV;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Buffers the RangeHashes generated by {@link HadoopHashTableSource}. This is an optimization that
 * allows {@link ComputeAndValidateHashFromBigtableDoFn} to issue fewer ReadRows calls spanning
 * larger row ranges.
 *
 * <p>Hadoop HashTable output is sorted by row key, and each entry contains a row range and its
 * hash. Beam PCollections do not guarantee any ordering. To fetch a batch of ranges in one
 * ReadRows call, this source buffers the hashes and outputs a {@code List<RangeHash>} that
 * preserves the sorted order of the ranges.
 *
 * <p>Emits a batch of sorted RangeHashes keyed by the start key of the first range.
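 *
 * <p>A rough usage sketch; {@code pipeline} and {@code hashTableSource} below are assumed to be
 * defined by the caller and are illustrative only:
 *
 * <pre>{@code
 * PCollection<KV<String, List<RangeHash>>> batches =
 *     pipeline.apply(
 *         "ReadBufferedHashes", Read.from(new BufferedHadoopHashTableSource(hashTableSource)));
 * }</pre>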
*/
class BufferedHadoopHashTableSource extends BoundedSource<KV<String, List<RangeHash>>> {

  private static final long serialVersionUID = 39842743L;
  private static final int DEFAULT_BATCH_SIZE = 50;
  private static final Coder<KV<String, List<RangeHash>>> CODER =
      KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of()));

  // Max number of RangeHashes to buffer.
  private final int maxBufferSize;
  private final HadoopHashTableSource hashTableSource;

  public BufferedHadoopHashTableSource(HadoopHashTableSource source) {
    this(source, DEFAULT_BATCH_SIZE);
  }

  public BufferedHadoopHashTableSource(HadoopHashTableSource hashTableSource, int maxBufferSize) {
    this.hashTableSource = hashTableSource;
    this.maxBufferSize = maxBufferSize;
  }

  @Override
  public List<? extends BoundedSource<KV<String, List<RangeHash>>>> split(
      long desiredBundleSizeBytes, PipelineOptions options) throws IOException {
    @SuppressWarnings("unchecked")
    List<HadoopHashTableSource> splitHashTableSources =
        (List<HadoopHashTableSource>) hashTableSource.split(desiredBundleSizeBytes, options);
    List<BufferedHadoopHashTableSource> splitSources =
        new ArrayList<>(splitHashTableSources.size());
    // Keep the splits same as HashTableSource.
    for (HadoopHashTableSource splitHashTableSource : splitHashTableSources) {
      // Wrap each split so that every bundle emits batched RangeHashes.
      splitSources.add(new BufferedHadoopHashTableSource(splitHashTableSource));
    }
    return splitSources;
  }

  @Override
  public Coder<KV<String, List<RangeHash>>> getOutputCoder() {
    return CODER;
  }

  @Override
  public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
    // HashTable data files don't expose a method to estimate size or lineCount.
    return hashTableSource.getEstimatedSizeBytes(options);
  }

  @Override
  public BoundedReader<KV<String, List<RangeHash>>> createReader(PipelineOptions options)
      throws IOException {
    return new BufferedHashBasedReader(this, hashTableSource.createReader(options));
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof BufferedHadoopHashTableSource)) {
      return false;
    }
    BufferedHadoopHashTableSource that = (BufferedHadoopHashTableSource) o;
    return maxBufferSize == that.maxBufferSize
        && Objects.equal(hashTableSource, that.hashTableSource);
  }

  @Override
  public int hashCode() {
    return Objects.hashCode(maxBufferSize, hashTableSource);
  }

  @Override
  public String toString() {
    return "BufferedHadoopHashTableSource ["
        + immutableBytesToString(hashTableSource.startRowInclusive)
        + ", "
        + immutableBytesToString(hashTableSource.stopRowExclusive)
        + "), maxBufferSize="
        + maxBufferSize;
  }

  private static class BufferedHashBasedReader
      extends BoundedReader<KV<String, List<RangeHash>>> {

    private final BoundedReader<RangeHash> hashReader;
    private final BufferedHadoopHashTableSource source;
    private List<RangeHash> buffer;

    public BufferedHashBasedReader(
        BufferedHadoopHashTableSource source, BoundedReader<RangeHash> hashReader) {
      this.source = source;
      this.hashReader = hashReader;
      this.buffer = new ArrayList<>(source.maxBufferSize);
    }

    @Override
    public boolean start() throws IOException {
      if (!hashReader.start()) {
        // HashReader does not have any hashes, return empty reader.
        return false;
      }
      // Start returned true, consume the current RangeHash.
      buffer.add(hashReader.getCurrent());
      bufferRangeHashes();
      // Buffer is not empty, return true to consume the current buffer.
      return true;
    }

    // Reads from hashReader and buffers the RangeHashes.
    // Returns true if any RangeHashes were read from hashReader.
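    // For example (illustrative numbers): with the default maxBufferSize of 50, a reader over
    // 120 hashes emits batches of 50, 50, and 20 RangeHashes.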
    private boolean bufferRangeHashes() throws IOException {
      boolean readRangeHashes = false;
      while (buffer.size() < source.maxBufferSize && hashReader.advance()) {
        readRangeHashes = true;
        buffer.add(hashReader.getCurrent());
      }
      return readRangeHashes;
    }

    @Override
    public boolean advance() throws IOException {
      // Reset the buffer for next batch.
      buffer = new ArrayList<>(source.maxBufferSize);
      return bufferRangeHashes();
    }

    @Override
    public KV<String, List<RangeHash>> getCurrent() {
      // getCurrent only gets called when buffer is not empty.
      Preconditions.checkState(
          !buffer.isEmpty(), "getCurrent() should only be called when start/advance return true.");
      // The GroupBy key is a String and not ImmutableBytesWritable because the WritableCoder is
      // not deterministic. The output PCollection is grouped by the key and needs a deterministic
      // coder. Having a String key leads to an unfortunate double encoding: ImmutableBytesWritable
      // -> HEX string -> UTF-8 encoded string. The number of batches is significantly smaller than
      // the data fetched from Bigtable, so this should not have a meaningful impact on job
      // performance.
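      // For illustration with a hypothetical key: the bytes {0x00, 'k', 'e', 'y'} become the
      // string "\x00key" via Bytes.toStringBinary, which StringUtf8Coder then encodes as UTF-8.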
      return KV.of(Bytes.toStringBinary(buffer.get(0).startInclusive.copyBytes()), buffer);
    }

    @Override
    public void close() throws IOException {
      hashReader.close();
    }

    @Override
    public BoundedSource<KV<String, List<RangeHash>>> getCurrentSource() {
      return source;
    }
  }
}