
com.google.cloud.bigtable.beam.validation.BufferedHadoopHashTableSource

/*
 * Copyright 2021 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.bigtable.beam.validation;

import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;

import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.ListCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.values.KV;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Buffers the RangeHashes generated by {@link HadoopHashTableSource}. This is an optimization that
 * allows {@link ComputeAndValidateHashFromBigtableDoFn} to issue fewer ReadRows calls with larger
 * row ranges.
 *
 * <p>Hadoop HashTable output is sorted by row key and contains a row range and its hash. Beam
 * PCollections do not guarantee any ordering. To fetch a batch of ranges in one ReadRows
 * operation, this source buffers them and outputs a List, guaranteeing the sorted order of ranges.
 *
 * <p>Emits a batch of sorted RangeHashes keyed by the start key of the first range.
 */
class BufferedHadoopHashTableSource extends BoundedSource<KV<String, List<RangeHash>>> {

  private static final long serialVersionUID = 39842743L;

  private static final int DEFAULT_BATCH_SIZE = 50;

  private static final Coder<KV<String, List<RangeHash>>> CODER =
      KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of()));

  // Max number of RangeHashes to buffer.
  private final int maxBufferSize;

  private final HadoopHashTableSource hashTableSource;

  public BufferedHadoopHashTableSource(HadoopHashTableSource source) {
    this(source, DEFAULT_BATCH_SIZE);
  }

  public BufferedHadoopHashTableSource(HadoopHashTableSource hashTableSource, int maxBufferSize) {
    this.hashTableSource = hashTableSource;
    this.maxBufferSize = maxBufferSize;
  }

  @Override
  public List<? extends BoundedSource<KV<String, List<RangeHash>>>> split(
      long desiredBundleSizeBytes, PipelineOptions options) throws IOException {
    @SuppressWarnings("unchecked")
    List<HadoopHashTableSource> splitHashTableSources =
        (List<HadoopHashTableSource>) hashTableSource.split(desiredBundleSizeBytes, options);
    List<BufferedHadoopHashTableSource> splitSources =
        new ArrayList<>(splitHashTableSources.size());
    // Keep the splits same as HashTableSource.
    for (HadoopHashTableSource splitHashTableSource : splitHashTableSources) {
      // Add the last range for [lastPartition, stopRow).
      splitSources.add(new BufferedHadoopHashTableSource(splitHashTableSource));
    }
    return splitSources;
  }

  @Override
  public Coder<KV<String, List<RangeHash>>> getOutputCoder() {
    return CODER;
  }

  @Override
  public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
    // HashTable data files don't expose a method to estimate size or lineCount.
    return hashTableSource.getEstimatedSizeBytes(options);
  }

  @Override
  public BoundedReader<KV<String, List<RangeHash>>> createReader(PipelineOptions options)
      throws IOException {
    return new BufferedHashBasedReader(this, hashTableSource.createReader(options));
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof BufferedHadoopHashTableSource)) {
      return false;
    }
    BufferedHadoopHashTableSource that = (BufferedHadoopHashTableSource) o;
    return maxBufferSize == that.maxBufferSize
        && Objects.equal(hashTableSource, that.hashTableSource);
  }

  @Override
  public int hashCode() {
    return Objects.hashCode(maxBufferSize, hashTableSource);
  }

  @Override
  public String toString() {
    return "BufferedHadoopHashTableSource ["
        + immutableBytesToString(hashTableSource.startRowInclusive)
        + ", "
        + immutableBytesToString(hashTableSource.stopRowExclusive)
        + "), maxBufferSize="
        + maxBufferSize;
  }

  private static class BufferedHashBasedReader
      extends BoundedReader<KV<String, List<RangeHash>>> {

    private final BoundedReader<RangeHash> hashReader;
    private final BufferedHadoopHashTableSource source;

    private List<RangeHash> buffer;

    public BufferedHashBasedReader(
        BufferedHadoopHashTableSource source, BoundedReader<RangeHash> hashReader) {
      this.source = source;
      this.hashReader = hashReader;
      this.buffer = new ArrayList<>(source.maxBufferSize);
    }

    @Override
    public boolean start() throws IOException {
      if (!hashReader.start()) {
        // HashReader does not have any hashes, return empty reader.
        return false;
      }
      // Start returned true, consume the current RangeHash.
      buffer.add(hashReader.getCurrent());
      bufferRangeHashes();
      // Buffer is not empty, return true to consume the current buffer.
      return true;
    }

    // Reads from hashReader and buffers the RangeHashes.
    // Returns true if any RangeHashes were read from hashReader.
    private boolean bufferRangeHashes() throws IOException {
      boolean readRangeHashes = false;
      while (buffer.size() < source.maxBufferSize && hashReader.advance()) {
        readRangeHashes = true;
        buffer.add(hashReader.getCurrent());
      }
      return readRangeHashes;
    }

    @Override
    public boolean advance() throws IOException {
      // Reset the buffer for the next batch.
      buffer = new ArrayList<>(source.maxBufferSize);
      return bufferRangeHashes();
    }

    @Override
    public KV<String, List<RangeHash>> getCurrent() {
      // getCurrent only gets called when buffer is not empty.
      Preconditions.checkState(
          !buffer.isEmpty(), "getCurrent() should only be called when start/advance return true.");
      // The GroupBy key is a String and not ImmutableBytesWritable because the WritableCoder is
      // not deterministic. The output PCollection is grouped by the key and needs a deterministic
      // coder. Having a String key leads to an unfortunate double encoding,
      // ImmutableBytesWritable -> hex string -> UTF-8 encoded string. The number of batches is
      // significantly smaller than the data fetched from Bigtable and should not have a
      // meaningful impact on job performance.
      return KV.of(Bytes.toStringBinary(buffer.get(0).startInclusive.copyBytes()), buffer);
    }

    @Override
    public void close() throws IOException {
      hashReader.close();
    }

    @Override
    public BoundedSource<KV<String, List<RangeHash>>> getCurrentSource() {
      return source;
    }
  }
}
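
A minimal sketch of how this source might be wired into a Beam pipeline. It assumes an already-constructed HadoopHashTableSource (its constructor is not shown in this file), and the class name BufferedSourceUsageSketch and the helper createHashTableSource are hypothetical placeholders; because BufferedHadoopHashTableSource is package-private, the sketch lives in the same package.

package com.google.cloud.bigtable.beam.validation;

import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class BufferedSourceUsageSketch {

  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);

    // Hypothetical: obtain the underlying HadoopHashTableSource however the
    // surrounding pipeline does; its constructor arguments are omitted here.
    HadoopHashTableSource hashTableSource = createHashTableSource(options);

    // Wrap it so downstream DoFns receive sorted batches of up to 50 RangeHashes
    // (DEFAULT_BATCH_SIZE), keyed by the start key of the first range in the batch.
    PCollection<KV<String, List<RangeHash>>> batchedHashes =
        pipeline.apply(
            "ReadHashTableBatches", Read.from(new BufferedHadoopHashTableSource(hashTableSource)));

    // batchedHashes would typically feed ComputeAndValidateHashFromBigtableDoFn,
    // which can then issue one ReadRows call per batch instead of per range.
    pipeline.run().waitUntilFinish();
  }

  // Placeholder for constructing the underlying source; not part of this file.
  private static HadoopHashTableSource createHashTableSource(PipelineOptions options) {
    throw new UnsupportedOperationException("construct HadoopHashTableSource here");
  }
}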




