/*
* Copyright 2021 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.cloud.bigtable.beam.validation;

import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;

import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.ListCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.values.KV;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Buffers the RangeHashes generated by {@link HadoopHashTableSource}. This is an optimization that
 * allows {@link ComputeAndValidateHashFromBigtableDoFn} to issue fewer ReadRows calls spanning
 * larger row ranges.
 *
 * <p>Hadoop HashTable output is sorted by row key, and each entry contains a row range and its
 * hash. Beam PCollections do not guarantee any ordering. To fetch a batch of ranges in one
 * ReadRows call, this source buffers the hashes and outputs a {@code List<RangeHash>} that
 * preserves the sorted order of the ranges.
 *
 * <p>Emits a batch of sorted RangeHashes keyed by the start key of the first range.
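 *
 * <p>A rough usage sketch; {@code pipeline} and {@code hashTableSource} below are assumed to be
 * defined by the caller and are illustrative only:
 *
 * <pre>{@code
 * PCollection<KV<String, List<RangeHash>>> batches =
 *     pipeline.apply(
 *         "ReadBufferedHashes", Read.from(new BufferedHadoopHashTableSource(hashTableSource)));
 * }</pre>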
*/
class BufferedHadoopHashTableSource extends BoundedSource<KV<String, List<RangeHash>>> {

  private static final long serialVersionUID = 39842743L;
  private static final int DEFAULT_BATCH_SIZE = 50;
  private static final Coder<KV<String, List<RangeHash>>> CODER =
      KvCoder.of(StringUtf8Coder.of(), ListCoder.of(RangeHashCoder.of()));

  // Max number of RangeHashes to buffer.
  private final int maxBufferSize;
  private final HadoopHashTableSource hashTableSource;

  public BufferedHadoopHashTableSource(HadoopHashTableSource source) {
    this(source, DEFAULT_BATCH_SIZE);
  }

  public BufferedHadoopHashTableSource(HadoopHashTableSource hashTableSource, int maxBufferSize) {
    this.hashTableSource = hashTableSource;
    this.maxBufferSize = maxBufferSize;
  }

  @Override
  public List<? extends BoundedSource<KV<String, List<RangeHash>>>> split(
      long desiredBundleSizeBytes, PipelineOptions options) throws IOException {
    @SuppressWarnings("unchecked")
    List<HadoopHashTableSource> splitHashTableSources =
        (List<HadoopHashTableSource>) hashTableSource.split(desiredBundleSizeBytes, options);
    List<BufferedHadoopHashTableSource> splitSources =
        new ArrayList<>(splitHashTableSources.size());
    // Keep the splits same as HashTableSource.
    for (HadoopHashTableSource splitHashTableSource : splitHashTableSources) {
      // Wrap each split so that every bundle emits batched RangeHashes.
      splitSources.add(new BufferedHadoopHashTableSource(splitHashTableSource));
    }
    return splitSources;
  }

  @Override
  public Coder<KV<String, List<RangeHash>>> getOutputCoder() {
    return CODER;
  }

  @Override
  public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
    // HashTable data files don't expose a method to estimate size or lineCount.
    return hashTableSource.getEstimatedSizeBytes(options);
  }

  @Override
  public BoundedReader<KV<String, List<RangeHash>>> createReader(PipelineOptions options)
      throws IOException {
    return new BufferedHashBasedReader(this, hashTableSource.createReader(options));
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof BufferedHadoopHashTableSource)) {
      return false;
    }
    BufferedHadoopHashTableSource that = (BufferedHadoopHashTableSource) o;
    return maxBufferSize == that.maxBufferSize
        && Objects.equal(hashTableSource, that.hashTableSource);
  }

  @Override
  public int hashCode() {
    return Objects.hashCode(maxBufferSize, hashTableSource);
  }

  @Override
  public String toString() {
    return "BufferedHadoopHashTableSource ["
        + immutableBytesToString(hashTableSource.startRowInclusive)
        + ", "
        + immutableBytesToString(hashTableSource.stopRowExclusive)
        + "), maxBufferSize="
        + maxBufferSize;
  }

  private static class BufferedHashBasedReader
      extends BoundedReader<KV<String, List<RangeHash>>> {

    private final BoundedReader<RangeHash> hashReader;
    private final BufferedHadoopHashTableSource source;
    private List<RangeHash> buffer;

    public BufferedHashBasedReader(
        BufferedHadoopHashTableSource source, BoundedReader<RangeHash> hashReader) {
      this.source = source;
      this.hashReader = hashReader;
      this.buffer = new ArrayList<>(source.maxBufferSize);
    }

    @Override
    public boolean start() throws IOException {
      if (!hashReader.start()) {
        // HashReader does not have any hashes, return empty reader.
        return false;
      }
      // Start returned true, consume the current RangeHash.
      buffer.add(hashReader.getCurrent());
      bufferRangeHashes();
      // Buffer is not empty, return true to consume the current buffer.
      return true;
    }

    // Reads from hashReader and buffers the RangeHashes.
    // Returns true if any RangeHashes were read from hashReader.
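    // For example (illustrative numbers): with the default maxBufferSize of 50, a reader over
    // 120 hashes emits batches of 50, 50, and 20 RangeHashes.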
    private boolean bufferRangeHashes() throws IOException {
      boolean readRangeHashes = false;
      while (buffer.size() < source.maxBufferSize && hashReader.advance()) {
        readRangeHashes = true;
        buffer.add(hashReader.getCurrent());
      }
      return readRangeHashes;
    }

    @Override
    public boolean advance() throws IOException {
      // Reset the buffer for next batch.
      buffer = new ArrayList<>(source.maxBufferSize);
      return bufferRangeHashes();
    }

    @Override
    public KV<String, List<RangeHash>> getCurrent() {
      // getCurrent only gets called when buffer is not empty.
      Preconditions.checkState(
          !buffer.isEmpty(), "getCurrent() should only be called when start/advance return true.");
      // The GroupBy key is a String and not ImmutableBytesWritable because the WritableCoder is
      // not deterministic. The output PCollection is grouped by the key and needs a deterministic
      // coder. Having a String key leads to an unfortunate double encoding: ImmutableBytesWritable
      // -> HEX string -> UTF-8 encoded string. The number of batches is significantly smaller than
      // the data fetched from Bigtable, so this should not have a meaningful impact on job
      // performance.
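      // For illustration with a hypothetical key: the bytes {0x00, 'k', 'e', 'y'} become the
      // string "\x00key" via Bytes.toStringBinary, which StringUtf8Coder then encodes as UTF-8.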
      return KV.of(Bytes.toStringBinary(buffer.get(0).startInclusive.copyBytes()), buffer);
    }

    @Override
    public void close() throws IOException {
      hashReader.close();
    }

    @Override
    public BoundedSource<KV<String, List<RangeHash>>> getCurrentSource() {
      return source;
    }
  }
}