/*
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.bigtable.beam.validation;
import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;
import com.google.bigtable.repackaged.com.google.common.base.Preconditions;
import com.google.bigtable.repackaged.com.google.common.collect.Lists;
import com.google.cloud.bigtable.beam.AbstractCloudBigtableTableDoFn;
import com.google.cloud.bigtable.beam.CloudBigtableConfiguration;
import com.google.cloud.bigtable.beam.TemplateUtils;
import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
import com.google.cloud.bigtable.beam.validation.SyncTableJob.SyncTableOptions;
import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.BigtableTableHashAccessor.BigtableResultHasher;
/**
* A {@link DoFn} that takes row ranges and their hashes produced by HBase's HashTable job and
* validates each hash against the corresponding rows read from Cloud Bigtable.
*/
class ComputeAndValidateHashFromBigtableDoFn
extends AbstractCloudBigtableTableDoFn<KV<String, Iterable<List<RangeHash>>>, RangeHash> {
private static final long serialVersionUID = 2349094L;
private final ValueProvider<String> tableName;
private final ValueProvider<String> projectId;
private final ValueProvider<String> sourceHashDir;
private final TableHashWrapperFactory tableHashWrapperFactory;
// Counters for reporting matched and mismatched ranges. The names mirror those of the HBase
// sync-table job.
private final Counter matches = Metrics.counter("cbt-dataflow-validate", "ranges_matched");
private final Counter mismatches = Metrics.counter("cbt-dataflow-validate", "ranges_not_matched");
public ComputeAndValidateHashFromBigtableDoFn(SyncTableOptions options) {
super(TemplateUtils.buildSyncTableConfig(options));
// Create local copies of the ValueProviders; PipelineOptions are not serializable.
this.tableName = options.getBigtableTableId();
projectId = options.getBigtableProject();
sourceHashDir = options.getHashTableOutputDir();
tableHashWrapperFactory = new TableHashWrapperFactory();
}
@VisibleForTesting
ComputeAndValidateHashFromBigtableDoFn(
CloudBigtableConfiguration config,
ValueProvider<String> tableName,
ValueProvider<String> projectId,
ValueProvider<String> sourceHashDir,
TableHashWrapperFactory factory) {
super(config);
this.tableName = tableName;
this.tableHashWrapperFactory = factory;
this.sourceHashDir = sourceHashDir;
this.projectId = projectId;
}
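/**
* Validates one work item. The element's value holds a single batch of sorted {@link RangeHash}es
* keyed by the batch's start key. All the ranges are read from Cloud Bigtable in one scan; a hash
* is computed per range and compared to the hash recorded by HBase's HashTable job, and mismatched
* ranges are emitted to the output.
*/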
@ProcessElement
public void processElement(ProcessContext context) throws Exception {
List<List<RangeHash>> wrappedRangeHashes = Lists.newArrayList(context.element().getValue());
// BufferedHadoopHashTableSource emits exactly one value per GroupBy key; the key is the start
// key of the sorted ranges.
Preconditions.checkState(
wrappedRangeHashes.size() == 1, "Cannot have multiple entries for a key");
List<RangeHash> rangeHashes = wrappedRangeHashes.get(0);
Preconditions.checkState(!rangeHashes.isEmpty(), "Cannot have empty ranges in DoFn");
// If a metric is never incremented, it is entirely absent from the reported metrics (as opposed
// to being 0). Logging a 0 value guarantees that both counters show up in the Dataflow UI.
mismatches.inc(0);
matches.inc(0);
ImmutableBytesWritable rangeStartInclusive = rangeHashes.get(0).startInclusive;
ImmutableBytesWritable rangeEndExclusive =
rangeHashes.get(rangeHashes.size() - 1).stopExclusive;
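// Hasher that reproduces the HashTable job's per-range hashing over the scanned results.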
BigtableResultHasher resultHasher = new BigtableResultHasher();
resultHasher.startBatch(rangeStartInclusive);
// Since all the row ranges in HashTable's data files are sorted, one big scan can read all of
// them. Parallelism comes from splitting the HashTable data files into smaller bundles of row
// ranges in the GroupBy.
ResultScanner scanner =
createBigtableScan(rangeStartInclusive.copyBytes(), rangeEndExclusive.copyBytes());
Iterator<RangeHash> rangeHashIterator = rangeHashes.iterator();
long numRows = 0;
RangeHash currentRangeHash = rangeHashIterator.next();
// Process each row and validate hashes
for (Result result : scanner) {
numRows++;
if (numRows % 10_000 == 0) {
// Heartbeat in the logs in case a large scan gets hung.
DOFN_LOG.debug("Processed {} rows.", numRows);
}
ImmutableBytesWritable rowKey = new ImmutableBytesWritable(result.getRow());
// Check whether the rowKey belongs to the current range; if not, keep iterating through the
// rangeHashes until the range containing rowKey is found.
while (!isWithinUpperBound(currentRangeHash.stopExclusive, rowKey)) {
validateBatchHash(context, resultHasher, currentRangeHash);
// THIS SHOULD NEVER HAPPEN. Bigtable is scanned only up to the last
// RangeHash.stopExclusive, so Bigtable's results should never outlast the rangeHashes.
Preconditions.checkState(
rangeHashIterator.hasNext(),
"Buffer reached the end while the scan is still active at row: %s. "
+ "Affected range: [%s, %s).",
immutableBytesToString(result.getRow()),
immutableBytesToString(rangeStartInclusive),
immutableBytesToString(rangeEndExclusive));
currentRangeHash = rangeHashIterator.next();
}
// Always hash the current row.
resultHasher.hashResult(result);
}
// The Bigtable scan is finished at this point, but rangeHashes may contain additional row
// ranges. The last range is always unverified here: its end is exclusive, so
// rowKey >= stopExclusive is never true inside the loop. Verify the last range now.
validateBatchHash(context, resultHasher, currentRangeHash);
// Any remaining ranges in rangeHashes must be reported as mismatched, since there is nothing
// in Cloud Bigtable for those row ranges.
while (rangeHashIterator.hasNext()) {
currentRangeHash = rangeHashIterator.next();
reportMismatch(context, currentRangeHash);
}
DOFN_LOG.debug(
"Finishing context by outputting {} keys in range [{}, {}).",
rangeHashes.size(),
immutableBytesToString(rangeStartInclusive),
immutableBytesToString(rangeEndExclusive));
}
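/**
* Opens a scan over Cloud Bigtable covering [startKeyInclusive, stopKeyExclusive). The scan is
* derived from the HashTable job's scan so that both sides hash the same slice of data.
*/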
private ResultScanner createBigtableScan(byte[] startKeyInclusive, byte[] stopKeyExclusive)
throws IOException {
Table table = getConnection().getTable(TableName.valueOf(tableName.get()));
// Get the scan from TableHash. HashTable may have hashed only a subset of the table (selected
// column families, a timestamp range, maxVersions, etc.); reusing its scan fetches the same
// subset from Cloud Bigtable for comparison.
TableHashWrapper tableHash =
tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());
Scan scan = tableHash.getScan();
// Set the work item boundaries on the scan.
if (startKeyInclusive.length > 0) {
scan.withStartRow(startKeyInclusive, true);
}
if (stopKeyExclusive.length > 0) {
scan.withStopRow(stopKeyExclusive, false);
}
return table.getScanner(scan);
}
/**
* Determines whether row < stopExclusive for a row range [start, stopExclusive). An empty
* stopExclusive represents a range with no upper bound.
*/
private static boolean isWithinUpperBound(
ImmutableBytesWritable stopExclusive, ImmutableBytesWritable row) {
return stopExclusive.equals(HConstants.EMPTY_END_ROW) || row.compareTo(stopExclusive) < 0;
}
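/**
* Finishes the current hash batch, compares its hash to the expected hash for
* {@code currentRangeHash}, updates the match/mismatch counters, and starts a new batch at the
* range's stopExclusive.
*/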
private void validateBatchHash(
ProcessContext context, BigtableResultHasher resultHasher, RangeHash currentRangeHash) {
// The batch is always started, so it's safe to finish it. If there were no rows, we get the
// hash of an empty batch.
resultHasher.finishBatch();
if (!resultHasher.getBatchHash().equals(currentRangeHash.hash)) {
reportMismatch(context, currentRangeHash);
} else {
matches.inc();
}
// Start a new batch
resultHasher.startBatch(currentRangeHash.stopExclusive);
}
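/** Increments the mismatch counter, logs the mismatched range, and emits it to the output. */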
private void reportMismatch(ProcessContext context, RangeHash currentRangeHash) {
mismatches.inc();
DOFN_LOG.info(
"MISMATCH ON RANGE [{}, {}).",
immutableBytesToString(currentRangeHash.startInclusive),
immutableBytesToString(currentRangeHash.stopExclusive));
context.output(currentRangeHash);
}
}