All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.cloud.bigtable.beam.validation.HadoopHashTableSource Maven / Gradle / Ivy

/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.bigtable.beam.validation;

import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;

import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
import com.google.bigtable.repackaged.com.google.common.annotations.VisibleForTesting;
import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
import com.google.cloud.bigtable.beam.validation.TableHashWrapper.TableHashReader;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nullable;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;

/**
 * A beam source to read output of Hadoop HashTable job. The source creates 1 workitem per HashTable
 * data file and emits a row-range/hash pair.
 */
@InternalApi
public class HadoopHashTableSource extends BoundedSource implements Serializable {

  private static final long serialVersionUID = 2383724L;

  private static final Coder CODER = RangeHashCoder.of();

  /**
   * A simple POJO encapsulating a row range and the corresponding hash generated by HashTable job.
   * TODO Evaluate if we can use AutoValue for this class.
   */
  @DefaultCoder(RangeHashCoder.class)
  public static class RangeHash {

    public final ImmutableBytesWritable startInclusive;
    public final ImmutableBytesWritable stopExclusive;
    public final ImmutableBytesWritable hash;

    private RangeHash(
        ImmutableBytesWritable startInclusive,
        ImmutableBytesWritable stopExclusive,
        ImmutableBytesWritable hash) {
      this.startInclusive = startInclusive;
      this.stopExclusive = stopExclusive;
      this.hash = hash;
    }

    static RangeHash of(
        ImmutableBytesWritable startInclusive,
        ImmutableBytesWritable stopExclusive,
        ImmutableBytesWritable hash) {
      Preconditions.checkNotNull(startInclusive);
      Preconditions.checkNotNull(stopExclusive);
      Preconditions.checkNotNull(hash);
      return new RangeHash(startInclusive, stopExclusive, hash);
    }

    @Override
    public String toString() {
      return String.format(
          "RangeHash{ range = [ %s, %s), hash: %s }",
          immutableBytesToString(startInclusive),
          immutableBytesToString(stopExclusive),
          immutableBytesToString(hash));
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (!(o instanceof RangeHash)) {
        return false;
      }
      RangeHash rangeHash = (RangeHash) o;
      return Objects.equal(startInclusive, rangeHash.startInclusive)
          && Objects.equal(stopExclusive, rangeHash.stopExclusive)
          && Objects.equal(hash, rangeHash.hash);
    }

    @Override
    public int hashCode() {
      return Objects.hashCode(startInclusive, stopExclusive, hash);
    }
  }

  public static final Log LOG = LogFactory.getLog(HadoopHashTableSource.class);

  private final ValueProvider projectId;

  // Path to the output of HashTable job. Usually in GCS.
  private final ValueProvider sourceHashDir;

  // Row range owned by this source.
  // The Start and Stop row are serialized in a custom way.
  @VisibleForTesting @Nullable transient ImmutableBytesWritable startRowInclusive;

  @VisibleForTesting @Nullable transient ImmutableBytesWritable stopRowExclusive;

  private final TableHashWrapperFactory tableHashWrapperFactory;

  /**
   * Creates a HadoopHashTableSource that reads HashTable data from hashTableOutputDir in GCS bucket
   * in project $(projectId).
   */
  public HadoopHashTableSource(
      ValueProvider projectId, ValueProvider sourceHashDir) {
    this(projectId, sourceHashDir, /*startRowInclusive*/ null, /*stopRowExclusive*/ null);
  }

  /**
   * Constructor to initialize a HadoopHashTableSource for a given row-range. Used for creating
   * split sources.
   */
  @VisibleForTesting
  HadoopHashTableSource(
      ValueProvider projectId,
      ValueProvider sourceHashDir,
      @Nullable ImmutableBytesWritable startRowInclusive,
      @Nullable ImmutableBytesWritable stopRowExclusive) {
    this(
        projectId,
        sourceHashDir,
        startRowInclusive,
        stopRowExclusive,
        new TableHashWrapperFactory());
  }

  @VisibleForTesting
  HadoopHashTableSource(
      ValueProvider projectId,
      ValueProvider hadoopHashTableOutputDir,
      @Nullable ImmutableBytesWritable startRowInclusive,
      @Nullable ImmutableBytesWritable stopRowExclusive,
      TableHashWrapperFactory tableHashWrapperFactory) {
    this.projectId = projectId;
    this.sourceHashDir = hadoopHashTableOutputDir;
    // startRow and stopRow will be null when the template is initialized. startRow and stopRow are
    // read from the hashTableOutputDir, which is only available at pipeline runtime.
    this.startRowInclusive = startRowInclusive;
    this.stopRowExclusive = stopRowExclusive;
    this.tableHashWrapperFactory = tableHashWrapperFactory;
  }

  @Override
  public List> split(
      long desiredBundleSizeBytes, PipelineOptions options) throws IOException {
    // This method relies on the partitioning done by HBase-HashTable job. There is a possibility
    // of stragglers. SyncTable handles it by using a group by and further splitting workitems.
    TableHashWrapper hash =
        tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());

    ImmutableList partitions = hash.getPartitions();
    int numPartitions = partitions.size();

    List splitSources = new ArrayList<>(numPartitions + 1);
    if (numPartitions == 0) {
      // There are 0 partitions and 1 hashfile, return single source with full key range.
      splitSources.add(
          new HadoopHashTableSource(
              projectId,
              sourceHashDir,
              hash.getStartRow(),
              hash.getStopRow(),
              tableHashWrapperFactory));
      return splitSources;
    }

    // Use the HashTable start key. The value is HConstants.EMPTY_START_ROW for full table scan.
    ImmutableBytesWritable nextStartRow = hash.getStartRow();
    ImmutableBytesWritable stopRow = hash.getStopRow();

    // The output of HashTable is organized as partition file and a set of datafiles.
    // Partition file contains a list of partitions, these partitions split the key-range of a table
    // into roughly equal row-ranges and hashes for these row-ranges are stored in a single
    // datafile.
    //
    // There are always numPartitions +1 data files. Datafile(i) covers hashes for [partition{i-1},
    // partition{i}).
    // So a partition file containing entries [b,f] for a table with row range [a,z] will have 3
    // data files containing hashes.
    // file0 will contain [a(nextStartRow), b), file1 will contain [b,f), and file3 will contain
    // [f,z(stopRow))
    for (int i = 0; i < numPartitions; i++) {
      // TODO make a utility function that generates [start, end) format from start/end.
      LOG.debug(
          "Adding: ["
              + immutableBytesToString(nextStartRow.get())
              + ", "
              + immutableBytesToString(partitions.get(i).get())
              + ")");
      splitSources.add(
          new HadoopHashTableSource(
              projectId, sourceHashDir, nextStartRow, partitions.get(i), tableHashWrapperFactory));
      nextStartRow = partitions.get(i);
    }
    // Add the last range for [lastPartition, stopRow).
    LOG.debug(
        "Adding: ["
            + immutableBytesToString(nextStartRow.get())
            + ", "
            + immutableBytesToString(stopRow.get())
            + ")");
    // Add the last range for [lastPartition, stopRow).
    splitSources.add(
        new HadoopHashTableSource(
            projectId, sourceHashDir, nextStartRow, stopRow, tableHashWrapperFactory));
    LOG.info("Returning " + splitSources.size() + " sources from " + numPartitions + " partitions");
    return splitSources;
  }

  @Override
  public Coder getOutputCoder() {
    return CODER;
  }

  @Override
  public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
    // HashTable data files don't expose a method to estimate size or lineCount.
    return 0;
  }

  @Override
  public BoundedReader createReader(PipelineOptions options) throws IOException {
    TableHashWrapper hash =
        tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());

    // The row range for an un-split source is determined from the output of HashTable job.
    // HashTableOutputDir is a runtime parameter and hence not available at construction time, so
    // populate the start and stop here.
    if (startRowInclusive == null || stopRowExclusive == null) {
      startRowInclusive = hash.getStartRow();
      stopRowExclusive = hash.getStopRow();
    }

    return new HashBasedReader(
        this,
        startRowInclusive,
        stopRowExclusive,
        hash.newReader(
            SyncTableUtils.createConfiguration(this.projectId.get(), this.sourceHashDir.get()),
            startRowInclusive));
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof HadoopHashTableSource)) {
      return false;
    }
    HadoopHashTableSource that = (HadoopHashTableSource) o;
    return Objects.equal(projectId, that.projectId)
        && Objects.equal(sourceHashDir, that.sourceHashDir)
        && Objects.equal(startRowInclusive, that.startRowInclusive)
        && Objects.equal(stopRowExclusive, that.stopRowExclusive);
  }

  @Override
  public int hashCode() {
    return Objects.hashCode(projectId, sourceHashDir, startRowInclusive, stopRowExclusive);
  }

  @Override
  public String toString() {
    return "HadoopHashTableSource ["
        + immutableBytesToString(startRowInclusive)
        + ", "
        + immutableBytesToString(stopRowExclusive)
        + ')';
  }

  private void writeObject(ObjectOutputStream s) throws IOException {
    s.defaultWriteObject();
    // Start and Stop can be null, write a boolean to indicate if start/stop is expected.
    if (startRowInclusive == null) {
      s.writeBoolean(false);
    } else {
      s.writeBoolean(true);
      s.writeObject(startRowInclusive.copyBytes());
    }

    if (stopRowExclusive == null) {
      s.writeBoolean(false);
    } else {
      s.writeBoolean(true);
      s.writeObject(stopRowExclusive.copyBytes());
    }
  }

  private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
    s.defaultReadObject();
    // start/stop can be null, they are preceded by a boolean indicating their presence.
    if (s.readBoolean() == true) {
      startRowInclusive = new ImmutableBytesWritable((byte[]) s.readObject());
    }
    if (s.readBoolean() == true) {
      stopRowExclusive = new ImmutableBytesWritable((byte[]) s.readObject());
    }
  }

  @VisibleForTesting
  static class HashBasedReader extends BoundedReader {

    private final HadoopHashTableSource source;
    private final TableHashReader reader;

    @VisibleForTesting final ImmutableBytesWritable startRowInclusive;
    @VisibleForTesting final ImmutableBytesWritable stopRowExclusive;

    // Flag indicating that this workitem is finished.
    private boolean isDone = false;
    private ImmutableBytesWritable currentRangeStartKey;
    // Hash for the current range.
    private ImmutableBytesWritable currentHash;
    private RangeHash currentRangeHash;

    public HashBasedReader(
        HadoopHashTableSource source,
        ImmutableBytesWritable startRowInclusive,
        ImmutableBytesWritable stopRowExclusive,
        TableHashReader reader) {
      this.source = source;
      this.startRowInclusive = startRowInclusive;
      this.stopRowExclusive = stopRowExclusive;
      this.reader = reader;
    }

    @Override
    public boolean start() throws IOException {
      LOG.debug(
          "Starting a new reader at key range ["
              + immutableBytesToString(startRowInclusive)
              + " ,"
              + immutableBytesToString(stopRowExclusive)
              + ").");

      if (readNextKey()) {
        // Dataflow calls start, followed by getCurrent. HashBased reader needs to read on TableHash
        // twice to return a RangeHash since it specifies both range-start and range-end.
        advance();
        return true;
      }

      isDone = true;
      return false;
    }

    @Override
    public boolean advance() throws IOException {
      if (isDone) {
        LOG.debug("Ending workitem at key " + immutableBytesToString(currentRangeStartKey) + " .");
        return false;
      }

      ImmutableBytesWritable startKey = this.currentRangeStartKey;
      ImmutableBytesWritable hash = this.currentHash;

      // if there is nothing to read, we are done. readNextKey advances the currentRangeStartKey.
      isDone = !readNextKey();
      currentRangeHash = RangeHash.of(startKey, currentRangeStartKey, hash);

      return true;
    }

    // Returns true if a key can be read for this workitem.
    private boolean readNextKey() throws IOException {
      if (reader.next()) {
        currentRangeStartKey = reader.getCurrentKey();
        if ( // StopRow is not set, everything is in bounds.
        (stopRowExclusive.equals(HConstants.EMPTY_END_ROW)
            || currentRangeStartKey.compareTo(stopRowExclusive) < 0)) { // currentKey < stopKey
          // There is a key to read and the key is within the bounds of this workitem. Return true.
          currentHash = reader.getCurrentHash();
          return true;
        } else {
          // There is a key to read but its outside of the bounds of this workitem.
          currentHash = null;
          return false;
        }
      }

      // Nothing left to read for this workitem. Next range would have started from
      // stopRowExclusive.
      currentRangeStartKey = stopRowExclusive;
      currentHash = null;
      return false;
    }

    @Override
    public RangeHash getCurrent() {
      return currentRangeHash;
    }

    @Override
    public void close() throws IOException {
      LOG.info(
          "Finishing a reader for key range ["
              + immutableBytesToString(startRowInclusive)
              + " ,"
              + immutableBytesToString(stopRowExclusive)
              + "). Ending at "
              + immutableBytesToString(currentRangeStartKey));
      reader.close();
    }

    @Override
    public BoundedSource getCurrentSource() {
      return source;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy