// com.google.cloud.bigtable.beam.validation.HadoopHashTableSource Maven / Gradle / Ivy
/*
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.bigtable.beam.validation;
import static com.google.cloud.bigtable.beam.validation.SyncTableUtils.immutableBytesToString;
import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
import com.google.bigtable.repackaged.com.google.common.annotations.VisibleForTesting;
import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
import com.google.cloud.bigtable.beam.validation.TableHashWrapper.TableHashReader;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nullable;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
/**
* A beam source to read output of Hadoop HashTable job. The source creates 1 workitem per HashTable
* data file and emits a row-range/hash pair.
*/
@InternalApi
public class HadoopHashTableSource extends BoundedSource implements Serializable {
private static final long serialVersionUID = 2383724L;
private static final Coder CODER = RangeHashCoder.of();
/**
* A simple POJO encapsulating a row range and the corresponding hash generated by HashTable job.
* TODO Evaluate if we can use AutoValue for this class.
*/
@DefaultCoder(RangeHashCoder.class)
public static class RangeHash {
public final ImmutableBytesWritable startInclusive;
public final ImmutableBytesWritable stopExclusive;
public final ImmutableBytesWritable hash;
private RangeHash(
ImmutableBytesWritable startInclusive,
ImmutableBytesWritable stopExclusive,
ImmutableBytesWritable hash) {
this.startInclusive = startInclusive;
this.stopExclusive = stopExclusive;
this.hash = hash;
}
static RangeHash of(
ImmutableBytesWritable startInclusive,
ImmutableBytesWritable stopExclusive,
ImmutableBytesWritable hash) {
Preconditions.checkNotNull(startInclusive);
Preconditions.checkNotNull(stopExclusive);
Preconditions.checkNotNull(hash);
return new RangeHash(startInclusive, stopExclusive, hash);
}
@Override
public String toString() {
return String.format(
"RangeHash{ range = [ %s, %s), hash: %s }",
immutableBytesToString(startInclusive),
immutableBytesToString(stopExclusive),
immutableBytesToString(hash));
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (!(o instanceof RangeHash)) {
return false;
}
RangeHash rangeHash = (RangeHash) o;
return Objects.equal(startInclusive, rangeHash.startInclusive)
&& Objects.equal(stopExclusive, rangeHash.stopExclusive)
&& Objects.equal(hash, rangeHash.hash);
}
@Override
public int hashCode() {
return Objects.hashCode(startInclusive, stopExclusive, hash);
}
}
public static final Log LOG = LogFactory.getLog(HadoopHashTableSource.class);
private final ValueProvider projectId;
// Path to the output of HashTable job. Usually in GCS.
private final ValueProvider sourceHashDir;
// Row range owned by this source.
// The Start and Stop row are serialized in a custom way.
@VisibleForTesting @Nullable transient ImmutableBytesWritable startRowInclusive;
@VisibleForTesting @Nullable transient ImmutableBytesWritable stopRowExclusive;
private final TableHashWrapperFactory tableHashWrapperFactory;
/**
* Creates a HadoopHashTableSource that reads HashTable data from hashTableOutputDir in GCS bucket
* in project $(projectId).
*/
public HadoopHashTableSource(
ValueProvider projectId, ValueProvider sourceHashDir) {
this(projectId, sourceHashDir, /*startRowInclusive*/ null, /*stopRowExclusive*/ null);
}
/**
* Constructor to initialize a HadoopHashTableSource for a given row-range. Used for creating
* split sources.
*/
@VisibleForTesting
HadoopHashTableSource(
ValueProvider projectId,
ValueProvider sourceHashDir,
@Nullable ImmutableBytesWritable startRowInclusive,
@Nullable ImmutableBytesWritable stopRowExclusive) {
this(
projectId,
sourceHashDir,
startRowInclusive,
stopRowExclusive,
new TableHashWrapperFactory());
}
@VisibleForTesting
HadoopHashTableSource(
ValueProvider projectId,
ValueProvider hadoopHashTableOutputDir,
@Nullable ImmutableBytesWritable startRowInclusive,
@Nullable ImmutableBytesWritable stopRowExclusive,
TableHashWrapperFactory tableHashWrapperFactory) {
this.projectId = projectId;
this.sourceHashDir = hadoopHashTableOutputDir;
// startRow and stopRow will be null when the template is initialized. startRow and stopRow are
// read from the hashTableOutputDir, which is only available at pipeline runtime.
this.startRowInclusive = startRowInclusive;
this.stopRowExclusive = stopRowExclusive;
this.tableHashWrapperFactory = tableHashWrapperFactory;
}
@Override
public List extends BoundedSource> split(
long desiredBundleSizeBytes, PipelineOptions options) throws IOException {
// This method relies on the partitioning done by HBase-HashTable job. There is a possibility
// of stragglers. SyncTable handles it by using a group by and further splitting workitems.
TableHashWrapper hash =
tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());
ImmutableList partitions = hash.getPartitions();
int numPartitions = partitions.size();
List splitSources = new ArrayList<>(numPartitions + 1);
if (numPartitions == 0) {
// There are 0 partitions and 1 hashfile, return single source with full key range.
splitSources.add(
new HadoopHashTableSource(
projectId,
sourceHashDir,
hash.getStartRow(),
hash.getStopRow(),
tableHashWrapperFactory));
return splitSources;
}
// Use the HashTable start key. The value is HConstants.EMPTY_START_ROW for full table scan.
ImmutableBytesWritable nextStartRow = hash.getStartRow();
ImmutableBytesWritable stopRow = hash.getStopRow();
// The output of HashTable is organized as partition file and a set of datafiles.
// Partition file contains a list of partitions, these partitions split the key-range of a table
// into roughly equal row-ranges and hashes for these row-ranges are stored in a single
// datafile.
//
// There are always numPartitions +1 data files. Datafile(i) covers hashes for [partition{i-1},
// partition{i}).
// So a partition file containing entries [b,f] for a table with row range [a,z] will have 3
// data files containing hashes.
// file0 will contain [a(nextStartRow), b), file1 will contain [b,f), and file3 will contain
// [f,z(stopRow))
for (int i = 0; i < numPartitions; i++) {
// TODO make a utility function that generates [start, end) format from start/end.
LOG.debug(
"Adding: ["
+ immutableBytesToString(nextStartRow.get())
+ ", "
+ immutableBytesToString(partitions.get(i).get())
+ ")");
splitSources.add(
new HadoopHashTableSource(
projectId, sourceHashDir, nextStartRow, partitions.get(i), tableHashWrapperFactory));
nextStartRow = partitions.get(i);
}
// Add the last range for [lastPartition, stopRow).
LOG.debug(
"Adding: ["
+ immutableBytesToString(nextStartRow.get())
+ ", "
+ immutableBytesToString(stopRow.get())
+ ")");
// Add the last range for [lastPartition, stopRow).
splitSources.add(
new HadoopHashTableSource(
projectId, sourceHashDir, nextStartRow, stopRow, tableHashWrapperFactory));
LOG.info("Returning " + splitSources.size() + " sources from " + numPartitions + " partitions");
return splitSources;
}
@Override
public Coder getOutputCoder() {
return CODER;
}
@Override
public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
// HashTable data files don't expose a method to estimate size or lineCount.
return 0;
}
@Override
public BoundedReader createReader(PipelineOptions options) throws IOException {
TableHashWrapper hash =
tableHashWrapperFactory.getTableHash(projectId.get(), sourceHashDir.get());
// The row range for an un-split source is determined from the output of HashTable job.
// HashTableOutputDir is a runtime parameter and hence not available at construction time, so
// populate the start and stop here.
if (startRowInclusive == null || stopRowExclusive == null) {
startRowInclusive = hash.getStartRow();
stopRowExclusive = hash.getStopRow();
}
return new HashBasedReader(
this,
startRowInclusive,
stopRowExclusive,
hash.newReader(
SyncTableUtils.createConfiguration(this.projectId.get(), this.sourceHashDir.get()),
startRowInclusive));
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (!(o instanceof HadoopHashTableSource)) {
return false;
}
HadoopHashTableSource that = (HadoopHashTableSource) o;
return Objects.equal(projectId, that.projectId)
&& Objects.equal(sourceHashDir, that.sourceHashDir)
&& Objects.equal(startRowInclusive, that.startRowInclusive)
&& Objects.equal(stopRowExclusive, that.stopRowExclusive);
}
@Override
public int hashCode() {
return Objects.hashCode(projectId, sourceHashDir, startRowInclusive, stopRowExclusive);
}
@Override
public String toString() {
return "HadoopHashTableSource ["
+ immutableBytesToString(startRowInclusive)
+ ", "
+ immutableBytesToString(stopRowExclusive)
+ ')';
}
private void writeObject(ObjectOutputStream s) throws IOException {
s.defaultWriteObject();
// Start and Stop can be null, write a boolean to indicate if start/stop is expected.
if (startRowInclusive == null) {
s.writeBoolean(false);
} else {
s.writeBoolean(true);
s.writeObject(startRowInclusive.copyBytes());
}
if (stopRowExclusive == null) {
s.writeBoolean(false);
} else {
s.writeBoolean(true);
s.writeObject(stopRowExclusive.copyBytes());
}
}
private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
s.defaultReadObject();
// start/stop can be null, they are preceded by a boolean indicating their presence.
if (s.readBoolean() == true) {
startRowInclusive = new ImmutableBytesWritable((byte[]) s.readObject());
}
if (s.readBoolean() == true) {
stopRowExclusive = new ImmutableBytesWritable((byte[]) s.readObject());
}
}
@VisibleForTesting
static class HashBasedReader extends BoundedReader {
private final HadoopHashTableSource source;
private final TableHashReader reader;
@VisibleForTesting final ImmutableBytesWritable startRowInclusive;
@VisibleForTesting final ImmutableBytesWritable stopRowExclusive;
// Flag indicating that this workitem is finished.
private boolean isDone = false;
private ImmutableBytesWritable currentRangeStartKey;
// Hash for the current range.
private ImmutableBytesWritable currentHash;
private RangeHash currentRangeHash;
public HashBasedReader(
HadoopHashTableSource source,
ImmutableBytesWritable startRowInclusive,
ImmutableBytesWritable stopRowExclusive,
TableHashReader reader) {
this.source = source;
this.startRowInclusive = startRowInclusive;
this.stopRowExclusive = stopRowExclusive;
this.reader = reader;
}
@Override
public boolean start() throws IOException {
LOG.debug(
"Starting a new reader at key range ["
+ immutableBytesToString(startRowInclusive)
+ " ,"
+ immutableBytesToString(stopRowExclusive)
+ ").");
if (readNextKey()) {
// Dataflow calls start, followed by getCurrent. HashBased reader needs to read on TableHash
// twice to return a RangeHash since it specifies both range-start and range-end.
advance();
return true;
}
isDone = true;
return false;
}
@Override
public boolean advance() throws IOException {
if (isDone) {
LOG.debug("Ending workitem at key " + immutableBytesToString(currentRangeStartKey) + " .");
return false;
}
ImmutableBytesWritable startKey = this.currentRangeStartKey;
ImmutableBytesWritable hash = this.currentHash;
// if there is nothing to read, we are done. readNextKey advances the currentRangeStartKey.
isDone = !readNextKey();
currentRangeHash = RangeHash.of(startKey, currentRangeStartKey, hash);
return true;
}
// Returns true if a key can be read for this workitem.
private boolean readNextKey() throws IOException {
if (reader.next()) {
currentRangeStartKey = reader.getCurrentKey();
if ( // StopRow is not set, everything is in bounds.
(stopRowExclusive.equals(HConstants.EMPTY_END_ROW)
|| currentRangeStartKey.compareTo(stopRowExclusive) < 0)) { // currentKey < stopKey
// There is a key to read and the key is within the bounds of this workitem. Return true.
currentHash = reader.getCurrentHash();
return true;
} else {
// There is a key to read but its outside of the bounds of this workitem.
currentHash = null;
return false;
}
}
// Nothing left to read for this workitem. Next range would have started from
// stopRowExclusive.
currentRangeStartKey = stopRowExclusive;
currentHash = null;
return false;
}
@Override
public RangeHash getCurrent() {
return currentRangeHash;
}
@Override
public void close() throws IOException {
LOG.info(
"Finishing a reader for key range ["
+ immutableBytesToString(startRowInclusive)
+ " ,"
+ immutableBytesToString(stopRowExclusive)
+ "). Ending at "
+ immutableBytesToString(currentRangeStartKey));
reader.close();
}
@Override
public BoundedSource getCurrentSource() {
return source;
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy