/*
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.bigtable.beam.validation;

import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly;
import com.google.bigtable.repackaged.com.google.gson.Gson;
import com.google.cloud.bigtable.beam.sequencefiles.Utils;
import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
import com.google.common.annotations.VisibleForTesting;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.metrics.MetricQueryResults;
import org.apache.beam.sdk.metrics.MetricResult;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * A job that takes HBase HashTable output and compares the hashes against hashes computed from a
 * Cloud Bigtable table.
 *
 * <p>Execute the following command to run the job directly:
 *
 * <pre>
 * mvn compile exec:java \
 *     -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
 *     -Dexec.args="--runner=DataflowRunner \
 *     --project=$PROJECT \
 *     --bigtableInstanceId=$INSTANCE \
 *     --bigtableTableId=$TABLE \
 *     --sourceHashDir=$SOURCE_HASH_DIR \
 *     --outputPrefix=$OUTPUT_PREFIX \
 *     --stagingLocation=$STAGING_LOC \
 *     --tempLocation=$TMP_LOC \
 *     --region=$REGION \
 *     --workerZone=$WORKER_ZONE"
 * </pre>
 *
 * <p>Execute the following command to create the Dataflow template:
 *
 * <pre>
 * mvn compile exec:java \
 *     -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
 *     -Dexec.args="--runner=DataflowRunner \
 *     --project=$PROJECT \
 *     --stagingLocation=gs://$STAGING_PATH \
 *     --templateLocation=gs://$TEMPLATE_PATH \
 *     --wait=false"
 * </pre>
 *
 * <p>There are a few ways to run the pipeline using the template; see the Dataflow documentation
 * for details: https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally,
 * you can upload a metadata file that describes the runtime parameters, which can be used for
 * parameter validation and more. A sample metadata file can be found at
 * "src/main/resources/SyncTableJob_metadata"; an illustrative snippet is shown below.
 *
 * <p>An example using the gcloud command line:
 *
 * <pre>
 * gcloud beta dataflow jobs run $JOB_NAME \
 *     --gcs-location gs://$TEMPLATE_PATH \
 *     --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,sourceHashDir=gs://$SOURCE_HASH_DIR,outputPrefix=$OUTPUT_PREFIX
 * </pre>
 *
*/
@InternalExtensionOnly
public class SyncTableJob {
  private static final Log LOG = LogFactory.getLog(SyncTableJob.class);

  public interface SyncTableOptions extends GcpOptions {

    @Description("The Bigtable App Profile id.")
    ValueProvider<String> getBigtableAppProfileId();

    @SuppressWarnings("unused")
    void setBigtableAppProfileId(ValueProvider<String> appProfileId);

    @Description("The project that contains the table to export. Defaults to --project.")
    @Default.InstanceFactory(Utils.DefaultBigtableProjectFactory.class)
    ValueProvider<String> getBigtableProject();

    @SuppressWarnings("unused")
    void setBigtableProject(ValueProvider<String> projectId);

    @Description("The Bigtable instance id that contains the table to export.")
    ValueProvider<String> getBigtableInstanceId();

    @SuppressWarnings("unused")
    void setBigtableInstanceId(ValueProvider<String> instanceId);

    @Description("The Bigtable table id to export.")
    ValueProvider<String> getBigtableTableId();

    @SuppressWarnings("unused")
    void setBigtableTableId(ValueProvider<String> tableId);

    @Description("HBase HashTable job output dir.")
    ValueProvider<String> getHashTableOutputDir();

    @SuppressWarnings("unused")
    // TODO: Rename to sourceHashDir, as in the HBase SyncTable job.
    void setHashTableOutputDir(ValueProvider<String> hashTableOutputDir);

    @Description("File pattern for files containing mismatched row ranges.")
    ValueProvider<String> getOutputPrefix();

    @SuppressWarnings("unused")
    void setOutputPrefix(ValueProvider<String> outputPrefix);

    // When creating a template, this flag must be set to false.
    @Description("Wait for pipeline to finish.")
    @Default.Boolean(true)
    boolean getWait();

    @SuppressWarnings("unused")
    void setWait(boolean wait);
  }
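
  // Note on ValueProvider: the options above are ValueProvider<String> so that a Dataflow
  // template can defer their resolution to template launch time. A minimal sketch of runtime
  // access (illustrative only, not part of this job):
  //
  //   SyncTableOptions opts = ...;                        // supplied by the runner
  //   String tableId = opts.getBigtableTableId().get();   // .get() is only safe at execution time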
  public static void main(String[] args) {
    PipelineOptionsFactory.register(SyncTableOptions.class);
    SyncTableOptions opts =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(SyncTableOptions.class);

    LOG.info("===> Building Pipeline");
    Pipeline pipeline = buildPipeline(opts);
    LOG.info("===> Running Pipeline");
    PipelineResult result = pipeline.run();
    if (opts.getWait()) {
      Utils.waitForPipelineToFinish(result);
    }

    // Log all the counters for number of matches and number of mismatches.
    MetricQueryResults metrics = result.metrics().allMetrics();
    for (MetricResult<Long> counter : metrics.getCounters()) {
      LOG.warn(counter.getName() + ":" + counter.getAttempted());
    }
  }
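
  /**
   * Builds the validation pipeline: reads the HBase HashTable data files, groups the hash batches
   * into granular work items, recomputes and compares the hashes against the Bigtable table, and
   * writes the mismatched row ranges as JSON lines under {@code --outputPrefix}.
   */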
  @VisibleForTesting
  public static Pipeline buildPipeline(SyncTableOptions opts) {
    Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts));
    pipeline
        .apply(
            "Read HBase HashTable output",
            Read.from(
                new BufferedHadoopHashTableSource(
                    new HadoopHashTableSource(
                        opts.getBigtableProject(), opts.getHashTableOutputDir()))))
        .apply(
            "group by and create granular workitems",
            GroupByKey.<String, List<RangeHash>>create())
        .apply("validate hash", ParDo.of(new ComputeAndValidateHashFromBigtableDoFn(opts)))
        .apply("Serialize the ranges", MapElements.via(new RangeHashToString()))
        .apply("Write to file", TextIO.write().to(opts.getOutputPrefix()).withSuffix(".txt"));
    return pipeline;
  }
  static class RangeHashToString extends SimpleFunction<RangeHash, String> {

    // TODO: Maybe explore a SequenceFile sink for RangeHash. Hadoop jobs consuming this output
    // may be easier to write against a sequence file.
    private static final Gson GSON = new Gson();

    @Override
    public String apply(RangeHash input) {
      return GSON.toJson(input);
    }
  }
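
  // Illustrative sketch, not part of the original job: since RangeHashToString writes one JSON
  // object per line, a consumer could read a line of the output file back with Gson. Assumes
  // RangeHash stays Gson-(de)serializable.
  @VisibleForTesting
  static RangeHash parseRangeHash(String jsonLine) {
    return new Gson().fromJson(jsonLine, RangeHash.class);
  }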
}