com.google.cloud.bigtable.beam.validation.SyncTableJob

/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.bigtable.beam.validation;

import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly;
import com.google.bigtable.repackaged.com.google.gson.Gson;
import com.google.cloud.bigtable.beam.sequencefiles.Utils;
import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
import com.google.common.annotations.VisibleForTesting;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.metrics.MetricQueryResults;
import org.apache.beam.sdk.metrics.MetricResult;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * A job that takes HBase HashTable output and compares the hashes against a Cloud Bigtable table.
 *
 * <p>Execute the following command to run the job directly:
 *
 * <pre>
 *   mvn compile exec:java \
 *      -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
 *      -Dexec.args="--runner=DataflowRunner \
 *            --project=$PROJECT \
 *            --bigtableInstanceId=$INSTANCE \
 *            --bigtableTableId=$TABLE \
 *            --sourceHashDir=$SOURCE_HASH_DIR \
 *            --outputPrefix=$OUTPUT_PREFIX \
 *            --stagingLocation=$STAGING_LOC \
 *            --tempLocation=$TMP_LOC \
 *            --region=$REGION \
 *            --workerZone=$WORKER_ZONE"
 * </pre>
 *
 * <p>Execute the following command to create the Dataflow template:
 *
 * <pre>
 * mvn compile exec:java \
 *   -DmainClass=com.google.cloud.bigtable.beam.validation.SyncTableJob \
 *   -Dexec.args="--runner=DataflowRunner \
 *                --project=$PROJECT \
 *                --stagingLocation=gs://$STAGING_PATH \
 *                --templateLocation=gs://$TEMPLATE_PATH \
 *                --wait=false"
 * </pre>
 *
 * <p>There are a few ways to run the pipeline using the template. See the Dataflow documentation
 * for details: https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally,
 * you can upload a metadata file that contains information about the runtime parameters, which
 * can be used for parameter validation purposes and more. A sample metadata file can be found at
 * "src/main/resources/SyncTableJob_metadata".
 *
 * <p>An example using the gcloud command line:
 *
 * <pre>
 * gcloud beta dataflow jobs run $JOB_NAME \
 *   --gcs-location gs://$TEMPLATE_PATH \
 *   --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,sourceHashDir=gs://$SOURCE_HASH_DIR,outputPrefix=$OUTPUT_PREFIX
 * </pre>
 */
@InternalExtensionOnly
public class SyncTableJob {

  private static final Log LOG = LogFactory.getLog(SyncTableJob.class);

  public interface SyncTableOptions extends GcpOptions {

    @Description("This Bigtable App Profile id.")
    ValueProvider<String> getBigtableAppProfileId();

    @SuppressWarnings("unused")
    void setBigtableAppProfileId(ValueProvider<String> appProfileId);

    @Description("The project that contains the table to export. Defaults to --project.")
    @Default.InstanceFactory(Utils.DefaultBigtableProjectFactory.class)
    ValueProvider<String> getBigtableProject();

    @SuppressWarnings("unused")
    void setBigtableProject(ValueProvider<String> projectId);

    @Description("The Bigtable instance id that contains the table to export.")
    ValueProvider<String> getBigtableInstanceId();

    @SuppressWarnings("unused")
    void setBigtableInstanceId(ValueProvider<String> instanceId);

    @Description("The Bigtable table id to export.")
    ValueProvider<String> getBigtableTableId();

    @SuppressWarnings("unused")
    void setBigtableTableId(ValueProvider<String> tableId);

    @Description("HBase HashTable job output dir.")
    ValueProvider<String> getHashTableOutputDir();

    // TODO: rename it to sourceHashDir as in the HBase SyncTable job.
    @SuppressWarnings("unused")
    void setHashTableOutputDir(ValueProvider<String> hashTableOutputDir);

    @Description("File pattern for files containing mismatched row ranges.")
    ValueProvider<String> getOutputPrefix();

    @SuppressWarnings("unused")
    void setOutputPrefix(ValueProvider<String> outputPrefix);

    // When creating a template, this flag must be set to false.
    @Description("Wait for pipeline to finish.")
    @Default.Boolean(true)
    boolean getWait();

    @SuppressWarnings("unused")
    void setWait(boolean wait);
  }

  public static void main(String[] args) {
    PipelineOptionsFactory.register(SyncTableOptions.class);
    SyncTableOptions opts =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(SyncTableOptions.class);

    LOG.info("===> Building Pipeline");
    Pipeline pipeline = buildPipeline(opts);

    LOG.info("===> Running Pipeline");
    PipelineResult result = pipeline.run();

    if (opts.getWait()) {
      Utils.waitForPipelineToFinish(result);
    }

    // Log all the counters for the number of matches and the number of mismatches.
    MetricQueryResults metrics = result.metrics().allMetrics();
    for (MetricResult<Long> counter : metrics.getCounters()) {
      LOG.warn(counter.getName() + ":" + counter.getAttempted());
    }
  }

  @VisibleForTesting
  public static Pipeline buildPipeline(SyncTableOptions opts) {
    Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts));
    pipeline
        .apply(
            "Read HBase HashTable output",
            Read.from(
                new BufferedHadoopHashTableSource(
                    new HadoopHashTableSource(
                        opts.getBigtableProject(), opts.getHashTableOutputDir()))))
        .apply(
            "group by and create granular workitems",
            GroupByKey.<String, List<RangeHash>>create())
        .apply("validate hash", ParDo.of(new ComputeAndValidateHashFromBigtableDoFn(opts)))
        .apply("Serialize the ranges", MapElements.via(new RangeHashToString()))
        .apply("Write to file", TextIO.write().to(opts.getOutputPrefix()).withSuffix(".txt"));
    return pipeline;
  }

  static class RangeHashToString extends SimpleFunction<RangeHash, String> {

    // TODO: maybe explore a SequenceFile sink for RangeHash. Hadoop jobs consuming this output
    // may be easier to write against a SequenceFile.
    private static final Gson GSON = new Gson();

    @Override
    public String apply(RangeHash input) {
      return GSON.toJson(input);
    }
  }
}
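RangeHashToString above emits one JSON document per mismatched range, so the job's output files are plain newline-delimited JSON. The sketch below is not part of the original source: it shows one way such a file could be read back, assuming RangeHash round-trips through Gson's default reflection-based deserialization. The class name ReadMismatches and the local-file handling are hypothetical.

import com.google.bigtable.repackaged.com.google.gson.Gson;
import com.google.cloud.bigtable.beam.validation.HadoopHashTableSource.RangeHash;
import java.io.BufferedReader;
import java.nio.file.Files;
import java.nio.file.Paths;

public class ReadMismatches {
  public static void main(String[] args) throws Exception {
    Gson gson = new Gson();
    // Each output line is one RangeHash serialized by RangeHashToString.
    try (BufferedReader reader = Files.newBufferedReader(Paths.get(args[0]))) {
      String line;
      while ((line = reader.readLine()) != null) {
        // Deserialize with the same (repackaged) Gson used by the job; hand the
        // mismatched range off to whatever downstream repair tooling you use.
        RangeHash mismatch = gson.fromJson(line, RangeHash.class);
        System.out.println(mismatch); // RangeHash.toString() may be minimal.
      }
    }
  }
}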

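Because buildPipeline(SyncTableOptions) is exposed @VisibleForTesting, the pipeline can also be assembled and run outside of main(). A minimal sketch under stated assumptions: the caller lives in the same package as SyncTableJob, a DirectRunner artifact is on the classpath, and every flag value below is a placeholder.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class RunSyncTableLocally {
  public static void main(String[] args) {
    PipelineOptionsFactory.register(SyncTableJob.SyncTableOptions.class);
    SyncTableJob.SyncTableOptions opts =
        PipelineOptionsFactory.fromArgs(
                "--runner=DirectRunner",
                "--project=my-project",
                "--bigtableInstanceId=my-instance",
                "--bigtableTableId=my-table",
                "--hashTableOutputDir=gs://my-bucket/hashtable-output",
                "--outputPrefix=gs://my-bucket/sync-table/mismatches")
            .withValidation()
            .as(SyncTableJob.SyncTableOptions.class);
    // buildPipeline wires the HashTable source, the validation DoFn, and the text sink.
    Pipeline pipeline = SyncTableJob.buildPipeline(opts);
    pipeline.run().waitUntilFinish();
  }
}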

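The class Javadoc references a sample metadata file at "src/main/resources/SyncTableJob_metadata", which is not reproduced on this page. The sketch below only illustrates the documented shape of a classic Dataflow template metadata file (name, description, and a parameters array with name/label/helpText and optional regexes); the parameter names follow the gcloud example above, while the labels, help texts, and regex are illustrative, not a copy of the shipped file.

{
  "name": "SyncTableJob",
  "description": "Compares HBase HashTable hashes against a Cloud Bigtable table.",
  "parameters": [
    {
      "name": "bigtableProject",
      "label": "Cloud Bigtable project id",
      "helpText": "The project that contains the table to validate."
    },
    {
      "name": "bigtableInstanceId",
      "label": "Cloud Bigtable instance id",
      "helpText": "The instance that contains the table to validate."
    },
    {
      "name": "bigtableTableId",
      "label": "Cloud Bigtable table id",
      "helpText": "The table whose hashes are compared."
    },
    {
      "name": "sourceHashDir",
      "label": "HBase HashTable output directory",
      "helpText": "GCS directory containing the HBase HashTable job output.",
      "regexes": ["^gs://.*"]
    },
    {
      "name": "outputPrefix",
      "label": "Output prefix",
      "helpText": "File pattern for files containing mismatched row ranges."
    }
  ]
}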

