All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.cloud.bigtable.mapreduce.validation.BigtableSyncTableJob Maven / Gradle / Ivy

Go to download

This project contains tweaks to the HBase 1.* MapReduce jobs so that they work with Bigtable. Specifically, HBase's Import M/R job has a ZooKeeper reference that needed to be removed in order to work with Bigtable.

There is a newer version: 2.14.8
Show newest version
/*
 * Copyright 2022 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.bigtable.mapreduce.validation;

import com.google.cloud.bigtable.hbase.BigtableConfiguration;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.BigtableSyncTableAccessor;
import org.apache.hadoop.hbase.mapreduce.SyncTable;
import org.apache.hadoop.hbase.mapreduce.SyncTable.SyncMapper.Counter;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.zookeeper.ZKConfig;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.ToolRunner;

/**
 * Bigtable SyncTable Job
 *
 * 

Base source authored by {@link org.apache.hadoop.hbase.mapreduce.SyncTable}. * *

SyncTable is part 2 of 2 stages used for byte level validation between a source and target * table only. BigtableSyncTableJob extends functionality to provide connectivity to Bigtable as a * source or target and relies on {@link org.apache.hadoop.hbase.mapreduce.SyncTable} for core * functionality. BigtableSyncTableJob overrides dryrun configuration as true by default to perform * a read-only validation report only. * *

SyncTable is a Mapreduce job that generates hashes on the target table and compares these * hashes with the output from {@Link org.apache.hadoop.hbase.mapreduce.HashTable}. For diverging * hashes, cell-level comparison is performed. */ public class BigtableSyncTableJob extends SyncTable { private static final Log LOG = LogFactory.getLog(BigtableSyncTableJob.class); public static final String SOURCE_BT_PROJECTID_CONF_KEY = "sync.table.source.bt.projectid"; public static final String SOURCE_BT_INSTANCE_CONF_KEY = "sync.table.source.bt.instance"; public static final String SOURCE_BT_APP_PROFILE_CONF_KEY = "sync.table.source.bt.app.profile"; public static final String TARGET_BT_PROJECTID_CONF_KEY = "sync.table.target.bt.projectid"; public static final String TARGET_BT_INSTANCE_CONF_KEY = "sync.table.target.bt.instance"; public static final String TARGET_BT_APP_PROFILE_CONF_KEY = "sync.table.target.bt.app.profile"; private String sourceBigtableProjectId; private String sourceBigtableInstance; private String sourceBigtableAppProfile; private String targetBigtableProjectId; private String targetBigtableInstance; private String targetBigtableAppProfile; private static final int NUM_ARGS = 3; protected Counters counters; public BigtableSyncTableJob(Configuration conf) { super(conf); } @Override public int run(String[] args) throws Exception { String[] otherArgs = new GenericOptionsParser(getConf(), args).getRemainingArgs(); if (!doCommandLine(this, otherArgs)) { return 1; } Job job = setupJobWithBigtable(otherArgs); if (!job.waitForCompletion(true)) { LOG.info("Map-reduce job failed!"); return 1; } counters = job.getCounters(); long batches = counters.findCounter(Counter.BATCHES).getValue(); long hashesMatched = counters.findCounter(Counter.HASHES_MATCHED).getValue(); long hashesNotMatched = counters.findCounter(Counter.HASHES_NOT_MATCHED).getValue(); LOG.info("############# Num of validation batches = " + batches); LOG.info("############# Num of validation hashes matched = 
" + hashesMatched); LOG.info("############# Num of validation hashes not matched = " + hashesNotMatched); return 0; } /** * SyncTable command line parsing utility * * @param args * @return */ protected boolean doCommandLine(SyncTable syncTable, final String[] args) { if (!parseCommandLine(syncTable, NUM_ARGS, args)) { return false; } // return false if options and args are not properly set if (!verifyRequiredArgsSet( syncTable, args, sourceBigtableProjectId, sourceBigtableInstance, targetBigtableProjectId, targetBigtableInstance)) { return false; } return true; } /** * set default sync table configs explicitly * * @param syncTable */ private void setDefaultConfigs(SyncTable syncTable) { // dry run is enabled by default BigtableSyncTableAccessor.setDryRun(syncTable, true); } /** * parse and set command line options and args * * @param syncTable * @param NUM_ARGS * @param args * @return */ private boolean parseCommandLine(SyncTable syncTable, int NUM_ARGS, final String[] args) { // set any defaults & override with any options/args that are set on cli setDefaultConfigs(syncTable); if (args.length < NUM_ARGS) { printUsage(null, args); return false; } try { BigtableSyncTableAccessor.setSourceHashDir(syncTable, new Path(args[args.length - 3])); BigtableSyncTableAccessor.setSourceTableName(syncTable, args[args.length - 2]); BigtableSyncTableAccessor.setTargetTableName(syncTable, args[args.length - 1]); int cntArgs = 0; for (int i = 0; i < args.length; i++) { String arg = args[i]; if (arg.equals("-h") || arg.startsWith("--h")) { printUsage(null, args); return false; } if (!arg.startsWith("--")) { cntArgs++; } } if (cntArgs != 3) { printUsage("expected " + NUM_ARGS + ", but received " + cntArgs++, args); return false; } for (int i = 0; i < args.length - NUM_ARGS; i++) { String cmd = args[i]; final String sourceZkClusterKey = "--sourcezkcluster="; if (cmd.startsWith(sourceZkClusterKey)) { BigtableSyncTableAccessor.setSourceZkCluster( syncTable, 
cmd.substring(sourceZkClusterKey.length())); continue; } String sourceBigtableProjectIdKey = "--sourcebigtableproject="; if (cmd.startsWith(sourceBigtableProjectIdKey)) { sourceBigtableProjectId = cmd.substring(sourceBigtableProjectIdKey.length()); continue; } String sourceBigtableInstanceKey = "--sourcebigtableinstance="; if (cmd.startsWith(sourceBigtableInstanceKey)) { sourceBigtableInstance = cmd.substring(sourceBigtableInstanceKey.length()); continue; } String sourceBigtableAppProfileKey = "--sourcebigtableappprofile="; if (cmd.startsWith(sourceBigtableAppProfileKey)) { sourceBigtableAppProfile = cmd.substring(sourceBigtableAppProfileKey.length()); continue; } final String targetZkClusterKey = "--targetzkcluster="; if (cmd.startsWith(targetZkClusterKey)) { BigtableSyncTableAccessor.setTargetZkCluster( syncTable, cmd.substring(targetZkClusterKey.length())); continue; } final String targetBigtableProjectIdKey = "--targetbigtableproject="; if (cmd.startsWith(targetBigtableProjectIdKey)) { targetBigtableProjectId = cmd.substring(targetBigtableProjectIdKey.length()); continue; } final String targetBigtableInstanceKey = "--targetbigtableinstance="; if (cmd.startsWith(targetBigtableInstanceKey)) { targetBigtableInstance = cmd.substring(targetBigtableInstanceKey.length()); continue; } final String targetBigtableAppProfileKey = "--targetbigtableappprofile="; if (cmd.startsWith(targetBigtableAppProfileKey)) { targetBigtableAppProfile = cmd.substring(targetBigtableAppProfileKey.length()); continue; } final String dryRunKey = "--dryrun="; if (cmd.startsWith(dryRunKey)) { BigtableSyncTableAccessor.setDryRun( syncTable, Boolean.parseBoolean(cmd.substring(dryRunKey.length()))); continue; } final String doDeletesKey = "--doDeletes="; if (cmd.startsWith(doDeletesKey)) { BigtableSyncTableAccessor.setDoDeletes( syncTable, Boolean.parseBoolean(cmd.substring(doDeletesKey.length()))); continue; } final String doPutsKey = "--doPuts="; if (cmd.startsWith(doPutsKey)) { 
BigtableSyncTableAccessor.setDoPuts( syncTable, Boolean.parseBoolean(cmd.substring(doPutsKey.length()))); continue; } final String ignoreTimestampsKey = "--ignoreTimestamps="; if (cmd.startsWith(ignoreTimestampsKey)) { BigtableSyncTableAccessor.setIgnoreTimestamps( syncTable, Boolean.parseBoolean(cmd.substring(ignoreTimestampsKey.length()))); continue; } printUsage("Invalid argument '" + cmd + "'", args); return false; } } catch (Exception e) { e.printStackTrace(); printUsage("Can't start because " + e.getMessage(), args); return false; } return true; } /** * verify required options and args are correctly set * * @param args * @param sourceBigtableProjectId * @param sourceBigtableInstance * @param targetBigtableProjectId * @param targetBigtableInstance * @return boolean state if required args are set */ public boolean verifyRequiredArgsSet( SyncTable syncTable, String[] args, String sourceBigtableProjectId, String sourceBigtableInstance, String targetBigtableProjectId, String targetBigtableInstance) { // neither source zk or source bigtable config set if (BigtableSyncTableAccessor.getSourceZkCluster(syncTable) == null && (sourceBigtableProjectId == null && sourceBigtableInstance == null)) { printUsage( "--sourcezkcluster or --sourcebigtableproject and --sourcebigtableinstance required.", args); return false; } // both source zk and source bigtable configs are set if (BigtableSyncTableAccessor.getSourceZkCluster(syncTable) != null && (sourceBigtableProjectId != null || sourceBigtableInstance != null)) { printUsage( "--sourcezkcluster and --sourcebigtableproject and --sourcebigtableinstance are set. 
Set one or the other.", args); return false; } // source bigtable project or source instance not set if (BigtableSyncTableAccessor.getSourceZkCluster(syncTable) == null && ((sourceBigtableProjectId == null || sourceBigtableInstance == null))) { printUsage("--sourcebigtableproject and --sourcebigtableinstance required.", args); return false; } // neither target zk or target bigtable config set if (BigtableSyncTableAccessor.getTargetZkCluster(syncTable) == null && (targetBigtableProjectId == null && targetBigtableInstance == null)) { printUsage( "--targetzkcluster or --targetbigtableproject and --targetbigtableinstance required.", args); return false; } // both target zk and target bigtable configs are set if (BigtableSyncTableAccessor.getTargetZkCluster(syncTable) != null && (targetBigtableProjectId != null || targetBigtableInstance != null)) { printUsage( "--targetzkcluster or --targetbigtableproject and --targetbigtableinstance are set. Set one or the other.", args); return false; } // target bigtable project or target instance not set if (BigtableSyncTableAccessor.getTargetZkCluster(syncTable) == null && ((targetBigtableProjectId == null || targetBigtableInstance == null))) { printUsage("--targetbigtableproject and --targetbigtableinstance required.", args); return false; } if (BigtableSyncTableAccessor.getSourceHashDir(syncTable) == null) { printUsage(" argument is required.", args); return false; } if (BigtableSyncTableAccessor.getSourceTableName(syncTable) == null) { printUsage(" argument is required.", args); return false; } if (BigtableSyncTableAccessor.getTargetTableName(syncTable) == null) { printUsage(" argument is required.", args); return false; } return true; } public static void printUsage(final String errorMsg, String[] args) { if (errorMsg != null && errorMsg.length() > 0) { System.err.println("ERROR: " + errorMsg); } if (args != null) { System.err.print("Input provided: "); for (int i = 0; i < args.length; i++) { if (i != 0) { System.err.print(" 
"); } System.err.print(args[i]); } System.err.println(); } System.err.println("Usage: SyncTable [options] "); System.err.println(); System.err.println("Options:"); System.err.println(" Source Configuration:"); System.err.println(" sourcezkcluster ZK cluster key of the source table"); System.err.println(" (defaults to cluster in classpath's config)"); System.err.println(" Or Source Configuration:"); System.err.println(" sourcebigtableproject Bigtable project id of the source table"); System.err.println(" (defaults to cluster in classpath's config)"); System.err.println(" sourcebigtableinstance Bigtable instance id of the source table"); System.err.println(" (defaults to cluster in classpath's config)"); System.err.println(" sourcebigtableappprofile (optional) Bigtable app profile"); System.err.println(" Target Configuration:"); System.err.println(" targetzkcluster ZK cluster key of the target table"); System.err.println(" (defaults to cluster in classpath's config)"); System.err.println(" Or Target Configuration:"); System.err.println(" targetbigtableproject Bigtable project id of the target table"); System.err.println(" (defaults to cluster in classpath's config)"); System.err.println(" targetbigtableinstance Bigtable instance id of the target table"); System.err.println(" (defaults to cluster in classpath's config)"); System.err.println(" targetbigtableappprofile (optional) Bigtable app profile"); System.err.println(); System.err.println(" dryrun if true, output counters but no writes"); System.err.println(" (defaults to true)"); System.err.println(" doDeletes if false, does not perform deletes"); System.err.println(" (defaults to true)"); System.err.println(" doPuts if false, does not perform puts"); System.err.println(" (defaults to true)"); System.err.println(" ignoreTimestamps if true, ignores cells timestamps while comparing "); System.err.println(" cell values. 
Any missing cell on target then gets"); System.err.println(" added with current time as timestamp "); System.err.println(" (defaults to false)"); System.err.println(); System.err.println("Args:"); System.err.println(" sourcehashdir path to HashTable output dir for source table"); System.err.println(" (see org.apache.hadoop.hbase.mapreduce.HashTable)"); System.err.println(" sourcetable Name of the source table to sync from"); System.err.println(" targettable Name of the target table to sync to"); System.err.println(); System.err.println("Examples:"); System.err.println(" For SyncTable validation of tableA from a remote source cluster"); System.err.println(" to a local target cluster:"); System.err.println( " $ bin/hbase " + "com.google.cloud.bigtable.mapreduce.validation.SyncTable --targetbigtableproject=project123" + " --targetbigtableinstance=instance123 --sourcezkcluster=zk1.example.com,zk2.example.com,zk3.example.com:2181:/hbase" + " gs://bucket/hashes/tableA tableA tableA"); } /** * Set up job configuration for Bigtable as Target * * @param otherArgs * @return * @throws IOException */ private Job setupJobWithBigtable(String[] otherArgs) throws IOException { // set up default job configurations Job job = super.createSubmittableJob(otherArgs); Configuration jobConf = job.getConfiguration(); // set conf for job startup as InputFormat is initialized on target String targetZkCluster = BigtableSyncTableAccessor.getTargetZkCluster(this); if (targetZkCluster != null) { ZKConfig.ZKClusterKey zkClusterKey = ZKConfig.transformClusterKey(targetZkCluster); jobConf.set(HConstants.ZOOKEEPER_QUORUM, zkClusterKey.getQuorumString()); jobConf.setInt(HConstants.ZOOKEEPER_CLIENT_PORT, zkClusterKey.getClientPort()); jobConf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, zkClusterKey.getZnodeParent()); } // set bigtable target configuration if (targetBigtableProjectId != null && targetBigtableInstance != null) { if (targetZkCluster != null) { LOG.warn( "targetZkCluster config(" + 
targetZkCluster + ") overridden with targetBigtableProjectId(" + targetBigtableProjectId + "), targetBigtableInstance(" + targetBigtableInstance + ")"); } jobConf.set(TARGET_BT_PROJECTID_CONF_KEY, targetBigtableProjectId); jobConf.set(TARGET_BT_INSTANCE_CONF_KEY, targetBigtableInstance); if (targetBigtableAppProfile != null) { jobConf.set(TARGET_BT_APP_PROFILE_CONF_KEY, targetBigtableAppProfile); } BigtableConfiguration.configure( jobConf, jobConf.get(TARGET_BT_PROJECTID_CONF_KEY), jobConf.get(TARGET_BT_INSTANCE_CONF_KEY), jobConf.get(TARGET_BT_APP_PROFILE_CONF_KEY, "")); } // enable dry run as default String DRY_RUN_CONF_KEY = BigtableSyncTableAccessor.getConfDryRunKey(); jobConf.setBoolean(DRY_RUN_CONF_KEY, jobConf.getBoolean(DRY_RUN_CONF_KEY, true)); // Set up bigtable configurations for job. Note that the job conf is shared for source and // target databases and additional configuration in mapper initializes proper // connections. String sourceZkCluster = BigtableSyncTableAccessor.getSourceZkCluster(this); if (sourceBigtableProjectId != null && sourceBigtableInstance != null) { if (sourceZkCluster != null) { LOG.warn( "sourceZkCluster config(" + sourceZkCluster + ") overriden with sourceBigtableProjectId(" + sourceBigtableProjectId + "), sourceBigtableInstance(" + sourceBigtableInstance + ")"); } jobConf.set(SOURCE_BT_PROJECTID_CONF_KEY, sourceBigtableProjectId); jobConf.set(SOURCE_BT_INSTANCE_CONF_KEY, sourceBigtableInstance); if (sourceBigtableAppProfile != null) { jobConf.set(SOURCE_BT_APP_PROFILE_CONF_KEY, sourceBigtableAppProfile); } } Scan jobScan = TableMapReduceUtil.convertStringToScan(jobConf.get(TableInputFormat.SCAN)); TableMapReduceUtil.initTableMapperJob( BigtableSyncTableAccessor.getTargetTableName(this), jobScan, BigtableSyncMapper.class, null, null, job); return job; } public String getSourceBigtableProjectId() { return sourceBigtableProjectId; } public String getSourceBigtableInstance() { return sourceBigtableInstance; } public String 
getSourceBigtableAppProfile() { return sourceBigtableAppProfile; } public String getTargetBigtableProjectId() { return targetBigtableProjectId; } public String getTargetBigtableInstance() { return targetBigtableInstance; } public String getTargetBigtableAppProfile() { return targetBigtableAppProfile; } /** Main entry point. */ public static void main(String[] args) throws Exception { int exitCode = ToolRunner.run(new BigtableSyncTableJob(HBaseConfiguration.create()), args); if (exitCode == 0) { System.out.println("job appears to have completed successfully."); } else { System.err.println("job is exiting with exit code='" + exitCode + "'"); } System.exit(exitCode); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy