com.google.cloud.bigtable.mapreduce.hbasesnapshots.ImportHBaseSnapshotJob Maven / Gradle / Ivy
This project contains tweaks to the HBase 1.* MapReduce jobs so that they work
with Bigtable. Specifically, HBase's Import M/R job has a ZooKeeper reference
that had to be removed in order to work with Bigtable.
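For reference, a sketch of a typical invocation (the jar name, project, instance, and bucket paths are placeholders; the required properties and positional arguments mirror the job's usage text below):

hadoop jar bigtable-hbase-1.x-mapreduce-<version>-shaded.jar \
com.google.cloud.bigtable.mapreduce.hbasesnapshots.ImportHBaseSnapshotJob \
-Dgoogle.bigtable.project.id=<project-id> \
-Dgoogle.bigtable.instance.id=<instance-id> \
table1-snapshot \
gs://hbase-migration-table1-bucket/export/table1-snapshot \
table1 \
gs://hbase-migration-table1-bucket/export/table1-restore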
/*
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.bigtable.mapreduce.hbasesnapshots;
import static com.google.cloud.bigtable.mapreduce.hbasesnapshots.ImportJobCommon.IMPORT_SNAPSHOT_JOBNAME_KEY;
import static com.google.cloud.bigtable.mapreduce.hbasesnapshots.ImportJobCommon.SNAPSHOTNAME_KEY;
import static com.google.cloud.bigtable.mapreduce.hbasesnapshots.ImportJobCommon.SNAPSHOT_RESTOREDIR_KEY;
import static com.google.cloud.bigtable.mapreduce.hbasesnapshots.ImportJobCommon.SNAPSHOT_SPLITS_PER_REGION_DEFAULT;
import static com.google.cloud.bigtable.mapreduce.hbasesnapshots.ImportJobCommon.SNAPSHOT_SPLITS_PER_REGION_KEY;
import static com.google.cloud.bigtable.mapreduce.hbasesnapshots.ImportJobCommon.TABLENAME_KEY;
import com.google.cloud.bigtable.hbase.BigtableConfiguration;
import com.google.cloud.bigtable.hbase.BigtableOptionsFactory;
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.MutationSerialization;
import org.apache.hadoop.hbase.mapreduce.ResultSerialization;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.util.RegionSplitter.UniformSplit;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** A Mapreduce job to import an HBase snapshot to Bigtable. */
public class ImportHBaseSnapshotJob extends Configured implements Tool {
private static final Log LOG = LogFactory.getLog(ImportHBaseSnapshotJob.class);
/**
* Prints usage and exits.
*
* @param errorMsg Error message. Can be null.
*/
private static void usage(final String errorMsg) {
if (errorMsg != null && errorMsg.length() > 0) {
System.out.println("ERROR: " + errorMsg);
}
System.out.println(
"Usage: hadoop jar \n"
+ "Required properties: \n"
+ " -D"
+ BigtableOptionsFactory.PROJECT_ID_KEY
+ "=\n"
+ " -D"
+ BigtableOptionsFactory.INSTANCE_ID_KEY
+ "=\n"
+ "Required arguments:\n"
+ " snapshot-name - name of hbase snapshot to read\n"
+ " snapshot-dir - directory of snapshot, e.g., gs://hbase-migration-table1-bucket/export/table1-snapshot\n"
+ " tablename - bigtable table to import into\n"
+ " temp-dir - a temporary base directory to restore hlinks of the snapshot into for read, e.g., gs://hbase-migration-table1-bucket/export/table1-restore\n"
+ "Optional properties: \n"
+ " -D"
+ BigtableOptionsFactory.BIGTABLE_BUFFERED_MUTATOR_ENABLE_THROTTLING
+ "=(true|false)\n"
+ " -D"
+ BigtableOptionsFactory.BIGTABLE_BUFFERED_MUTATOR_THROTTLING_THRESHOLD_MILLIS
+ "=\n");
System.exit(1);
}
/**
* Job runner
*
* @param args command line input
* @return job success flag
* @throws Exception on unrecoverable application failure
*/
@Override
public int run(String[] args) throws Exception {
Configuration conf = HBaseConfiguration.create(getConf());
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (setConfFromArgs(conf, otherArgs) != 0) return -1;
LOG.info(
"Using Arguments: "
+ "\n"
+ "snapshotName = "
+ conf.get(SNAPSHOTNAME_KEY)
+ "\n"
+ "snapshotDir = "
+ conf.get(HConstants.HBASE_DIR)
+ "\n"
+ "bigtableName = "
+ conf.get(TableOutputFormat.OUTPUT_TABLE)
+ "\n"
+ "tempDir = "
+ conf.get(SNAPSHOT_RESTOREDIR_KEY)
+ "\n"
+ CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY
+ " = "
+ conf.get(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY)
+ "\n");
Job job = createSubmittableJob(conf);
int successStatus = (job.waitForCompletion(true) ? 0 : 1);
Counters counters = job.getCounters();
long numRows = counters.findCounter(ScanCounter.NUM_ROWS).getValue();
long numCells = counters.findCounter(ScanCounter.NUM_CELLS).getValue();
LOG.info("############# Num of ROWS imported= " + numRows);
LOG.info("############# Num of CELLS imported= " + numCells);
return successStatus;
}
protected static int setConfFromArgs(Configuration conf, String[] args) {
// basic pre-condition check
if (args.length < 4) {
usage("Wrong number of arguments: " + args.length);
return -1;
}
if (conf.get(BigtableOptionsFactory.PROJECT_ID_KEY) == null) {
usage("Must specify the property " + BigtableOptionsFactory.PROJECT_ID_KEY);
return -1;
}
if (conf.get(BigtableOptionsFactory.INSTANCE_ID_KEY) == null) {
usage("Must specify the property " + BigtableOptionsFactory.INSTANCE_ID_KEY);
return -1;
}
// set arguments in configuration for ease of use
conf.set(SNAPSHOTNAME_KEY, args[0]);
conf.set(HConstants.HBASE_DIR, args[1]);
conf.set(TABLENAME_KEY, args[2]);
conf.set(SNAPSHOT_RESTOREDIR_KEY, args[3]);
// set the default filesystem from the snapshot directory so unqualified paths resolve against it (e.g., gs://)
conf.set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, conf.get(HConstants.HBASE_DIR));
// set splits per region to its default if not already set (not required; set for consistency)
conf.setInt(
SNAPSHOT_SPLITS_PER_REGION_KEY,
conf.getInt(SNAPSHOT_SPLITS_PER_REGION_KEY, SNAPSHOT_SPLITS_PER_REGION_DEFAULT));
// default job name
conf.setIfUnset(
IMPORT_SNAPSHOT_JOBNAME_KEY,
ImportHBaseSnapshotJob.class.getName()
+ " - from snapshot "
+ conf.get(SNAPSHOTNAME_KEY)
+ " to "
+ conf.get(TABLENAME_KEY));
// implicit bigtable configs
BigtableConfiguration.configure(
conf,
conf.get(BigtableOptionsFactory.PROJECT_ID_KEY),
conf.get(BigtableOptionsFactory.INSTANCE_ID_KEY),
conf.get(BigtableOptionsFactory.APP_PROFILE_ID_KEY, ""));
// Set user agent
conf.set(BigtableOptionsFactory.CUSTOM_USER_AGENT_KEY, "HBaseMRImport");
// implicit table outputformat configs that are used in the job to write map output to a table
conf.set(TableOutputFormat.OUTPUT_TABLE, conf.get(TABLENAME_KEY));
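// Register HBase's Mutation and Result serializations with Hadoop so the framework can
// serialize Put and Result objects in this job (this mirrors what TableMapReduceUtil does).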
conf.setStrings(
"io.serializations",
conf.get("io.serializations"),
MutationSerialization.class.getName(),
ResultSerialization.class.getName());
return 0;
}
/**
* Sets up the actual job.
*
* @param conf The current configuration.
* @return The newly created job.
* @throws java.io.IOException When setting up the job fails.
*/
protected static Job createSubmittableJob(Configuration conf) throws IOException {
Job job = Job.getInstance(conf, conf.get(IMPORT_SNAPSHOT_JOBNAME_KEY));
job.setJarByClass(ImportHBaseSnapshotJob.class);
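// Configure a map-only scan over the restored snapshot files (no live HBase cluster is
// needed); each region is split into multiple input splits via UniformSplit so more
// mappers can run in parallel.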
ShuffledTableMapReduceUtil.initTableSnapshotMapperJob(
conf.get(SNAPSHOTNAME_KEY),
new Scan().setMaxVersions(),
SnapshotMapper.class,
ImmutableBytesWritable.class,
Writable.class,
job,
true,
new Path(conf.get(SNAPSHOT_RESTOREDIR_KEY)),
new UniformSplit(),
conf.getInt(SNAPSHOT_SPLITS_PER_REGION_KEY, SNAPSHOT_SPLITS_PER_REGION_DEFAULT));
job.setNumReduceTasks(0);
job.setOutputFormatClass(TableOutputFormat.class);
return job;
}
protected enum ScanCounter {
NUM_ROWS,
NUM_CELLS,
}
/** MAPPER */
protected static class SnapshotMapper extends TableMapper<ImmutableBytesWritable, Put> {
private static final Logger LOGGER = LoggerFactory.getLogger(SnapshotMapper.class);
private Boolean isEmptyRowWarned = false;
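// Bigtable limits a single MutateRowRequest to 100,000 mutations, so cap each Put just
// below that bound and split larger rows across multiple Puts in map().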
protected static final int MAX_CELLS = 100_000 - 1;
@Override
protected void setup(Context ctx) {
// noop
}
/**
* Converts the scanned Result into Puts. Deleted cells are not handled, as the Scan will not
* read through tombstones (see {@link org.apache.hadoop.hbase.client.Scan#setRaw(boolean)} if
* required). Additionally, see google.bigtable.v2#MutateRowRequest for details on splitting a
* result into chunks of at most MAX_CELLS.
*/
@Override
protected void map(ImmutableBytesWritable key, Result result, Context ctx)
throws IOException, InterruptedException {
List<Cell> cells = checkEmptyRow(result);
if (cells.isEmpty()) {
return;
}
ctx.getCounter(ScanCounter.NUM_ROWS).increment(1);
ctx.getCounter(ScanCounter.NUM_CELLS).increment(cells.size());
// Split the row into multiple puts if it exceeds the maximum mutation limit
Iterator<Cell> cellIt = cells.iterator();
while (cellIt.hasNext()) {
Put put = new Put(key.get());
// splitting one row's cells across multiple Puts loses the atomicity of the row
// mutation; rows that hit this condition may require careful handling.
for (int i = 0; i < MAX_CELLS && cellIt.hasNext(); i++) {
put.add(cellIt.next());
}
ctx.write(key, put);
}
}
@Override
protected void cleanup(Context ctx) {
// noop
}
// Warns about an empty row on the first occurrence only, and replaces a null cell list with an empty one.
private List<Cell> checkEmptyRow(Result result) {
List<Cell> cells = result.listCells();
if (cells == null) {
cells = Collections.emptyList();
}
if (!isEmptyRowWarned && cells.isEmpty()) {
LOGGER.warn("Encountered empty row. Was input file serialized by HBase 0.94?");
isEmptyRowWarned = true;
}
return cells;
}
}
/**
* Call this method to avoid the {@link #main(String[])} System.exit.
*
* @param conf the configuration to run the job with
* @param args command line input
* @return exit code of the job
* @throws Exception on job failure
*/
public static int innerMain(final Configuration conf, final String[] args) throws Exception {
return ToolRunner.run(conf, new ImportHBaseSnapshotJob(), args);
}
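// Example (hypothetical values, e.g. from a test harness):
//   Configuration conf = HBaseConfiguration.create();
//   conf.set(BigtableOptionsFactory.PROJECT_ID_KEY, "my-project");
//   conf.set(BigtableOptionsFactory.INSTANCE_ID_KEY, "my-instance");
//   int exitCode = ImportHBaseSnapshotJob.innerMain(conf, new String[] {
//       "table1-snapshot", "gs://my-bucket/export/table1-snapshot", "table1",
//       "gs://my-bucket/export/table1-restore"});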
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new ImportHBaseSnapshotJob(), args);
if (exitCode == 0) {
System.out.println("job appears to have completed successfully.");
} else {
System.err.println("job is exiting with exit code='" + exitCode + "'");
}
System.exit(exitCode);
}
}