org.apache.hadoop.hbase.mapreduce.SyncTable

This module contains implementations of InputFormat, OutputFormat, Mapper, Reducer, etc., which are needed for running MR jobs on tables, WALs, HFiles and other HBase-specific constructs. It also contains a number of tools: RowCounter, ImportTsv, Import, Export, CompactionTool, ExportSnapshot, WALPlayer, etc.
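For orientation, a typical end-to-end run pairs HashTable (which writes the hash manifest for the source table) with SyncTable. This is only a sketch; the ZooKeeper cluster key, HDFS path, and table name are placeholders taken from the printUsage() example further down in this file:

 $ hbase org.apache.hadoop.hbase.mapreduce.HashTable tableA hdfs://nn:9000/hashes/tableA
 $ hbase org.apache.hadoop.hbase.mapreduce.SyncTable --dryrun=true \
     --sourcezkcluster=zk1.example.com,zk2.example.com,zk3.example.com:2181:/hbase \
     hdfs://nn:9000/hashes/tableA tableA tableA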

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellBuilderFactory;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Throwables;

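/**
 * Tool that makes a target HBase table match a source table, using the hash data previously
 * written by {@link HashTable} for the source. The mapper re-hashes the corresponding key ranges
 * of the target table and rescans only those ranges whose hashes differ, emitting the Puts and
 * Deletes needed to reconcile them (or only counters when run with --dryrun=true).
 */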
@InterfaceAudience.Private
public class SyncTable extends Configured implements Tool {

  private static final Logger LOG = LoggerFactory.getLogger(SyncTable.class);

  static final String SOURCE_HASH_DIR_CONF_KEY = "sync.table.source.hash.dir";
  static final String SOURCE_TABLE_CONF_KEY = "sync.table.source.table.name";
  static final String TARGET_TABLE_CONF_KEY = "sync.table.target.table.name";
  static final String SOURCE_ZK_CLUSTER_CONF_KEY = "sync.table.source.zk.cluster";
  static final String TARGET_ZK_CLUSTER_CONF_KEY = "sync.table.target.zk.cluster";
  static final String DRY_RUN_CONF_KEY = "sync.table.dry.run";
  static final String DO_DELETES_CONF_KEY = "sync.table.do.deletes";
  static final String DO_PUTS_CONF_KEY = "sync.table.do.puts";
  static final String IGNORE_TIMESTAMPS = "sync.table.ignore.timestamps";

  Path sourceHashDir;
  String sourceTableName;
  String targetTableName;

  String sourceZkCluster;
  String targetZkCluster;
  boolean dryRun;
  boolean doDeletes = true;
  boolean doPuts = true;
  boolean ignoreTimestamps;

  Counters counters;

  public SyncTable(Configuration conf) {
    super(conf);
  }

  private void initCredentialsForHBase(String zookeeper, Job job) throws IOException {
    Configuration peerConf =
      HBaseConfiguration.createClusterConf(job.getConfiguration(), zookeeper);
    if ("kerberos".equalsIgnoreCase(peerConf.get("hbase.security.authentication"))) {
      TableMapReduceUtil.initCredentialsForCluster(job, peerConf);
    }
  }

  public Job createSubmittableJob(String[] args) throws IOException {
    FileSystem fs = sourceHashDir.getFileSystem(getConf());
    if (!fs.exists(sourceHashDir)) {
      throw new IOException("Source hash dir not found: " + sourceHashDir);
    }

    Job job = Job.getInstance(getConf(),
      getConf().get("mapreduce.job.name", "syncTable_" + sourceTableName + "-" + targetTableName));
    Configuration jobConf = job.getConfiguration();
    if ("kerberos".equalsIgnoreCase(jobConf.get("hadoop.security.authentication"))) {
      TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { sourceHashDir },
        getConf());
    }

    HashTable.TableHash tableHash = HashTable.TableHash.read(getConf(), sourceHashDir);
    LOG.info("Read source hash manifest: " + tableHash);
    LOG.info("Read " + tableHash.partitions.size() + " partition keys");
    if (!tableHash.tableName.equals(sourceTableName)) {
      LOG.warn("Table name mismatch - manifest indicates hash was taken from: "
        + tableHash.tableName + " but job is reading from: " + sourceTableName);
    }
    if (tableHash.numHashFiles != tableHash.partitions.size() + 1) {
      throw new RuntimeException("Hash data appears corrupt. The number of of hash files created"
        + " should be 1 more than the number of partition keys.  However, the manifest file "
        + " says numHashFiles=" + tableHash.numHashFiles + " but the number of partition keys"
        + " found in the partitions file is " + tableHash.partitions.size());
    }

    Path dataDir = new Path(sourceHashDir, HashTable.HASH_DATA_DIR);
    int dataSubdirCount = 0;
    for (FileStatus file : fs.listStatus(dataDir)) {
      if (file.getPath().getName().startsWith(HashTable.OUTPUT_DATA_FILE_PREFIX)) {
        dataSubdirCount++;
      }
    }

    if (dataSubdirCount != tableHash.numHashFiles) {
      throw new RuntimeException("Hash data appears corrupt. The number of of hash files created"
        + " should be 1 more than the number of partition keys.  However, the number of data dirs"
        + " found is " + dataSubdirCount + " but the number of partition keys"
        + " found in the partitions file is " + tableHash.partitions.size());
    }

    job.setJarByClass(HashTable.class);
    jobConf.set(SOURCE_HASH_DIR_CONF_KEY, sourceHashDir.toString());
    jobConf.set(SOURCE_TABLE_CONF_KEY, sourceTableName);
    jobConf.set(TARGET_TABLE_CONF_KEY, targetTableName);
    if (sourceZkCluster != null) {
      jobConf.set(SOURCE_ZK_CLUSTER_CONF_KEY, sourceZkCluster);
      initCredentialsForHBase(sourceZkCluster, job);
    }
    if (targetZkCluster != null) {
      jobConf.set(TARGET_ZK_CLUSTER_CONF_KEY, targetZkCluster);
      initCredentialsForHBase(targetZkCluster, job);
    }
    jobConf.setBoolean(DRY_RUN_CONF_KEY, dryRun);
    jobConf.setBoolean(DO_DELETES_CONF_KEY, doDeletes);
    jobConf.setBoolean(DO_PUTS_CONF_KEY, doPuts);
    jobConf.setBoolean(IGNORE_TIMESTAMPS, ignoreTimestamps);

    TableMapReduceUtil.initTableMapperJob(targetTableName, tableHash.initScan(), SyncMapper.class,
      null, null, job);

    job.setNumReduceTasks(0);

    if (dryRun) {
      job.setOutputFormatClass(NullOutputFormat.class);
    } else {
      // No reducers. Just write straight to table. Call initTableReducerJob
      // because it sets up the TableOutputFormat.
      TableMapReduceUtil.initTableReducerJob(targetTableName, null, job, null, targetZkCluster,
        null, null);

      // would be nice to add an option for bulk load instead
    }

    // Obtain an authentication token, for the specified cluster, on behalf of the current user
    if (sourceZkCluster != null) {
      Configuration peerConf =
        HBaseConfiguration.createClusterConf(job.getConfiguration(), sourceZkCluster);
      TableMapReduceUtil.initCredentialsForCluster(job, peerConf);
    }
    return job;
  }

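  /**
   * Mapper run over the target table. It replays the hash batches recorded in the source hash
   * manifest: target rows are hashed batch by batch, each batch hash is compared to the stored
   * source hash, and mismatching ranges are rescanned from both tables and reconciled cell by
   * cell.
   */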
  public static class SyncMapper extends TableMapper<ImmutableBytesWritable, Mutation> {
    Path sourceHashDir;

    Connection sourceConnection;
    Connection targetConnection;
    Table sourceTable;
    Table targetTable;
    boolean dryRun;
    boolean doDeletes = true;
    boolean doPuts = true;
    boolean ignoreTimestamp;

    HashTable.TableHash sourceTableHash;
    HashTable.TableHash.Reader sourceHashReader;
    ImmutableBytesWritable currentSourceHash;
    ImmutableBytesWritable nextSourceKey;
    HashTable.ResultHasher targetHasher;

    Throwable mapperException;

    public static enum Counter {
      BATCHES,
      HASHES_MATCHED,
      HASHES_NOT_MATCHED,
      SOURCEMISSINGROWS,
      SOURCEMISSINGCELLS,
      TARGETMISSINGROWS,
      TARGETMISSINGCELLS,
      ROWSWITHDIFFS,
      DIFFERENTCELLVALUES,
      MATCHINGROWS,
      MATCHINGCELLS,
      EMPTY_BATCHES,
      RANGESMATCHED,
      RANGESNOTMATCHED
    };

    @Override
    protected void setup(Context context) throws IOException {

      Configuration conf = context.getConfiguration();
      sourceHashDir = new Path(conf.get(SOURCE_HASH_DIR_CONF_KEY));
      sourceConnection = openConnection(conf, SOURCE_ZK_CLUSTER_CONF_KEY, null);
      targetConnection =
        openConnection(conf, TARGET_ZK_CLUSTER_CONF_KEY, TableOutputFormat.OUTPUT_CONF_PREFIX);
      sourceTable = openTable(sourceConnection, conf, SOURCE_TABLE_CONF_KEY);
      targetTable = openTable(targetConnection, conf, TARGET_TABLE_CONF_KEY);
      dryRun = conf.getBoolean(DRY_RUN_CONF_KEY, false);
      doDeletes = conf.getBoolean(DO_DELETES_CONF_KEY, true);
      doPuts = conf.getBoolean(DO_PUTS_CONF_KEY, true);
      ignoreTimestamp = conf.getBoolean(IGNORE_TIMESTAMPS, false);

      sourceTableHash = HashTable.TableHash.read(conf, sourceHashDir);
      LOG.info("Read source hash manifest: " + sourceTableHash);
      LOG.info("Read " + sourceTableHash.partitions.size() + " partition keys");

      TableSplit split = (TableSplit) context.getInputSplit();
      ImmutableBytesWritable splitStartKey = new ImmutableBytesWritable(split.getStartRow());

      sourceHashReader = sourceTableHash.newReader(conf, splitStartKey);
      findNextKeyHashPair();

      // create a hasher, but don't start it right away
      // instead, find the first hash batch at or after the start row
      // and skip any rows that come before. they will be caught by the previous task
      targetHasher = new HashTable.ResultHasher();
      targetHasher.ignoreTimestamps = ignoreTimestamp;
    }

    private static Connection openConnection(Configuration conf, String zkClusterConfKey,
      String configPrefix) throws IOException {
      String zkCluster = conf.get(zkClusterConfKey);
      Configuration clusterConf =
        HBaseConfiguration.createClusterConf(conf, zkCluster, configPrefix);
      return ConnectionFactory.createConnection(clusterConf);
    }

    private static Table openTable(Connection connection, Configuration conf,
      String tableNameConfKey) throws IOException {
      return connection.getTable(TableName.valueOf(conf.get(tableNameConfKey)));
    }

    /**
     * Attempt to read the next source key/hash pair. If there are no more, set nextSourceKey to
     * null
     */
    private void findNextKeyHashPair() throws IOException {
      boolean hasNext = sourceHashReader.next();
      if (hasNext) {
        nextSourceKey = sourceHashReader.getCurrentKey();
      } else {
        // no more keys - last hash goes to the end
        nextSourceKey = null;
      }
    }

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
      throws IOException, InterruptedException {
      try {
        // first, finish any hash batches that end before the scanned row
        while (nextSourceKey != null && key.compareTo(nextSourceKey) >= 0) {
          moveToNextBatch(context);
        }

        // next, add the scanned row (as long as we've reached the first batch)
        if (targetHasher.isBatchStarted()) {
          targetHasher.hashResult(value);
        }
      } catch (Throwable t) {
        mapperException = t;
        Throwables.propagateIfInstanceOf(t, IOException.class);
        Throwables.propagateIfInstanceOf(t, InterruptedException.class);
        Throwables.propagate(t);
      }
    }

    /**
     * If there is an open hash batch, complete it and sync if there are diffs. Start a new batch,
     * and seek to read the next source key/hash pair.
     */
    private void moveToNextBatch(Context context) throws IOException, InterruptedException {
      if (targetHasher.isBatchStarted()) {
        finishBatchAndCompareHashes(context);
      }
      targetHasher.startBatch(nextSourceKey);
      currentSourceHash = sourceHashReader.getCurrentHash();

      findNextKeyHashPair();
    }

    /**
     * Finish the currently open hash batch. Compare the target hash to the given source hash. If
     * they do not match, then sync the covered key range.
     */
    private void finishBatchAndCompareHashes(Context context)
      throws IOException, InterruptedException {
      targetHasher.finishBatch();
      context.getCounter(Counter.BATCHES).increment(1);
      if (targetHasher.getBatchSize() == 0) {
        context.getCounter(Counter.EMPTY_BATCHES).increment(1);
      }
      ImmutableBytesWritable targetHash = targetHasher.getBatchHash();
      if (targetHash.equals(currentSourceHash)) {
        context.getCounter(Counter.HASHES_MATCHED).increment(1);
      } else {
        context.getCounter(Counter.HASHES_NOT_MATCHED).increment(1);

        ImmutableBytesWritable stopRow = nextSourceKey == null
          ? new ImmutableBytesWritable(sourceTableHash.stopRow)
          : nextSourceKey;

        if (LOG.isDebugEnabled()) {
          LOG.debug("Hash mismatch.  Key range: " + toHex(targetHasher.getBatchStartKey()) + " to "
            + toHex(stopRow) + " sourceHash: " + toHex(currentSourceHash) + " targetHash: "
            + toHex(targetHash));
        }

        syncRange(context, targetHasher.getBatchStartKey(), stopRow);
      }
    }

    private static String toHex(ImmutableBytesWritable bytes) {
      return Bytes.toHex(bytes.get(), bytes.getOffset(), bytes.getLength());
    }

    private static final CellScanner EMPTY_CELL_SCANNER =
      new CellScanner(Collections.<Result> emptyIterator());

    /**
     * Rescan the given range directly from the source and target tables. Count and log differences,
     * and if this is not a dry run, output Puts and Deletes to make the target table match the
     * source table for this range
     */
    private void syncRange(Context context, ImmutableBytesWritable startRow,
      ImmutableBytesWritable stopRow) throws IOException, InterruptedException {
      Scan scan = sourceTableHash.initScan();
      scan.setStartRow(startRow.copyBytes());
      scan.setStopRow(stopRow.copyBytes());

      ResultScanner sourceScanner = sourceTable.getScanner(scan);
      CellScanner sourceCells = new CellScanner(sourceScanner.iterator());

      ResultScanner targetScanner = targetTable.getScanner(new Scan(scan));
      CellScanner targetCells = new CellScanner(targetScanner.iterator());

      boolean rangeMatched = true;
      byte[] nextSourceRow = sourceCells.nextRow();
      byte[] nextTargetRow = targetCells.nextRow();
      while (nextSourceRow != null || nextTargetRow != null) {
        boolean rowMatched;
        int rowComparison = compareRowKeys(nextSourceRow, nextTargetRow);
        if (rowComparison < 0) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Target missing row: " + Bytes.toString(nextSourceRow));
          }
          context.getCounter(Counter.TARGETMISSINGROWS).increment(1);

          rowMatched = syncRowCells(context, nextSourceRow, sourceCells, EMPTY_CELL_SCANNER);
          nextSourceRow = sourceCells.nextRow(); // advance only source to next row
        } else if (rowComparison > 0) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Source missing row: " + Bytes.toString(nextTargetRow));
          }
          context.getCounter(Counter.SOURCEMISSINGROWS).increment(1);

          rowMatched = syncRowCells(context, nextTargetRow, EMPTY_CELL_SCANNER, targetCells);
          nextTargetRow = targetCells.nextRow(); // advance only target to next row
        } else {
          // current row is the same on both sides, compare cell by cell
          rowMatched = syncRowCells(context, nextSourceRow, sourceCells, targetCells);
          nextSourceRow = sourceCells.nextRow();
          nextTargetRow = targetCells.nextRow();
        }

        if (!rowMatched) {
          rangeMatched = false;
        }
      }

      sourceScanner.close();
      targetScanner.close();

      context.getCounter(rangeMatched ? Counter.RANGESMATCHED : Counter.RANGESNOTMATCHED)
        .increment(1);
    }

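    /**
     * Iterates the cells of a ResultScanner one row at a time. With scan batching enabled, a
     * single row may be split across several Result objects, so consecutive Results that share a
     * row key are stitched back together here.
     */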
    private static class CellScanner {
      private final Iterator<Result> results;

      private byte[] currentRow;
      private Result currentRowResult;
      private int nextCellInRow;

      private Result nextRowResult;

      public CellScanner(Iterator<Result> results) {
        this.results = results;
      }

      /**
       * Advance to the next row and return its row key. Returns null iff there are no more rows.
       */
      public byte[] nextRow() {
        if (nextRowResult == null) {
          // no cached row - check scanner for more
          while (results.hasNext()) {
            nextRowResult = results.next();
            Cell nextCell = nextRowResult.rawCells()[0];
            if (
              currentRow == null || !Bytes.equals(currentRow, 0, currentRow.length,
                nextCell.getRowArray(), nextCell.getRowOffset(), nextCell.getRowLength())
            ) {
              // found next row
              break;
            } else {
              // found another result from current row, keep scanning
              nextRowResult = null;
            }
          }

          if (nextRowResult == null) {
            // end of data, no more rows
            currentRowResult = null;
            currentRow = null;
            return null;
          }
        }

        // advance to cached result for next row
        currentRowResult = nextRowResult;
        nextCellInRow = 0;
        currentRow = currentRowResult.getRow();
        nextRowResult = null;
        return currentRow;
      }

      /**
       * Returns the next Cell in the current row or null iff none remain.
       */
      public Cell nextCellInRow() {
        if (currentRowResult == null) {
          // nothing left in current row
          return null;
        }

        Cell nextCell = currentRowResult.rawCells()[nextCellInRow];
        nextCellInRow++;
        if (nextCellInRow == currentRowResult.size()) {
          if (results.hasNext()) {
            Result result = results.next();
            Cell cell = result.rawCells()[0];
            if (
              Bytes.equals(currentRow, 0, currentRow.length, cell.getRowArray(),
                cell.getRowOffset(), cell.getRowLength())
            ) {
              // result is part of current row
              currentRowResult = result;
              nextCellInRow = 0;
            } else {
              // result is part of next row, cache it
              nextRowResult = result;
              // current row is complete
              currentRowResult = null;
            }
          } else {
            // end of data
            currentRowResult = null;
          }
        }
        return nextCell;
      }
    }

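    /**
     * When ignoreTimestamp is set, rebuild the source cell with the current wall-clock time as its
     * timestamp, so the Put written to the target carries a fresh timestamp rather than the source
     * cell's original one.
     */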
    private Cell checkAndResetTimestamp(Cell sourceCell) {
      if (ignoreTimestamp) {
        sourceCell =
          CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY).setType(sourceCell.getType())
            .setRow(sourceCell.getRowArray(), sourceCell.getRowOffset(), sourceCell.getRowLength())
            .setFamily(sourceCell.getFamilyArray(), sourceCell.getFamilyOffset(),
              sourceCell.getFamilyLength())
            .setQualifier(sourceCell.getQualifierArray(), sourceCell.getQualifierOffset(),
              sourceCell.getQualifierLength())
            .setTimestamp(System.currentTimeMillis()).setValue(sourceCell.getValueArray(),
              sourceCell.getValueOffset(), sourceCell.getValueLength())
            .build();
      }
      return sourceCell;
    }

    /**
     * Compare the cells for the given row from the source and target tables. Count and log any
     * differences. If not a dry run, output a Put and/or Delete needed to sync the target table to
     * match the source table.
     */
    private boolean syncRowCells(Context context, byte[] rowKey, CellScanner sourceCells,
      CellScanner targetCells) throws IOException, InterruptedException {
      Put put = null;
      Delete delete = null;
      long matchingCells = 0;
      boolean matchingRow = true;
      Cell sourceCell = sourceCells.nextCellInRow();
      Cell targetCell = targetCells.nextCellInRow();
      while (sourceCell != null || targetCell != null) {

        int cellKeyComparison = compareCellKeysWithinRow(sourceCell, targetCell);
        if (cellKeyComparison < 0) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Target missing cell: " + sourceCell);
          }
          context.getCounter(Counter.TARGETMISSINGCELLS).increment(1);
          matchingRow = false;

          if (!dryRun && doPuts) {
            if (put == null) {
              put = new Put(rowKey);
            }
            sourceCell = checkAndResetTimestamp(sourceCell);
            put.add(sourceCell);
          }

          sourceCell = sourceCells.nextCellInRow();
        } else if (cellKeyComparison > 0) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Source missing cell: " + targetCell);
          }
          context.getCounter(Counter.SOURCEMISSINGCELLS).increment(1);
          matchingRow = false;

          if (!dryRun && doDeletes) {
            if (delete == null) {
              delete = new Delete(rowKey);
            }
            // add a tombstone to exactly match the target cell that is missing on the source
            delete.addColumn(CellUtil.cloneFamily(targetCell), CellUtil.cloneQualifier(targetCell),
              targetCell.getTimestamp());
          }

          targetCell = targetCells.nextCellInRow();
        } else {
          // the cell keys are equal, now check values
          if (CellUtil.matchingValue(sourceCell, targetCell)) {
            matchingCells++;
          } else {
            if (LOG.isDebugEnabled()) {
              LOG.debug("Different values: ");
              LOG.debug("  source cell: " + sourceCell + " value: "
                + Bytes.toString(sourceCell.getValueArray(), sourceCell.getValueOffset(),
                  sourceCell.getValueLength()));
              LOG.debug("  target cell: " + targetCell + " value: "
                + Bytes.toString(targetCell.getValueArray(), targetCell.getValueOffset(),
                  targetCell.getValueLength()));
            }
            context.getCounter(Counter.DIFFERENTCELLVALUES).increment(1);
            matchingRow = false;

            if (!dryRun && doPuts) {
              // overwrite target cell
              if (put == null) {
                put = new Put(rowKey);
              }
              sourceCell = checkAndResetTimestamp(sourceCell);
              put.add(sourceCell);
            }
          }
          sourceCell = sourceCells.nextCellInRow();
          targetCell = targetCells.nextCellInRow();
        }

        if (!dryRun && sourceTableHash.scanBatch > 0) {
          if (put != null && put.size() >= sourceTableHash.scanBatch) {
            context.write(new ImmutableBytesWritable(rowKey), put);
            put = null;
          }
          if (delete != null && delete.size() >= sourceTableHash.scanBatch) {
            context.write(new ImmutableBytesWritable(rowKey), delete);
            delete = null;
          }
        }
      }

      if (!dryRun) {
        if (put != null) {
          context.write(new ImmutableBytesWritable(rowKey), put);
        }
        if (delete != null) {
          context.write(new ImmutableBytesWritable(rowKey), delete);
        }
      }

      if (matchingCells > 0) {
        context.getCounter(Counter.MATCHINGCELLS).increment(matchingCells);
      }
      if (matchingRow) {
        context.getCounter(Counter.MATCHINGROWS).increment(1);
        return true;
      } else {
        context.getCounter(Counter.ROWSWITHDIFFS).increment(1);
        return false;
      }
    }

    /**
     * Compare the given row keys. Nulls sort after non-nulls.
     */
    private static int compareRowKeys(byte[] r1, byte[] r2) {
      if (r1 == null) {
        return 1; // source missing row
      } else if (r2 == null) {
        return -1; // target missing row
      } else {
        // SyncTable is never run against META tables, so comparing the raw row keys with
        // Bytes.compareTo matches what CellComparator does internally; MetaCellComparator is
        // never needed here.
        return Bytes.compareTo(r1, 0, r1.length, r2, 0, r2.length);
      }
    }

    /**
     * Compare families, qualifiers, and timestamps of the given Cells. They are assumed to be of
     * the same row. Nulls are after non-nulls.
     */
    private int compareCellKeysWithinRow(Cell c1, Cell c2) {
      if (c1 == null) {
        return 1; // source missing cell
      }
      if (c2 == null) {
        return -1; // target missing cell
      }

      int result = CellComparator.getInstance().compareFamilies(c1, c2);
      if (result != 0) {
        return result;
      }

      result = CellComparator.getInstance().compareQualifiers(c1, c2);
      if (result != 0) {
        return result;
      }

      if (this.ignoreTimestamp) {
        return 0;
      } else {
        // note timestamp comparison is inverted - more recent cells first
        return CellComparator.getInstance().compareTimestamps(c1, c2);
      }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
      if (mapperException == null) {
        try {
          finishRemainingHashRanges(context);
        } catch (Throwable t) {
          mapperException = t;
        }
      }

      try {
        sourceTable.close();
        targetTable.close();
        sourceConnection.close();
        targetConnection.close();
      } catch (Throwable t) {
        if (mapperException == null) {
          mapperException = t;
        } else {
          LOG.error("Suppressing exception from closing tables", t);
        }
      }

      // propagate first exception
      if (mapperException != null) {
        Throwables.propagateIfInstanceOf(mapperException, IOException.class);
        Throwables.propagateIfInstanceOf(mapperException, InterruptedException.class);
        Throwables.propagate(mapperException);
      }
    }

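    /**
     * The final hash batch of a split may extend beyond the split's end row. Scan the target table
     * from the split end row up to the batch boundary (or the table hash stop row) to complete the
     * open batch, then compare and, if needed, sync it as usual.
     */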
    private void finishRemainingHashRanges(Context context)
      throws IOException, InterruptedException {
      TableSplit split = (TableSplit) context.getInputSplit();
      byte[] splitEndRow = split.getEndRow();
      boolean reachedEndOfTable = HashTable.isTableEndRow(splitEndRow);

      // if there are more hash batches that begin before the end of this split move to them
      while (
        nextSourceKey != null && (nextSourceKey.compareTo(splitEndRow) < 0 || reachedEndOfTable)
      ) {
        moveToNextBatch(context);
      }

      if (targetHasher.isBatchStarted()) {
        // need to complete the final open hash batch

        if (
          (nextSourceKey != null && nextSourceKey.compareTo(splitEndRow) > 0)
            || (nextSourceKey == null && !Bytes.equals(splitEndRow, sourceTableHash.stopRow))
        ) {
          // the open hash range continues past the end of this region
          // add a scan to complete the current hash range
          Scan scan = sourceTableHash.initScan();
          scan.setStartRow(splitEndRow);
          if (nextSourceKey == null) {
            scan.setStopRow(sourceTableHash.stopRow);
          } else {
            scan.setStopRow(nextSourceKey.copyBytes());
          }

          ResultScanner targetScanner = null;
          try {
            targetScanner = targetTable.getScanner(scan);
            for (Result row : targetScanner) {
              targetHasher.hashResult(row);
            }
          } finally {
            if (targetScanner != null) {
              targetScanner.close();
            }
          }
        } // else current batch ends exactly at split end row

        finishBatchAndCompareHashes(context);
      }
    }
  }

  private static final int NUM_ARGS = 3;

  private static void printUsage(final String errorMsg) {
    if (errorMsg != null && errorMsg.length() > 0) {
      System.err.println("ERROR: " + errorMsg);
      System.err.println();
    }
    System.err.println("Usage: SyncTable [options]   ");
    System.err.println();
    System.err.println("Options:");

    System.err.println(" sourcezkcluster  ZK cluster key of the source table");
    System.err.println("                  (defaults to cluster in classpath's config)");
    System.err.println(" targetzkcluster  ZK cluster key of the target table");
    System.err.println("                  (defaults to cluster in classpath's config)");
    System.err.println(" dryrun           if true, output counters but no writes");
    System.err.println("                  (defaults to false)");
    System.err.println(" doDeletes        if false, does not perform deletes");
    System.err.println("                  (defaults to true)");
    System.err.println(" doPuts           if false, does not perform puts");
    System.err.println("                  (defaults to true)");
    System.err.println(" ignoreTimestamps if true, ignores cells timestamps while comparing ");
    System.err.println("                  cell values. Any missing cell on target then gets");
    System.err.println("                  added with current time as timestamp ");
    System.err.println("                  (defaults to false)");
    System.err.println();
    System.err.println("Args:");
    System.err.println(" sourcehashdir    path to HashTable output dir for source table");
    System.err.println("                  (see org.apache.hadoop.hbase.mapreduce.HashTable)");
    System.err.println(" sourcetable      Name of the source table to sync from");
    System.err.println(" targettable      Name of the target table to sync to");
    System.err.println();
    System.err.println("Examples:");
    System.err.println(" For a dry run SyncTable of tableA from a remote source cluster");
    System.err.println(" to a local target cluster:");
    System.err.println(" $ hbase " + "org.apache.hadoop.hbase.mapreduce.SyncTable --dryrun=true"
      + " --sourcezkcluster=zk1.example.com,zk2.example.com,zk3.example.com:2181:/hbase"
      + " hdfs://nn:9000/hashes/tableA tableA tableA");
  }

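  /** Parses the three trailing positional arguments and any leading --key=value options. */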
  private boolean doCommandLine(final String[] args) {
    if (args.length < NUM_ARGS) {
      printUsage(null);
      return false;
    }
    try {
      sourceHashDir = new Path(args[args.length - 3]);
      sourceTableName = args[args.length - 2];
      targetTableName = args[args.length - 1];

      for (int i = 0; i < args.length - NUM_ARGS; i++) {
        String cmd = args[i];
        if (cmd.equals("-h") || cmd.startsWith("--h")) {
          printUsage(null);
          return false;
        }

        final String sourceZkClusterKey = "--sourcezkcluster=";
        if (cmd.startsWith(sourceZkClusterKey)) {
          sourceZkCluster = cmd.substring(sourceZkClusterKey.length());
          continue;
        }

        final String targetZkClusterKey = "--targetzkcluster=";
        if (cmd.startsWith(targetZkClusterKey)) {
          targetZkCluster = cmd.substring(targetZkClusterKey.length());
          continue;
        }

        final String dryRunKey = "--dryrun=";
        if (cmd.startsWith(dryRunKey)) {
          dryRun = Boolean.parseBoolean(cmd.substring(dryRunKey.length()));
          continue;
        }

        final String doDeletesKey = "--doDeletes=";
        if (cmd.startsWith(doDeletesKey)) {
          doDeletes = Boolean.parseBoolean(cmd.substring(doDeletesKey.length()));
          continue;
        }

        final String doPutsKey = "--doPuts=";
        if (cmd.startsWith(doPutsKey)) {
          doPuts = Boolean.parseBoolean(cmd.substring(doPutsKey.length()));
          continue;
        }

        final String ignoreTimestampsKey = "--ignoreTimestamps=";
        if (cmd.startsWith(ignoreTimestampsKey)) {
          ignoreTimestamps = Boolean.parseBoolean(cmd.substring(ignoreTimestampsKey.length()));
          continue;
        }

        printUsage("Invalid argument '" + cmd + "'");
        return false;
      }

    } catch (Exception e) {
      e.printStackTrace();
      printUsage("Can't start because " + e.getMessage());
      return false;
    }
    return true;
  }

  /**
   * Main entry point.
   */
  public static void main(String[] args) throws Exception {
    int ret = ToolRunner.run(new SyncTable(HBaseConfiguration.create()), args);
    System.exit(ret);
  }

  @Override
  public int run(String[] args) throws Exception {
    String[] otherArgs = new GenericOptionsParser(getConf(), args).getRemainingArgs();
    if (!doCommandLine(otherArgs)) {
      return 1;
    }

    Job job = createSubmittableJob(otherArgs);
    if (!job.waitForCompletion(true)) {
      LOG.info("Map-reduce job failed!");
      return 1;
    }
    counters = job.getCounters();
    return 0;
  }

}