
org.apache.hadoop.hbase.mapreduce.replication.VerifyReplication


HBase is the Hadoop (http://hadoop.apache.org) database. Use it when you need random, realtime read/write access to your Big Data. This project's goal is the hosting of very large tables -- billions of rows X millions of columns -- atop clusters of commodity hardware.

/**
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce.replication;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HConnectionManager.HConnectable;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.replication.ReplicationPeer;
import org.apache.hadoop.hbase.replication.ReplicationZookeeper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.zookeeper.KeeperException;

/**
 * This map-only job compares the data from a local table with a remote one.
 * Every cell is compared and must have exactly the same keys (even timestamp)
 * as well as same value. It is possible to restrict the job by time range and
 * families. The peer id that's provided must match the one given when the
 * replication stream was setup.
 * <p>
 * Two counters are provided, Verifier.Counters.GOODROWS and BADROWS. The
 * reason why a row is different is shown in the map's log.
 */
public class VerifyReplication {

  private static final Log LOG =
      LogFactory.getLog(VerifyReplication.class);

  public final static String NAME = "verifyrep";
  static long startTime = 0;
  static long endTime = 0;
  static String tableName = null;
  static String families = null;
  static String peerId = null;

  /**
   * Map-only comparator for 2 tables
   */
  public static class Verifier
      extends TableMapper<ImmutableBytesWritable, Put> {

    public static enum Counters {GOODROWS, BADROWS}

    private ResultScanner replicatedScanner;

    /**
     * Map method that compares every scanned row with the equivalent from
     * a distant cluster.
     * @param row The current table row key.
     * @param value The columns.
     * @param context The current context.
     * @throws IOException When something is broken with the data.
     */
    @Override
    public void map(ImmutableBytesWritable row, final Result value,
                    Context context) throws IOException {
      if (replicatedScanner == null) {
        // Lazily open a scanner against the peer cluster on the first call,
        // starting at the first row this map task sees.
        Configuration conf = context.getConfiguration();
        final Scan scan = new Scan();
        scan.setCaching(conf.getInt(TableInputFormat.SCAN_CACHEDROWS, 1));
        long startTime = conf.getLong(NAME + ".startTime", 0);
        long endTime = conf.getLong(NAME + ".endTime", 0);
        String families = conf.get(NAME + ".families", null);
        if (families != null) {
          String[] fams = families.split(",");
          for (String fam : fams) {
            scan.addFamily(Bytes.toBytes(fam));
          }
        }
        if (startTime != 0) {
          scan.setTimeRange(startTime,
              endTime == 0 ? HConstants.LATEST_TIMESTAMP : endTime);
        }
        HConnectionManager.execute(new HConnectable<Void>(conf) {
          @Override
          public Void connect(HConnection conn) throws IOException {
            try {
              ReplicationZookeeper zk = new ReplicationZookeeper(conn, conf,
                  conn.getZooKeeperWatcher());
              ReplicationPeer peer = zk.getPeer(conf.get(NAME + ".peerId"));
              HTable replicatedTable = new HTable(peer.getConfiguration(),
                  conf.get(NAME + ".tableName"));
              scan.setStartRow(value.getRow());
              replicatedScanner = replicatedTable.getScanner(scan);
            } catch (KeeperException e) {
              throw new IOException("Got a ZK exception", e);
            }
            return null;
          }
        });
      }
      // Both scanners advance in lock step; compareResults throws if the
      // two rows differ in any key, timestamp, or value.
      Result res = replicatedScanner.next();
      try {
        Result.compareResults(value, res);
        context.getCounter(Counters.GOODROWS).increment(1);
      } catch (Exception e) {
        LOG.warn("Bad row", e);
        context.getCounter(Counters.BADROWS).increment(1);
      }
    }

    @Override
    protected void cleanup(Context context) {
      if (replicatedScanner != null) {
        replicatedScanner.close();
        replicatedScanner = null;
      }
    }
  }

  /**
   * Sets up the actual job.
   *
   * @param conf The current configuration.
   * @param args The command line parameters.
   * @return The newly created job.
   * @throws java.io.IOException When setting up the job fails.
   */
  public static Job createSubmittableJob(Configuration conf, String[] args)
      throws IOException {
    if (!doCommandLine(args)) {
      return null;
    }
    if (!conf.getBoolean(HConstants.REPLICATION_ENABLE_KEY, false)) {
      throw new IOException("Replication needs to be enabled to verify it.");
    }
    HConnectionManager.execute(new HConnectable<Void>(conf) {
      @Override
      public Void connect(HConnection conn) throws IOException {
        try {
          ReplicationZookeeper zk = new ReplicationZookeeper(conn, conf,
              conn.getZooKeeperWatcher());
          // Just verifying that we can connect to the peer
          ReplicationPeer peer = zk.getPeer(peerId);
          if (peer == null) {
            throw new IOException("Couldn't get access to the slave cluster, " +
                "please see the log");
          }
        } catch (KeeperException ex) {
          throw new IOException("Couldn't get access to the slave cluster" +
              " because: ", ex);
        }
        return null;
      }
    });
    conf.set(NAME + ".peerId", peerId);
    conf.set(NAME + ".tableName", tableName);
    conf.setLong(NAME + ".startTime", startTime);
    conf.setLong(NAME + ".endTime", endTime);
    if (families != null) {
      conf.set(NAME + ".families", families);
    }
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(VerifyReplication.class);
    Scan scan = new Scan();
    if (startTime != 0) {
      scan.setTimeRange(startTime,
          endTime == 0 ? HConstants.LATEST_TIMESTAMP : endTime);
    }
    if (families != null) {
      String[] fams = families.split(",");
      for (String fam : fams) {
        scan.addFamily(Bytes.toBytes(fam));
      }
    }
    TableMapReduceUtil.initTableMapperJob(tableName, scan,
        Verifier.class, null, null, job);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);
    return job;
  }

  private static boolean doCommandLine(final String[] args) {
    if (args.length < 2) {
      printUsage(null);
      return false;
    }
    try {
      for (int i = 0; i < args.length; i++) {
        String cmd = args[i];
        if (cmd.equals("-h") || cmd.startsWith("--h")) {
          printUsage(null);
          return false;
        }
        final String startTimeArgKey = "--starttime=";
        if (cmd.startsWith(startTimeArgKey)) {
          startTime = Long.parseLong(cmd.substring(startTimeArgKey.length()));
          continue;
        }
        final String endTimeArgKey = "--endtime=";
        if (cmd.startsWith(endTimeArgKey)) {
          endTime = Long.parseLong(cmd.substring(endTimeArgKey.length()));
          continue;
        }
        final String familiesArgKey = "--families=";
        if (cmd.startsWith(familiesArgKey)) {
          families = cmd.substring(familiesArgKey.length());
          continue;
        }
        // The last two positional arguments are the peer id and table name.
        if (i == args.length - 2) {
          peerId = cmd;
        }
        if (i == args.length - 1) {
          tableName = cmd;
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      printUsage("Can't start because " + e.getMessage());
      return false;
    }
    return true;
  }

  /*
   * @param errorMsg Error message. Can be null.
   */
  private static void printUsage(final String errorMsg) {
    if (errorMsg != null && errorMsg.length() > 0) {
      System.err.println("ERROR: " + errorMsg);
    }
    System.err.println("Usage: verifyrep [--starttime=X]" +
        " [--endtime=Y] [--families=A] <peerid> <tablename>");
    System.err.println();
    System.err.println("Options:");
    System.err.println(" starttime    beginning of the time range");
    System.err.println("              without endtime means from starttime to forever");
    System.err.println(" endtime      end of the time range");
    System.err.println(" families     comma-separated list of families to verify");
    System.err.println();
    System.err.println("Args:");
    System.err.println(" peerid       Id of the peer used for verification," +
        " must match the one given for replication");
    System.err.println(" tablename    Name of the table to verify");
    System.err.println();
    System.err.println("Examples:");
    System.err.println(" To verify the data replicated from TestTable" +
        " for a 1 hour window with peer #5");
    System.err.println(" $ bin/hbase " +
        "org.apache.hadoop.hbase.mapreduce.replication.VerifyReplication" +
        " --starttime=1265875194289 --endtime=1265878794289 5 TestTable");
  }

  /**
   * Main entry point.
   *
   * @param args The command line parameters.
   * @throws Exception When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    Job job = createSubmittableJob(conf, args);
    if (job != null) {
      System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
  }
}
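For reference, here is a minimal driver sketch showing how the job above can be submitted programmatically through createSubmittableJob and how the GOODROWS/BADROWS counters can be read back afterwards. The peer id "5", the table name "TestTable", the family "cf1", and the class name VerifyReplicationDriver are placeholder assumptions for this sketch; it also presumes that replication (hbase.replication) is enabled on the source cluster and that a peer with the given id has already been registered.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.replication.VerifyReplication;
import org.apache.hadoop.mapreduce.Job;

public class VerifyReplicationDriver {

  public static void main(String[] args) throws Exception {
    // Picks up the hbase-site.xml of the *source* cluster from the classpath.
    Configuration conf = HBaseConfiguration.create();

    // Same positional layout doCommandLine() parses: optional flags first,
    // then the peer id, then the table name. The values below are
    // placeholders, not part of the original source.
    String[] jobArgs = {
        "--starttime=1265875194289",
        "--endtime=1265878794289",
        "--families=cf1",
        "5",          // hypothetical peer id
        "TestTable"   // hypothetical table name
    };

    Job job = VerifyReplication.createSubmittableJob(conf, jobArgs);
    if (job == null) {
      return; // bad arguments; usage was printed to stderr
    }
    boolean success = job.waitForCompletion(true);

    // The verification outcome lives in the job counters.
    long good = job.getCounters()
        .findCounter(VerifyReplication.Verifier.Counters.GOODROWS).getValue();
    long bad = job.getCounters()
        .findCounter(VerifyReplication.Verifier.Counters.BADROWS).getValue();
    System.out.println("Matching rows: " + good + ", differing rows: " + bad);
    System.exit(success && bad == 0 ? 0 : 1);
  }
}

Reading the counters after waitForCompletion is the only way to distinguish "the job ran" from "the clusters actually match": the job itself succeeds even when every row differs, since mismatches are only counted and logged by the map tasks.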




