All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hbase.util.RegionSplitter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import java.io.IOException;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.NoServerForRegionException;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLine;
import org.apache.hbase.thirdparty.org.apache.commons.cli.GnuParser;
import org.apache.hbase.thirdparty.org.apache.commons.cli.HelpFormatter;
import org.apache.hbase.thirdparty.org.apache.commons.cli.OptionBuilder;
import org.apache.hbase.thirdparty.org.apache.commons.cli.Options;
import org.apache.hbase.thirdparty.org.apache.commons.cli.ParseException;

/**
 * The {@link RegionSplitter} class provides several utilities to help in the administration
 * lifecycle for developers who choose to manually split regions instead of having HBase handle that
 * automatically. The most useful utilities are:
 * 

*

    *
  • Create a table with a specified number of pre-split regions *
  • Execute a rolling split of all regions on an existing table *
*

* Both operations can be safely done on a live server. *

* Question: How do I turn off automatic splitting?
* Answer: Automatic splitting is determined by the configuration value * HConstants.HREGION_MAX_FILESIZE. It is not recommended that you set this to Long.MAX_VALUE * in case you forget about manual splits. A suggested setting is 100GB, which would result in > * 1hr major compactions if reached. *

* Question: Why did the original authors decide to manually split?
* Answer: Specific workload characteristics of our use case allowed us to benefit from a * manual split system. *

*

    *
  • Data (~1k) that would grow instead of being replaced *
  • Data growth was roughly uniform across all regions *
  • OLTP workload. Data loss is a big deal. *
*

* Question: Why is manual splitting good for this workload?
* Answer: Although automated splitting is not a bad option, there are benefits to manual * splitting. *

*

    *
  • With growing amounts of data, splits will continually be needed. Since you always know * exactly what regions you have, long-term debugging and profiling is much easier with manual * splits. It is hard to trace the logs to understand region level problems if it keeps splitting * and getting renamed. *
  • Data offlining bugs + unknown number of split regions == oh crap! If an WAL or StoreFile was * mistakenly unprocessed by HBase due to a weird bug and you notice it a day or so later, you can * be assured that the regions specified in these files are the same as the current regions and you * have less headaches trying to restore/replay your data. *
  • You can finely tune your compaction algorithm. With roughly uniform data growth, it's easy to * cause split / compaction storms as the regions all roughly hit the same data size at the same * time. With manual splits, you can let staggered, time-based major compactions spread out your * network IO load. *
*

* Question: What's the optimal number of pre-split regions to create?
* Answer: Mileage will vary depending upon your application. *

* The short answer for our application is that we started with 10 pre-split regions / server and * watched our data growth over time. It's better to err on the side of too little regions and * rolling split later. *

* The more complicated answer is that this depends upon the largest storefile in your region. With * a growing data size, this will get larger over time. You want the largest region to be just big * enough that the {@link org.apache.hadoop.hbase.regionserver.HStore} compact selection algorithm * only compacts it due to a timed major. If you don't, your cluster can be prone to compaction * storms as the algorithm decides to run major compactions on a large series of regions all at * once. Note that compaction storms are due to the uniform data growth, not the manual split * decision. *

* If you pre-split your regions too thin, you can increase the major compaction interval by * configuring HConstants.MAJOR_COMPACTION_PERIOD. If your data size grows too large, use this * script to perform a network IO safe rolling split of all regions. */ @InterfaceAudience.Private public class RegionSplitter { private static final Logger LOG = LoggerFactory.getLogger(RegionSplitter.class); /** * A generic interface for the RegionSplitter code to use for all it's functionality. Note that * the original authors of this code use {@link HexStringSplit} to partition their table and set * it as default, but provided this for your custom algorithm. To use, create a new derived class * from this interface and call {@link RegionSplitter#createPresplitTable} or * RegionSplitter#rollingSplit(TableName, SplitAlgorithm, Configuration) with the argument * splitClassName giving the name of your class. */ public interface SplitAlgorithm { /** * Split a pre-existing region into 2 regions. n * first row (inclusive) n * last row * (exclusive) * @return the split row to use */ byte[] split(byte[] start, byte[] end); /** * Split an entire table. n * number of regions to split the table into n * user input is * validated at this time. may throw a runtime exception in response to a parse failure * @return array of split keys for the initial regions of the table. The length of the returned * array should be numRegions-1. */ byte[][] split(int numRegions); /** * Some MapReduce jobs may want to run multiple mappers per region, this is intended for such * usecase. * @param start first row (inclusive) * @param end last row (exclusive) * @param numSplits number of splits to generate * @param inclusive whether start and end are returned as split points */ byte[][] split(byte[] start, byte[] end, int numSplits, boolean inclusive); /** * In HBase, the first row is represented by an empty byte array. This might cause problems with * your split algorithm or row printing. All your APIs will be passed firstRow() instead of * empty array. * @return your representation of your first row */ byte[] firstRow(); /** * In HBase, the last row is represented by an empty byte array. This might cause problems with * your split algorithm or row printing. All your APIs will be passed firstRow() instead of * empty array. * @return your representation of your last row */ byte[] lastRow(); /** * In HBase, the last row is represented by an empty byte array. Set this value to help the * split code understand how to evenly divide the first region. n * raw user input (may throw * RuntimeException on parse failure) */ void setFirstRow(String userInput); /** * In HBase, the last row is represented by an empty byte array. Set this value to help the * split code understand how to evenly divide the last region. Note that this last row is * inclusive for all rows sharing the same prefix. n * raw user input (may throw * RuntimeException on parse failure) */ void setLastRow(String userInput); /** * n * user or file input for row * @return byte array representation of this row for HBase */ byte[] strToRow(String input); /** * n * byte array representing a row in HBase * @return String to use for debug & file printing */ String rowToStr(byte[] row); /** * @return the separator character to use when storing / printing the row */ String separator(); /** * Set the first row * @param userInput byte array of the row key. */ void setFirstRow(byte[] userInput); /** * Set the last row * @param userInput byte array of the row key. */ void setLastRow(byte[] userInput); } /** * The main function for the RegionSplitter application. Common uses: *

*

    *
  • create a table named 'myTable' with 60 pre-split regions containing 2 column families * 'test' & 'rs', assuming the keys are hex-encoded ASCII: *
      *
    • bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -c 60 -f test:rs myTable * HexStringSplit *
    *
  • create a table named 'myTable' with 50 pre-split regions, assuming the keys are * decimal-encoded ASCII: *
      *
    • bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -c 50 myTable DecimalStringSplit *
    *
  • perform a rolling split of 'myTable' (i.e. 60 => 120 regions), # 2 outstanding splits at * a time, assuming keys are uniformly distributed bytes: *
      *
    • bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -r -o 2 myTable UniformSplit *
    *
* There are three SplitAlgorithms built into RegionSplitter, HexStringSplit, DecimalStringSplit, * and UniformSplit. These are different strategies for choosing region boundaries. See their * source code for details. n * Usage: RegionSplitter <TABLE> <SPLITALGORITHM> <-c * <# regions> -f <family:family:...> | -r [-o <# outstanding splits>]> [-D * <conf.param=value>] n * HBase IO problem n * user requested exit n * problem parsing user * input */ @SuppressWarnings("static-access") public static void main(String[] args) throws IOException, InterruptedException, ParseException { Configuration conf = HBaseConfiguration.create(); // parse user input Options opt = new Options(); opt.addOption(OptionBuilder.withArgName("property=value").hasArg() .withDescription("Override HBase Configuration Settings").create("D")); opt.addOption(OptionBuilder.withArgName("region count").hasArg() .withDescription("Create a new table with a pre-split number of regions").create("c")); opt.addOption(OptionBuilder.withArgName("family:family:...").hasArg() .withDescription("Column Families to create with new table. Required with -c").create("f")); opt.addOption("h", false, "Print this usage help"); opt.addOption("r", false, "Perform a rolling split of an existing region"); opt.addOption(OptionBuilder.withArgName("count").hasArg() .withDescription("Max outstanding splits that have unfinished major compactions") .create("o")); opt.addOption(null, "firstrow", true, "First Row in Table for Split Algorithm"); opt.addOption(null, "lastrow", true, "Last Row in Table for Split Algorithm"); opt.addOption(null, "risky", false, "Skip verification steps to complete quickly. " + "STRONGLY DISCOURAGED for production systems. "); CommandLine cmd = new GnuParser().parse(opt, args); if (cmd.hasOption("D")) { for (String confOpt : cmd.getOptionValues("D")) { String[] kv = confOpt.split("=", 2); if (kv.length == 2) { conf.set(kv[0], kv[1]); LOG.debug("-D configuration override: " + kv[0] + "=" + kv[1]); } else { throw new ParseException("-D option format invalid: " + confOpt); } } } if (cmd.hasOption("risky")) { conf.setBoolean("split.verify", false); } boolean createTable = cmd.hasOption("c") && cmd.hasOption("f"); boolean rollingSplit = cmd.hasOption("r"); boolean oneOperOnly = createTable ^ rollingSplit; if (2 != cmd.getArgList().size() || !oneOperOnly || cmd.hasOption("h")) { new HelpFormatter().printHelp("bin/hbase regionsplitter \n" + "SPLITALGORITHM is the java class name of a class implementing " + "SplitAlgorithm, or one of the special strings HexStringSplit or " + "DecimalStringSplit or UniformSplit, which are built-in split algorithms. " + "HexStringSplit treats keys as hexadecimal ASCII, and " + "DecimalStringSplit treats keys as decimal ASCII, and " + "UniformSplit treats keys as arbitrary bytes.", opt); return; } TableName tableName = TableName.valueOf(cmd.getArgs()[0]); String splitClass = cmd.getArgs()[1]; SplitAlgorithm splitAlgo = newSplitAlgoInstance(conf, splitClass); if (cmd.hasOption("firstrow")) { splitAlgo.setFirstRow(cmd.getOptionValue("firstrow")); } if (cmd.hasOption("lastrow")) { splitAlgo.setLastRow(cmd.getOptionValue("lastrow")); } if (createTable) { conf.set("split.count", cmd.getOptionValue("c")); createPresplitTable(tableName, splitAlgo, cmd.getOptionValue("f").split(":"), conf); } if (rollingSplit) { if (cmd.hasOption("o")) { conf.set("split.outstanding", cmd.getOptionValue("o")); } rollingSplit(tableName, splitAlgo, conf); } } static void createPresplitTable(TableName tableName, SplitAlgorithm splitAlgo, String[] columnFamilies, Configuration conf) throws IOException, InterruptedException { final int splitCount = conf.getInt("split.count", 0); Preconditions.checkArgument(splitCount > 1, "Split count must be > 1"); Preconditions.checkArgument(columnFamilies.length > 0, "Must specify at least one column family. "); LOG.debug("Creating table " + tableName + " with " + columnFamilies.length + " column families. Presplitting to " + splitCount + " regions"); TableDescriptorBuilder builder = TableDescriptorBuilder.newBuilder(tableName); for (String cf : columnFamilies) { builder.setColumnFamily(ColumnFamilyDescriptorBuilder.of(cf)); } try (Connection connection = ConnectionFactory.createConnection(conf)) { Admin admin = connection.getAdmin(); try { Preconditions.checkArgument(!admin.tableExists(tableName), "Table already exists: " + tableName); admin.createTable(builder.build(), splitAlgo.split(splitCount)); } finally { admin.close(); } LOG.debug("Table created! Waiting for regions to show online in META..."); if (!conf.getBoolean("split.verify", true)) { // NOTE: createTable is synchronous on the table, but not on the regions int onlineRegions = 0; try (RegionLocator locator = connection.getRegionLocator(tableName)) { while (onlineRegions < splitCount) { onlineRegions = locator.getAllRegionLocations().size(); LOG.debug(onlineRegions + " of " + splitCount + " regions online..."); if (onlineRegions < splitCount) { Thread.sleep(10 * 1000); // sleep } } } } LOG.debug("Finished creating table with " + splitCount + " regions"); } } /** * Alternative getCurrentNrHRS which is no longer available. n * @return Rough count of * regionservers out on cluster. * @throws IOException if a remote or network exception occurs */ private static int getRegionServerCount(final Connection connection) throws IOException { try (Admin admin = connection.getAdmin()) { Collection servers = admin.getRegionServers(); return servers == null || servers.isEmpty() ? 0 : servers.size(); } } private static byte[] readFile(final FileSystem fs, final Path path) throws IOException { FSDataInputStream tmpIn = fs.open(path); try { byte[] rawData = new byte[tmpIn.available()]; tmpIn.readFully(rawData); return rawData; } finally { tmpIn.close(); } } static void rollingSplit(TableName tableName, SplitAlgorithm splitAlgo, Configuration conf) throws IOException, InterruptedException { final int minOS = conf.getInt("split.outstanding", 2); try (Connection connection = ConnectionFactory.createConnection(conf)) { // Max outstanding splits. default == 50% of servers final int MAX_OUTSTANDING = Math.max(getRegionServerCount(connection) / 2, minOS); Path hbDir = CommonFSUtils.getRootDir(conf); Path tableDir = CommonFSUtils.getTableDir(hbDir, tableName); Path splitFile = new Path(tableDir, "_balancedSplit"); FileSystem fs = FileSystem.get(conf); // Get a list of daughter regions to create LinkedList> tmpRegionSet = null; try (Table table = connection.getTable(tableName)) { tmpRegionSet = getSplits(connection, tableName, splitAlgo); } LinkedList> outstanding = Lists.newLinkedList(); int splitCount = 0; final int origCount = tmpRegionSet.size(); // all splits must compact & we have 1 compact thread, so 2 split // requests to the same RS can stall the outstanding split queue. // To fix, group the regions into an RS pool and round-robin through it LOG.debug("Bucketing regions by regionserver..."); TreeMap>> daughterRegions = Maps.newTreeMap(); // Get a regionLocator. Need it in below. try (RegionLocator regionLocator = connection.getRegionLocator(tableName)) { for (Pair dr : tmpRegionSet) { ServerName rsLocation = regionLocator.getRegionLocation(dr.getSecond()).getServerName(); if (!daughterRegions.containsKey(rsLocation)) { LinkedList> entry = Lists.newLinkedList(); daughterRegions.put(rsLocation, entry); } daughterRegions.get(rsLocation).add(dr); } LOG.debug("Done with bucketing. Split time!"); long startTime = System.currentTimeMillis(); // Open the split file and modify it as splits finish byte[] rawData = readFile(fs, splitFile); FSDataOutputStream splitOut = fs.create(splitFile); try { splitOut.write(rawData); try { // *** split code *** while (!daughterRegions.isEmpty()) { LOG.debug(daughterRegions.size() + " RS have regions to splt."); // Get ServerName to region count mapping final TreeMap rsSizes = Maps.newTreeMap(); List hrls = regionLocator.getAllRegionLocations(); for (HRegionLocation hrl : hrls) { ServerName sn = hrl.getServerName(); if (rsSizes.containsKey(sn)) { rsSizes.put(sn, rsSizes.get(sn) + 1); } else { rsSizes.put(sn, 1); } } // Round-robin through the ServerName list. Choose the lightest-loaded servers // first to keep the master from load-balancing regions as we split. for (Map.Entry>> daughterRegion : daughterRegions.entrySet()) { Pair dr = null; ServerName rsLoc = daughterRegion.getKey(); LinkedList> regionList = daughterRegion.getValue(); // Find a region in the ServerName list that hasn't been moved LOG.debug("Finding a region on " + rsLoc); while (!regionList.isEmpty()) { dr = regionList.pop(); // get current region info byte[] split = dr.getSecond(); HRegionLocation regionLoc = regionLocator.getRegionLocation(split); // if this region moved locations ServerName newRs = regionLoc.getServerName(); if (newRs.compareTo(rsLoc) != 0) { LOG.debug("Region with " + splitAlgo.rowToStr(split) + " moved to " + newRs + ". Relocating..."); // relocate it, don't use it right now if (!daughterRegions.containsKey(newRs)) { LinkedList> entry = Lists.newLinkedList(); daughterRegions.put(newRs, entry); } daughterRegions.get(newRs).add(dr); dr = null; continue; } // make sure this region wasn't already split byte[] sk = regionLoc.getRegionInfo().getStartKey(); if (sk.length != 0) { if (Bytes.equals(split, sk)) { LOG.debug("Region already split on " + splitAlgo.rowToStr(split) + ". Skipping this region..."); ++splitCount; dr = null; continue; } byte[] start = dr.getFirst(); Preconditions.checkArgument(Bytes.equals(start, sk), splitAlgo.rowToStr(start) + " != " + splitAlgo.rowToStr(sk)); } // passed all checks! found a good region break; } if (regionList.isEmpty()) { daughterRegions.remove(rsLoc); } if (dr == null) continue; // we have a good region, time to split! byte[] split = dr.getSecond(); LOG.debug("Splitting at " + splitAlgo.rowToStr(split)); try (Admin admin = connection.getAdmin()) { admin.split(tableName, split); } LinkedList> finished = Lists.newLinkedList(); LinkedList> local_finished = Lists.newLinkedList(); if (conf.getBoolean("split.verify", true)) { // we need to verify and rate-limit our splits outstanding.addLast(dr); // with too many outstanding splits, wait for some to finish while (outstanding.size() >= MAX_OUTSTANDING) { LOG.debug("Wait for outstanding splits " + outstanding.size()); local_finished = splitScan(outstanding, connection, tableName, splitAlgo); if (local_finished.isEmpty()) { Thread.sleep(30 * 1000); } else { finished.addAll(local_finished); outstanding.removeAll(local_finished); LOG.debug(local_finished.size() + " outstanding splits finished"); } } } else { finished.add(dr); } // mark each finished region as successfully split. for (Pair region : finished) { splitOut.writeChars("- " + splitAlgo.rowToStr(region.getFirst()) + " " + splitAlgo.rowToStr(region.getSecond()) + "\n"); splitCount++; if (splitCount % 10 == 0) { long tDiff = (System.currentTimeMillis() - startTime) / splitCount; LOG.debug( "STATUS UPDATE: " + splitCount + " / " + origCount + ". Avg Time / Split = " + org.apache.hadoop.util.StringUtils.formatTime(tDiff)); } } } } if (conf.getBoolean("split.verify", true)) { while (!outstanding.isEmpty()) { LOG.debug("Finally Wait for outstanding splits " + outstanding.size()); LinkedList> finished = splitScan(outstanding, connection, tableName, splitAlgo); if (finished.isEmpty()) { Thread.sleep(30 * 1000); } else { outstanding.removeAll(finished); for (Pair region : finished) { splitOut.writeChars("- " + splitAlgo.rowToStr(region.getFirst()) + " " + splitAlgo.rowToStr(region.getSecond()) + "\n"); splitCount++; } LOG.debug("Finally " + finished.size() + " outstanding splits finished"); } } } LOG.debug("All regions have been successfully split!"); } finally { long tDiff = System.currentTimeMillis() - startTime; LOG.debug("TOTAL TIME = " + org.apache.hadoop.util.StringUtils.formatTime(tDiff)); LOG.debug("Splits = " + splitCount); if (0 < splitCount) { LOG.debug("Avg Time / Split = " + org.apache.hadoop.util.StringUtils.formatTime(tDiff / splitCount)); } } } finally { splitOut.close(); fs.delete(splitFile, false); } } } } /** * @throws IOException if the specified SplitAlgorithm class couldn't be instantiated */ public static SplitAlgorithm newSplitAlgoInstance(Configuration conf, String splitClassName) throws IOException { Class splitClass; // For split algorithms builtin to RegionSplitter, the user can specify // their simple class name instead of a fully qualified class name. if (splitClassName.equals(HexStringSplit.class.getSimpleName())) { splitClass = HexStringSplit.class; } else if (splitClassName.equals(DecimalStringSplit.class.getSimpleName())) { splitClass = DecimalStringSplit.class; } else if (splitClassName.equals(UniformSplit.class.getSimpleName())) { splitClass = UniformSplit.class; } else { try { splitClass = conf.getClassByName(splitClassName); } catch (ClassNotFoundException e) { throw new IOException("Couldn't load split class " + splitClassName, e); } if (splitClass == null) { throw new IOException("Failed loading split class " + splitClassName); } if (!SplitAlgorithm.class.isAssignableFrom(splitClass)) { throw new IOException("Specified split class doesn't implement SplitAlgorithm"); } } try { return splitClass.asSubclass(SplitAlgorithm.class).getDeclaredConstructor().newInstance(); } catch (Exception e) { throw new IOException("Problem loading split algorithm: ", e); } } static LinkedList> splitScan(LinkedList> regionList, final Connection connection, final TableName tableName, SplitAlgorithm splitAlgo) throws IOException, InterruptedException { LinkedList> finished = Lists.newLinkedList(); LinkedList> logicalSplitting = Lists.newLinkedList(); LinkedList> physicalSplitting = Lists.newLinkedList(); // Get table info Pair tableDirAndSplitFile = getTableDirAndSplitFile(connection.getConfiguration(), tableName); Path tableDir = tableDirAndSplitFile.getFirst(); FileSystem fs = tableDir.getFileSystem(connection.getConfiguration()); // Clear the cache to forcibly refresh region information ((ClusterConnection) connection).clearRegionLocationCache(); TableDescriptor htd = null; try (Table table = connection.getTable(tableName)) { htd = table.getDescriptor(); } try (RegionLocator regionLocator = connection.getRegionLocator(tableName)) { // for every region that hasn't been verified as a finished split for (Pair region : regionList) { byte[] start = region.getFirst(); byte[] split = region.getSecond(); // see if the new split daughter region has come online try { HRegionInfo dri = regionLocator.getRegionLocation(split).getRegionInfo(); if (dri.isOffline() || !Bytes.equals(dri.getStartKey(), split)) { logicalSplitting.add(region); continue; } } catch (NoServerForRegionException nsfre) { // NSFRE will occur if the old hbase:meta entry has no server assigned LOG.info(nsfre.toString(), nsfre); logicalSplitting.add(region); continue; } try { // when a daughter region is opened, a compaction is triggered // wait until compaction completes for both daughter regions LinkedList check = Lists.newLinkedList(); check.add(regionLocator.getRegionLocation(start).getRegionInfo()); check.add(regionLocator.getRegionLocation(split).getRegionInfo()); for (HRegionInfo hri : check.toArray(new HRegionInfo[check.size()])) { byte[] sk = hri.getStartKey(); if (sk.length == 0) sk = splitAlgo.firstRow(); HRegionFileSystem regionFs = HRegionFileSystem .openRegionFromFileSystem(connection.getConfiguration(), fs, tableDir, hri, true); // Check every Column Family for that region -- check does not have references. boolean refFound = false; for (ColumnFamilyDescriptor c : htd.getColumnFamilies()) { if ((refFound = regionFs.hasReferences(c.getNameAsString()))) { break; } } // compaction is completed when all reference files are gone if (!refFound) { check.remove(hri); } } if (check.isEmpty()) { finished.add(region); } else { physicalSplitting.add(region); } } catch (NoServerForRegionException nsfre) { LOG.debug("No Server Exception thrown for: " + splitAlgo.rowToStr(start)); physicalSplitting.add(region); ((ClusterConnection) connection).clearRegionLocationCache(); } } LOG.debug("Split Scan: " + finished.size() + " finished / " + logicalSplitting.size() + " split wait / " + physicalSplitting.size() + " reference wait"); return finished; } } /** * nn * @return A Pair where first item is table dir and second is the split file. * @throws IOException if a remote or network exception occurs */ private static Pair getTableDirAndSplitFile(final Configuration conf, final TableName tableName) throws IOException { Path hbDir = CommonFSUtils.getRootDir(conf); Path tableDir = CommonFSUtils.getTableDir(hbDir, tableName); Path splitFile = new Path(tableDir, "_balancedSplit"); return new Pair<>(tableDir, splitFile); } static LinkedList> getSplits(final Connection connection, TableName tableName, SplitAlgorithm splitAlgo) throws IOException { Pair tableDirAndSplitFile = getTableDirAndSplitFile(connection.getConfiguration(), tableName); Path tableDir = tableDirAndSplitFile.getFirst(); Path splitFile = tableDirAndSplitFile.getSecond(); FileSystem fs = tableDir.getFileSystem(connection.getConfiguration()); // Using strings because (new byte[]{0}).equals(new byte[]{0}) == false Set> daughterRegions = Sets.newHashSet(); // Does a split file exist? if (!fs.exists(splitFile)) { // NO = fresh start. calculate splits to make LOG.debug("No " + splitFile.getName() + " file. Calculating splits "); // Query meta for all regions in the table Set> rows = Sets.newHashSet(); Pair tmp = null; try (RegionLocator regionLocator = connection.getRegionLocator(tableName)) { tmp = regionLocator.getStartEndKeys(); } Preconditions.checkArgument(tmp.getFirst().length == tmp.getSecond().length, "Start and End rows should be equivalent"); for (int i = 0; i < tmp.getFirst().length; ++i) { byte[] start = tmp.getFirst()[i], end = tmp.getSecond()[i]; if (start.length == 0) start = splitAlgo.firstRow(); if (end.length == 0) end = splitAlgo.lastRow(); rows.add(Pair.newPair(start, end)); } LOG.debug("Table " + tableName + " has " + rows.size() + " regions that will be split."); // prepare the split file Path tmpFile = new Path(tableDir, "_balancedSplit_prepare"); FSDataOutputStream tmpOut = fs.create(tmpFile); // calculate all the splits == [daughterRegions] = [(start, splitPoint)] for (Pair r : rows) { byte[] splitPoint = splitAlgo.split(r.getFirst(), r.getSecond()); String startStr = splitAlgo.rowToStr(r.getFirst()); String splitStr = splitAlgo.rowToStr(splitPoint); daughterRegions.add(Pair.newPair(startStr, splitStr)); LOG.debug("Will Split [" + startStr + " , " + splitAlgo.rowToStr(r.getSecond()) + ") at " + splitStr); tmpOut.writeChars("+ " + startStr + splitAlgo.separator() + splitStr + "\n"); } tmpOut.close(); fs.rename(tmpFile, splitFile); } else { LOG.debug("_balancedSplit file found. Replay log to restore state..."); RecoverLeaseFSUtils.recoverFileLease(fs, splitFile, connection.getConfiguration(), null); // parse split file and process remaining splits FSDataInputStream tmpIn = fs.open(splitFile); StringBuilder sb = new StringBuilder(tmpIn.available()); while (tmpIn.available() > 0) { sb.append(tmpIn.readChar()); } tmpIn.close(); for (String line : sb.toString().split("\n")) { String[] cmd = line.split(splitAlgo.separator()); Preconditions.checkArgument(3 == cmd.length); byte[] start = splitAlgo.strToRow(cmd[1]); String startStr = splitAlgo.rowToStr(start); byte[] splitPoint = splitAlgo.strToRow(cmd[2]); String splitStr = splitAlgo.rowToStr(splitPoint); Pair r = Pair.newPair(startStr, splitStr); if (cmd[0].equals("+")) { LOG.debug("Adding: " + r); daughterRegions.add(r); } else { LOG.debug("Removing: " + r); Preconditions.checkArgument(cmd[0].equals("-"), "Unknown option: " + cmd[0]); Preconditions.checkState(daughterRegions.contains(r), "Missing row: " + r); daughterRegions.remove(r); } } LOG.debug("Done reading. " + daughterRegions.size() + " regions left."); } LinkedList> ret = Lists.newLinkedList(); for (Pair r : daughterRegions) { ret.add(Pair.newPair(splitAlgo.strToRow(r.getFirst()), splitAlgo.strToRow(r.getSecond()))); } return ret; } /** * HexStringSplit is a well-known {@link SplitAlgorithm} for choosing region boundaries. The * format of a HexStringSplit region boundary is the ASCII representation of an MD5 checksum, or * any other uniformly distributed hexadecimal value. Row are hex-encoded long values in the range * "00000000" => "FFFFFFFF" and are left-padded with zeros to keep the same order * lexicographically as if they were binary. Since this split algorithm uses hex strings as keys, * it is easy to read & write in the shell but takes up more space and may be non-intuitive. */ public static class HexStringSplit extends NumberStringSplit { final static String DEFAULT_MIN_HEX = "00000000"; final static String DEFAULT_MAX_HEX = "FFFFFFFF"; final static int RADIX_HEX = 16; public HexStringSplit() { super(DEFAULT_MIN_HEX, DEFAULT_MAX_HEX, RADIX_HEX); } } /** * The format of a DecimalStringSplit region boundary is the ASCII representation of reversed * sequential number, or any other uniformly distributed decimal value. Row are decimal-encoded * long values in the range "00000000" => "99999999" and are left-padded with zeros to * keep the same order lexicographically as if they were binary. */ public static class DecimalStringSplit extends NumberStringSplit { final static String DEFAULT_MIN_DEC = "00000000"; final static String DEFAULT_MAX_DEC = "99999999"; final static int RADIX_DEC = 10; public DecimalStringSplit() { super(DEFAULT_MIN_DEC, DEFAULT_MAX_DEC, RADIX_DEC); } } public abstract static class NumberStringSplit implements SplitAlgorithm { String firstRow; BigInteger firstRowInt; String lastRow; BigInteger lastRowInt; int rowComparisonLength; int radix; NumberStringSplit(String minRow, String maxRow, int radix) { this.firstRow = minRow; this.lastRow = maxRow; this.radix = radix; this.firstRowInt = BigInteger.ZERO; this.lastRowInt = new BigInteger(lastRow, this.radix); this.rowComparisonLength = lastRow.length(); } @Override public byte[] split(byte[] start, byte[] end) { BigInteger s = convertToBigInteger(start); BigInteger e = convertToBigInteger(end); Preconditions.checkArgument(!e.equals(BigInteger.ZERO)); return convertToByte(split2(s, e)); } @Override public byte[][] split(int n) { Preconditions.checkArgument(lastRowInt.compareTo(firstRowInt) > 0, "last row (%s) is configured less than first row (%s)", lastRow, firstRow); // +1 to range because the last row is inclusive BigInteger range = lastRowInt.subtract(firstRowInt).add(BigInteger.ONE); Preconditions.checkState(range.compareTo(BigInteger.valueOf(n)) >= 0, "split granularity (%s) is greater than the range (%s)", n, range); BigInteger[] splits = new BigInteger[n - 1]; BigInteger sizeOfEachSplit = range.divide(BigInteger.valueOf(n)); for (int i = 1; i < n; i++) { // NOTE: this means the last region gets all the slop. // This is not a big deal if we're assuming n << MAXHEX splits[i - 1] = firstRowInt.add(sizeOfEachSplit.multiply(BigInteger.valueOf(i))); } return convertToBytes(splits); } @Override public byte[][] split(byte[] start, byte[] end, int numSplits, boolean inclusive) { BigInteger s = convertToBigInteger(start); BigInteger e = convertToBigInteger(end); Preconditions.checkArgument(e.compareTo(s) > 0, "last row (%s) is configured less than first row (%s)", rowToStr(end), end); // +1 to range because the last row is inclusive BigInteger range = e.subtract(s).add(BigInteger.ONE); Preconditions.checkState(range.compareTo(BigInteger.valueOf(numSplits)) >= 0, "split granularity (%s) is greater than the range (%s)", numSplits, range); BigInteger[] splits = new BigInteger[numSplits - 1]; BigInteger sizeOfEachSplit = range.divide(BigInteger.valueOf(numSplits)); for (int i = 1; i < numSplits; i++) { // NOTE: this means the last region gets all the slop. // This is not a big deal if we're assuming n << MAXHEX splits[i - 1] = s.add(sizeOfEachSplit.multiply(BigInteger.valueOf(i))); } if (inclusive) { BigInteger[] inclusiveSplitPoints = new BigInteger[numSplits + 1]; inclusiveSplitPoints[0] = convertToBigInteger(start); inclusiveSplitPoints[numSplits] = convertToBigInteger(end); System.arraycopy(splits, 0, inclusiveSplitPoints, 1, splits.length); return convertToBytes(inclusiveSplitPoints); } else { return convertToBytes(splits); } } @Override public byte[] firstRow() { return convertToByte(firstRowInt); } @Override public byte[] lastRow() { return convertToByte(lastRowInt); } @Override public void setFirstRow(String userInput) { firstRow = userInput; firstRowInt = new BigInteger(firstRow, radix); } @Override public void setLastRow(String userInput) { lastRow = userInput; lastRowInt = new BigInteger(lastRow, radix); // Precondition: lastRow > firstRow, so last's length is the greater rowComparisonLength = lastRow.length(); } @Override public byte[] strToRow(String in) { return convertToByte(new BigInteger(in, radix)); } @Override public String rowToStr(byte[] row) { return Bytes.toStringBinary(row); } @Override public String separator() { return " "; } @Override public void setFirstRow(byte[] userInput) { firstRow = Bytes.toString(userInput); } @Override public void setLastRow(byte[] userInput) { lastRow = Bytes.toString(userInput); } /** * Divide 2 numbers in half (for split algorithm) * @param a number #1 * @param b number #2 * @return the midpoint of the 2 numbers */ public BigInteger split2(BigInteger a, BigInteger b) { return a.add(b).divide(BigInteger.valueOf(2)).abs(); } /** * Returns an array of bytes corresponding to an array of BigIntegers * @param bigIntegers numbers to convert * @return bytes corresponding to the bigIntegers */ public byte[][] convertToBytes(BigInteger[] bigIntegers) { byte[][] returnBytes = new byte[bigIntegers.length][]; for (int i = 0; i < bigIntegers.length; i++) { returnBytes[i] = convertToByte(bigIntegers[i]); } return returnBytes; } /** * Returns the bytes corresponding to the BigInteger * @param bigInteger number to convert * @param pad padding length * @return byte corresponding to input BigInteger */ public byte[] convertToByte(BigInteger bigInteger, int pad) { String bigIntegerString = bigInteger.toString(radix); bigIntegerString = StringUtils.leftPad(bigIntegerString, pad, '0'); return Bytes.toBytes(bigIntegerString); } /** * Returns the bytes corresponding to the BigInteger * @param bigInteger number to convert * @return corresponding bytes */ public byte[] convertToByte(BigInteger bigInteger) { return convertToByte(bigInteger, rowComparisonLength); } /** * Returns the BigInteger represented by the byte array * @param row byte array representing row * @return the corresponding BigInteger */ public BigInteger convertToBigInteger(byte[] row) { return (row.length > 0) ? new BigInteger(Bytes.toString(row), radix) : BigInteger.ZERO; } @Override public String toString() { return this.getClass().getSimpleName() + " [" + rowToStr(firstRow()) + "," + rowToStr(lastRow()) + "]"; } } /** * A SplitAlgorithm that divides the space of possible keys evenly. Useful when the keys are * approximately uniform random bytes (e.g. hashes). Rows are raw byte values in the range 00 * => FF and are right-padded with zeros to keep the same memcmp() order. This is the * natural algorithm to use for a byte[] environment and saves space, but is not necessarily the * easiest for readability. */ public static class UniformSplit implements SplitAlgorithm { static final byte xFF = (byte) 0xFF; byte[] firstRowBytes = ArrayUtils.EMPTY_BYTE_ARRAY; byte[] lastRowBytes = new byte[] { xFF, xFF, xFF, xFF, xFF, xFF, xFF, xFF }; @Override public byte[] split(byte[] start, byte[] end) { return Bytes.split(start, end, 1)[1]; } @Override public byte[][] split(int numRegions) { Preconditions.checkArgument(Bytes.compareTo(lastRowBytes, firstRowBytes) > 0, "last row (%s) is configured less than first row (%s)", Bytes.toStringBinary(lastRowBytes), Bytes.toStringBinary(firstRowBytes)); byte[][] splits = Bytes.split(firstRowBytes, lastRowBytes, true, numRegions - 1); Preconditions.checkState(splits != null, "Could not split region with given user input: " + this); // remove endpoints, which are included in the splits list return splits == null ? null : Arrays.copyOfRange(splits, 1, splits.length - 1); } @Override public byte[][] split(byte[] start, byte[] end, int numSplits, boolean inclusive) { if (Arrays.equals(start, HConstants.EMPTY_BYTE_ARRAY)) { start = firstRowBytes; } if (Arrays.equals(end, HConstants.EMPTY_BYTE_ARRAY)) { end = lastRowBytes; } Preconditions.checkArgument(Bytes.compareTo(end, start) > 0, "last row (%s) is configured less than first row (%s)", Bytes.toStringBinary(end), Bytes.toStringBinary(start)); byte[][] splits = Bytes.split(start, end, true, numSplits - 1); Preconditions.checkState(splits != null, "Could not calculate input splits with given user input: " + this); if (inclusive) { return splits; } else { // remove endpoints, which are included in the splits list return Arrays.copyOfRange(splits, 1, splits.length - 1); } } @Override public byte[] firstRow() { return firstRowBytes; } @Override public byte[] lastRow() { return lastRowBytes; } @Override public void setFirstRow(String userInput) { firstRowBytes = Bytes.toBytesBinary(userInput); } @Override public void setLastRow(String userInput) { lastRowBytes = Bytes.toBytesBinary(userInput); } @Override public void setFirstRow(byte[] userInput) { firstRowBytes = userInput; } @Override public void setLastRow(byte[] userInput) { lastRowBytes = userInput; } @Override public byte[] strToRow(String input) { return Bytes.toBytesBinary(input); } @Override public String rowToStr(byte[] row) { return Bytes.toStringBinary(row); } @Override public String separator() { return ","; } @Override public String toString() { return this.getClass().getSimpleName() + " [" + rowToStr(firstRow()) + "," + rowToStr(lastRow()) + "]"; } } }