/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REQUEST_REGION_SPLIT;
import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLIT;
import static org.apache.hadoop.hbase.executor.EventType.RS_ZK_REGION_SPLITTING;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.RegionTransition;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.catalog.MetaEditor;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.executor.EventType;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CancelableProgressable;
import org.apache.hadoop.hbase.util.ConfigUtil;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HasThread;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.PairOfSameType;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.data.Stat;

import com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
 * Executes region split as a "transaction".  Call {@link #prepare()} to setup
 * the transaction, {@link #execute(Server, RegionServerServices)} to run the
 * transaction and {@link #rollback(Server, RegionServerServices)} to cleanup if execute fails.
 *
 *
 * <p>Here is an example of how you would use this class:
 * <pre>
 *  SplitTransaction st = new SplitTransaction(parent, midKey);
 *  if (!st.prepare()) return;
 *  try {
 *    st.execute(server, services);
 *  } catch (IOException ioe) {
 *    try {
 *      st.rollback(server, services);
 *      return;
 *    } catch (RuntimeException e) {
 *      myAbortable.abort("Failed split, abort");
 *    }
 *  }
 * </pre>
 *
 * <p>This class is not thread safe.  Caller needs to ensure split is run by
 * one thread only.
 */
@InterfaceAudience.Private
public class SplitTransaction {
  private static final Log LOG = LogFactory.getLog(SplitTransaction.class);

  /*
   * Region to split
   */
  private final HRegion parent;
  private HRegionInfo hri_a;
  private HRegionInfo hri_b;
  private long fileSplitTimeout = 30000;
  private int znodeVersion = -1;
  boolean useZKForAssignment;

  /*
   * Row to split around
   */
  private final byte [] splitrow;

  /**
   * Types to add to the transaction journal.
   * Each enum is a step in the split transaction. Used to figure how much
   * we need to rollback.
   */
  static enum JournalEntryType {
    /**
     * Started
     */
    STARTED,
    /**
     * Prepared (after table lock)
     */
    PREPARED,
    /**
     * Before preSplit coprocessor hook
     */
    BEFORE_PRE_SPLIT_HOOK,
    /**
     * After preSplit coprocessor hook
     */
    AFTER_PRE_SPLIT_HOOK,
    /**
     * Set region as in transition, set it into SPLITTING state.
     */
    SET_SPLITTING_IN_ZK,
    /**
     * We created the temporary split data directory.
     */
    CREATE_SPLIT_DIR,
    /**
     * Closed the parent region.
     */
    CLOSED_PARENT_REGION,
    /**
     * The parent has been taken out of the server's online regions list.
     */
    OFFLINED_PARENT,
    /**
     * Started in on creation of the first daughter region.
     */
    STARTED_REGION_A_CREATION,
    /**
     * Started in on the creation of the second daughter region.
     */
    STARTED_REGION_B_CREATION,
    /**
     * Opened the first daughter region
     */
    OPENED_REGION_A,
    /**
     * Opened the second daughter region
     */
    OPENED_REGION_B,
    /**
     * Before postSplit coprocessor hook
     */
    BEFORE_POST_SPLIT_HOOK,
    /**
     * After postSplit coprocessor hook
     */
    AFTER_POST_SPLIT_HOOK,
    /**
     * Point of no return.
     * If we got here, then transaction is not recoverable other than by
     * crashing out the regionserver.
     */
    PONR
  }

  static class JournalEntry {
    private JournalEntryType type;
    private long timestamp;

    public JournalEntry(JournalEntryType type) {
      this(type, EnvironmentEdgeManager.currentTimeMillis());
    }

    public JournalEntry(JournalEntryType type, long timestamp) {
      this.type = type;
      this.timestamp = timestamp;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append(type);
      sb.append(" at ");
      sb.append(timestamp);
      return sb.toString();
    }
  }

  /*
   * Journal of how far the split transaction has progressed.
   */
  private final List<JournalEntry> journal = new ArrayList<JournalEntry>();

  /**
   * Constructor
   * @param r Region to split
   * @param splitrow Row to split around
   */
  public SplitTransaction(final HRegion r, final byte [] splitrow) {
    this.parent = r;
    this.splitrow = splitrow;
    this.journal.add(new JournalEntry(JournalEntryType.STARTED));
    this.useZKForAssignment = ConfigUtil.useZKForAssignment(r.getBaseConf());
  }

  /**
   * Does checks on split inputs.
   * @return <code>true</code> if the region is splittable else
   * <code>false</code> if it is not (e.g. it's already closed, etc.).
   */
  public boolean prepare() {
    if (!this.parent.isSplittable()) return false;
    // Split key can be null if this region is unsplittable; i.e. has refs.
    if (this.splitrow == null) return false;
    HRegionInfo hri = this.parent.getRegionInfo();
    parent.prepareToSplit();
    // Check splitrow.
    byte [] startKey = hri.getStartKey();
    byte [] endKey = hri.getEndKey();
    if (Bytes.equals(startKey, splitrow) ||
        !this.parent.getRegionInfo().containsRow(splitrow)) {
      LOG.info("Split row is not inside region key range or is equal to " +
          "startkey: " + Bytes.toStringBinary(this.splitrow));
      return false;
    }
    long rid = getDaughterRegionIdTimestamp(hri);
    this.hri_a = new HRegionInfo(hri.getTable(), startKey, this.splitrow, false, rid);
    this.hri_b = new HRegionInfo(hri.getTable(), this.splitrow, endKey, false, rid);
    this.journal.add(new JournalEntry(JournalEntryType.PREPARED));
    return true;
  }
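  // Example (illustrative only, not part of the original source): for a parent
  // region covering ["a", "z"), a successful prepare() against split row "m"
  // yields hri_a = ["a", "m") and hri_b = ["m", "z").  Both daughters carry the
  // same region id timestamp, chosen so they sort after the parent in
  // hbase:meta (see HBASE-710 and getDaughterRegionIdTimestamp below).
  //
  //   SplitTransaction st = new SplitTransaction(parent, Bytes.toBytes("m"));
  //   if (st.prepare()) {
  //     // st.getFirstDaughter() covers ["a", "m"),
  //     // st.getSecondDaughter() covers ["m", "z")
  //   }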
  /**
   * Calculate daughter regionid to use.
   * @param hri Parent {@link HRegionInfo}
   * @return Daughter region id (timestamp) to use.
   */
  private static long getDaughterRegionIdTimestamp(final HRegionInfo hri) {
    long rid = EnvironmentEdgeManager.currentTimeMillis();
    // Regionid is timestamp.  Can't be less than that of parent else will insert
    // at wrong location in hbase:meta (See HBASE-710).
    if (rid < hri.getRegionId()) {
      LOG.warn("Clock skew; parent regions id is " + hri.getRegionId() +
          " but current time here is " + rid);
      rid = hri.getRegionId() + 1;
    }
    return rid;
  }

  private static IOException closedByOtherException = new IOException(
      "Failed to close region: already closed by another thread");

  /**
   * Prepare the regions and region files.
   * @param server Hosting server instance.  Can be null when testing (won't try
   * and update in zk if a null server)
   * @param services Used to online/offline regions.
   * @param user the effective user
   * @throws IOException If thrown, transaction failed.
   * Call {@link #rollback(Server, RegionServerServices)}
   * @return Regions created
   */
  @Deprecated
  /* package */PairOfSameType<HRegion> createDaughters(final Server server,
      final RegionServerServices services) throws IOException {
    return createDaughters(server, services, null);
  }

  /* package */PairOfSameType<HRegion> createDaughters(final Server server,
      final RegionServerServices services, User user) throws IOException {
    LOG.info("Starting split of region " + this.parent);
    if ((server != null && server.isStopped()) ||
        (services != null && services.isStopping())) {
      throw new IOException("Server is stopped or stopping");
    }
    assert !this.parent.lock.writeLock().isHeldByCurrentThread():
      "Unsafe to hold write lock while performing RPCs";

    journal.add(new JournalEntry(JournalEntryType.BEFORE_PRE_SPLIT_HOOK));

    // Coprocessor callback
    if (this.parent.getCoprocessorHost() != null) {
      if (user == null) {
        // TODO: Remove one of these
        parent.getCoprocessorHost().preSplit();
        parent.getCoprocessorHost().preSplit(splitrow);
      } else {
        try {
          user.getUGI().doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
              parent.getCoprocessorHost().preSplit();
              parent.getCoprocessorHost().preSplit(splitrow);
              return null;
            }
          });
        } catch (InterruptedException ie) {
          InterruptedIOException iioe = new InterruptedIOException();
          iioe.initCause(ie);
          throw iioe;
        }
      }
    }

    journal.add(new JournalEntry(JournalEntryType.AFTER_PRE_SPLIT_HOOK));

    // If true, no cluster to write meta edits to or to update znodes in.
    boolean testing = server == null ? true :
        server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
    this.fileSplitTimeout = testing ? this.fileSplitTimeout :
        server.getConfiguration().getLong("hbase.regionserver.fileSplitTimeout",
            this.fileSplitTimeout);

    PairOfSameType<HRegion> daughterRegions = stepsBeforePONR(server, services, testing);

    final List<Mutation> metaEntries = new ArrayList<Mutation>();
    boolean ret = false;
    if (this.parent.getCoprocessorHost() != null) {
      if (user == null) {
        ret = parent.getCoprocessorHost().preSplitBeforePONR(splitrow, metaEntries);
      } else {
        try {
          ret = user.getUGI().doAs(new PrivilegedExceptionAction<Boolean>() {
            @Override
            public Boolean run() throws Exception {
              return parent.getCoprocessorHost().preSplitBeforePONR(splitrow, metaEntries);
            }
          });
        } catch (InterruptedException ie) {
          InterruptedIOException iioe = new InterruptedIOException();
          iioe.initCause(ie);
          throw iioe;
        }
      }
      if (ret) {
        throw new IOException("Coprocessor bypassing region " +
            this.parent.getRegionNameAsString() + " split.");
      }
      try {
        for (Mutation p : metaEntries) {
          HRegionInfo.parseRegionName(p.getRow());
        }
      } catch (IOException e) {
        LOG.error("Row key of mutation from coprocessor is not parsable as region name. "
            + "Mutations from coprocessor should only be for the hbase:meta table.");
        throw e;
      }
    }

    // This is the point of no return.  Adding subsequent edits to .META. as we
    // do below when we do the daughter opens adding each to .META. can fail in
    // various interesting ways the most interesting of which is a timeout
    // BUT the edits all go through (See HBASE-3872).  IF we reach the PONR
    // then subsequent failures need to crash out this regionserver; the
    // server shutdown processing should be able to fix-up the incomplete split.
    // The offlined parent will have the daughters as extra columns.  If
    // we leave the daughter regions in place and do not remove them when we
    // crash out, then they will have their references to the parent in place
    // still and the server shutdown fixup of .META. will point to these
    // regions.
    // We should add the PONR JournalEntry before offlineParentInMeta, so even
    // if offlineParentInMeta times out, this will cause the regionserver to
    // exit, and then the master's ServerShutdownHandler will fix up the
    // daughters and avoid data loss. (See HBASE-4562.)
    this.journal.add(new JournalEntry(JournalEntryType.PONR));

    // Edit parent in meta.  Offlines parent region and adds splita and splitb
    // as an atomic update.  See HBASE-7721.  This update to hbase:meta is what
    // determines whether the region is considered split in case of failures.
    // If it is successful, master will roll-forward; if not, master will
    // rollback and assign the parent region.
    if (!testing && useZKForAssignment) {
      if (metaEntries == null || metaEntries.isEmpty()) {
        MetaEditor.splitRegion(server.getCatalogTracker(),
            parent.getRegionInfo(), daughterRegions.getFirst().getRegionInfo(),
            daughterRegions.getSecond().getRegionInfo(), server.getServerName());
      } else {
        offlineParentInMetaAndputMetaEntries(server.getCatalogTracker(),
            parent.getRegionInfo(), daughterRegions.getFirst().getRegionInfo(),
            daughterRegions.getSecond().getRegionInfo(), server.getServerName(),
            metaEntries);
      }
    } else if (services != null && !useZKForAssignment) {
      if (!services.reportRegionStateTransition(TransitionCode.SPLIT_PONR,
          parent.getRegionInfo(), hri_a, hri_b)) {
        // Passed PONR, let SSH clean it up
        throw new IOException("Failed to notify master that split passed PONR: "
            + parent.getRegionInfo().getRegionNameAsString());
      }
    }
    return daughterRegions;
  }
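  // Sketch of the hbase:meta edit made at the PONR above (illustrative; the
  // exact row layout is assumed from MetaEditor.splitRegion): the parent's row
  // is rewritten offline=true/split=true with info:splitA and info:splitB
  // columns carrying the daughter HRegionInfos, and a new row is added per
  // daughter; per HBASE-7721 the whole batch is applied to hbase:meta
  // atomically.  addLocation() (further below) additionally fills each
  // daughter row's server-location columns.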
  public PairOfSameType<HRegion> stepsBeforePONR(final Server server,
      final RegionServerServices services, boolean testing) throws IOException {
    // Set ephemeral SPLITTING znode up in zk.  Mocked servers sometimes don't
    // have zookeeper so don't do zk stuff if server or zookeeper is null
    if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
      try {
        createNodeSplitting(server.getZooKeeper(), parent.getRegionInfo(),
            server.getServerName(), hri_a, hri_b);
      } catch (KeeperException e) {
        throw new IOException("Failed creating PENDING_SPLIT znode on " +
            this.parent.getRegionNameAsString(), e);
      }
    } else if (services != null && !useZKForAssignment) {
      if (!services.reportRegionStateTransition(TransitionCode.READY_TO_SPLIT,
          parent.getRegionInfo(), hri_a, hri_b)) {
        throw new IOException("Failed to get ok from master to split "
            + parent.getRegionNameAsString());
      }
    }
    this.journal.add(new JournalEntry(JournalEntryType.SET_SPLITTING_IN_ZK));
    if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
      // After creating the split node, wait for master to transition it
      // from PENDING_SPLIT to SPLITTING so that we can move on.  We want to
      // make sure the master knows about it and won't transition any region
      // which is splitting.
      znodeVersion = getZKNode(server, services);
    }

    this.parent.getRegionFileSystem().createSplitsDir();
    this.journal.add(new JournalEntry(JournalEntryType.CREATE_SPLIT_DIR));

    Map<byte[], List<StoreFile>> hstoreFilesToSplit = null;
    Exception exceptionToThrow = null;
    try {
      hstoreFilesToSplit = this.parent.close(false);
    } catch (Exception e) {
      exceptionToThrow = e;
    }
    if (exceptionToThrow == null && hstoreFilesToSplit == null) {
      // The region was closed by a concurrent thread.  We can't continue
      // with the split, instead we must just abandon the split.  If we
      // reopen or split this could cause problems because the region has
      // probably already been moved to a different server, or is in the
      // process of moving to a different server.
      exceptionToThrow = closedByOtherException;
    }
    if (exceptionToThrow != closedByOtherException) {
      this.journal.add(new JournalEntry(JournalEntryType.CLOSED_PARENT_REGION));
    }
    if (exceptionToThrow != null) {
      if (exceptionToThrow instanceof IOException) throw (IOException)exceptionToThrow;
      throw new IOException(exceptionToThrow);
    }
    if (!testing) {
      services.removeFromOnlineRegions(this.parent, null);
    }
    this.journal.add(new JournalEntry(JournalEntryType.OFFLINED_PARENT));

    // TODO: If splitStoreFiles were multithreaded would we complete steps in
    // less elapsed time?  St.Ack 20100920
    //
    // splitStoreFiles creates daughter region dirs under the parent splits dir
    // Nothing to unroll here if failure -- clean up of CREATE_SPLIT_DIR will
    // clean this up.
    Pair<Integer, Integer> expectedReferences = splitStoreFiles(hstoreFilesToSplit);

    // Log to the journal that we are creating region A, the first daughter
    // region.  We could fail halfway through.  If we do, we could have left
    // stuff in fs that needs cleanup -- a storefile or two.  That's why we
    // add the entry to the journal BEFORE rather than AFTER the change.
    this.journal.add(new JournalEntry(JournalEntryType.STARTED_REGION_A_CREATION));
    assertReferenceFileCount(expectedReferences.getFirst(),
        this.parent.getRegionFileSystem().getSplitsDir(this.hri_a));
    HRegion a = this.parent.createDaughterRegionFromSplits(this.hri_a);
    assertReferenceFileCount(expectedReferences.getFirst(),
        new Path(this.parent.getRegionFileSystem().getTableDir(), this.hri_a.getEncodedName()));

    // Ditto
    this.journal.add(new JournalEntry(JournalEntryType.STARTED_REGION_B_CREATION));
    assertReferenceFileCount(expectedReferences.getSecond(),
        this.parent.getRegionFileSystem().getSplitsDir(this.hri_b));
    HRegion b = this.parent.createDaughterRegionFromSplits(this.hri_b);
    assertReferenceFileCount(expectedReferences.getSecond(),
        new Path(this.parent.getRegionFileSystem().getTableDir(), this.hri_b.getEncodedName()));

    return new PairOfSameType<HRegion>(a, b);
  }
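  // Background note (not in the original source): the "references" counted
  // above are not copies of data.  splitStoreFiles() typically writes, per
  // parent store file, one small reference file in each daughter naming the
  // parent file, the split row, and whether the daughter reads the top or
  // bottom half.  Real data is only rewritten by later compactions, after
  // which the parent region directory can be garbage collected.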
  void assertReferenceFileCount(int expectedReferenceFileCount, Path dir)
      throws IOException {
    if (expectedReferenceFileCount != 0 &&
        expectedReferenceFileCount != FSUtils.getRegionReferenceFileCount(
            this.parent.getFilesystem(), dir)) {
      throw new IOException("Failing split. Expected reference file count isn't equal.");
    }
  }

  /**
   * Perform time consuming opening of the daughter regions.
   * @param server Hosting server instance.  Can be null when testing (won't try
   * and update in zk if a null server)
   * @param services Used to online/offline regions.
   * @param a first daughter region
   * @param b second daughter region
   * @throws IOException If thrown, transaction failed.
   * Call {@link #rollback(Server, RegionServerServices)}
   */
  /* package */void openDaughters(final Server server,
      final RegionServerServices services, HRegion a, HRegion b)
      throws IOException {
    boolean stopped = server != null && server.isStopped();
    boolean stopping = services != null && services.isStopping();
    // TODO: Is this check needed here?
    if (stopped || stopping) {
      LOG.info("Not opening daughters " +
          b.getRegionInfo().getRegionNameAsString() +
          " and " +
          a.getRegionInfo().getRegionNameAsString() +
          " because stopping=" + stopping + ", stopped=" + stopped);
    } else {
      // Open daughters in parallel.
      DaughterOpener aOpener = new DaughterOpener(server, a);
      DaughterOpener bOpener = new DaughterOpener(server, b);
      aOpener.start();
      bOpener.start();
      try {
        aOpener.join();
        if (aOpener.getException() == null) {
          journal.add(new JournalEntry(JournalEntryType.OPENED_REGION_A));
        }
        bOpener.join();
        if (bOpener.getException() == null) {
          journal.add(new JournalEntry(JournalEntryType.OPENED_REGION_B));
        }
      } catch (InterruptedException e) {
        throw (InterruptedIOException)new InterruptedIOException().initCause(e);
      }
      if (aOpener.getException() != null) {
        throw new IOException("Failed " + aOpener.getName(), aOpener.getException());
      }
      if (bOpener.getException() != null) {
        throw new IOException("Failed " + bOpener.getName(), bOpener.getException());
      }
      if (services != null) {
        try {
          if (useZKForAssignment) {
            // add 2nd daughter first (see HBASE-4335)
            services.postOpenDeployTasks(b, server.getCatalogTracker());
          } else if (!services.reportRegionStateTransition(TransitionCode.SPLIT,
              parent.getRegionInfo(), hri_a, hri_b)) {
            throw new IOException("Failed to report split region to master: "
                + parent.getRegionInfo().getShortNameToLog());
          }
          // Should add it to OnlineRegions
          services.addToOnlineRegions(b);
          if (useZKForAssignment) {
            services.postOpenDeployTasks(a, server.getCatalogTracker());
          }
          services.addToOnlineRegions(a);
        } catch (KeeperException ke) {
          throw new IOException(ke);
        }
      }
    }
  }
  /**
   * Finish off split transaction, transition the zknode
   * @param server Hosting server instance.  Can be null when testing (won't try
   * and update in zk if a null server)
   * @param services Used to online/offline regions.
   * @param a first daughter region
   * @param b second daughter region
   * @throws IOException If thrown, transaction failed.
   * Call {@link #rollback(Server, RegionServerServices)}
   */
  /* package */void transitionZKNode(final Server server,
      final RegionServerServices services, HRegion a, HRegion b)
      throws IOException {
    // Tell master about split by updating zk.  If we fail, abort.
    if (server != null && server.getZooKeeper() != null) {
      try {
        this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
            parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
            server.getServerName(), this.znodeVersion,
            RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT);

        int spins = 0;
        // Now wait for the master to process the split.  We know it's done
        // when the znode is deleted.  The reason we keep tickling the znode is
        // that it's possible for the master to miss an event.
        do {
          if (spins % 10 == 0) {
            LOG.debug("Still waiting on the master to process the split for " +
                this.parent.getRegionInfo().getEncodedName());
          }
          Thread.sleep(100);
          // When this returns -1 it means the znode doesn't exist
          this.znodeVersion = transitionSplittingNode(server.getZooKeeper(),
              parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
              server.getServerName(), this.znodeVersion,
              RS_ZK_REGION_SPLIT, RS_ZK_REGION_SPLIT);
          spins++;
        } while (this.znodeVersion != -1 && !server.isStopped()
            && !services.isStopping());
      } catch (Exception e) {
        if (e instanceof InterruptedException) {
          Thread.currentThread().interrupt();
        }
        throw new IOException("Failed telling master about split", e);
      }
    }

    // Leaving here, the splitdir with its dross will be in place but since the
    // split was successful, just leave it; it'll be cleaned when parent is
    // deleted and cleaned up.
  }
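  // Lifecycle of the split znode as this class drives it (a summary of the
  // code above and below, not in the original source):
  //   RS creates the node -> RS_ZK_REQUEST_REGION_SPLIT  (createNodeSplitting)
  //   master moves it     -> RS_ZK_REGION_SPLITTING      (observed in getZKNode)
  //   RS transitions it   -> RS_ZK_REGION_SPLIT          (transitionZKNode)
  //   master deletes the node once it has processed the split; the RS keeps
  //   re-transitioning SPLIT -> SPLIT until the returned version is -1.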
  /**
   * Wait for the splitting node to be transitioned from pending_split
   * to splitting by master.  That's how we are sure master has processed
   * the event and is good with us to move on.  If we don't get any update,
   * we periodically transition the node so that master gets the callback.
   * If the node is removed or is not in pending_split state any more,
   * we abort the split.
   */
  private int getZKNode(final Server server,
      final RegionServerServices services) throws IOException {
    // Wait for the master to process the pending_split.
    try {
      int spins = 0;
      Stat stat = new Stat();
      ZooKeeperWatcher zkw = server.getZooKeeper();
      ServerName expectedServer = server.getServerName();
      String node = parent.getRegionInfo().getEncodedName();
      while (!(server.isStopped() || services.isStopping())) {
        if (spins % 5 == 0) {
          LOG.debug("Still waiting for master to process "
              + "the pending_split for " + node);
          transitionSplittingNode(zkw, parent.getRegionInfo(),
              hri_a, hri_b, expectedServer, -1,
              RS_ZK_REQUEST_REGION_SPLIT, RS_ZK_REQUEST_REGION_SPLIT);
        }
        Thread.sleep(100);
        spins++;
        byte [] data = ZKAssign.getDataNoWatch(zkw, node, stat);
        if (data == null) {
          throw new IOException("Data is null, splitting node "
              + node + " no longer exists");
        }
        RegionTransition rt = RegionTransition.parseFrom(data);
        EventType et = rt.getEventType();
        if (et == RS_ZK_REGION_SPLITTING) {
          ServerName serverName = rt.getServerName();
          if (!serverName.equals(expectedServer)) {
            throw new IOException("Splitting node " + node + " is for "
                + serverName + ", not us " + expectedServer);
          }
          byte [] payloadOfSplitting = rt.getPayload();
          List<HRegionInfo> splittingRegions = HRegionInfo.parseDelimitedFrom(
              payloadOfSplitting, 0, payloadOfSplitting.length);
          assert splittingRegions.size() == 2;
          HRegionInfo a = splittingRegions.get(0);
          HRegionInfo b = splittingRegions.get(1);
          if (!(hri_a.equals(a) && hri_b.equals(b))) {
            throw new IOException("Splitting node " + node + " is for " + a + ", "
                + b + ", not expected daughters: " + hri_a + ", " + hri_b);
          }
          // Master has processed it.
          return stat.getVersion();
        }
        if (et != RS_ZK_REQUEST_REGION_SPLIT) {
          throw new IOException("Splitting node " + node
              + " moved out of splitting to " + et);
        }
      }
      // Server is stopping/stopped
      throw new IOException("Server is "
          + (services.isStopping() ? "stopping" : "stopped"));
    } catch (Exception e) {
      if (e instanceof InterruptedException) {
        Thread.currentThread().interrupt();
      }
      throw new IOException("Failed getting SPLITTING znode on "
          + parent.getRegionNameAsString(), e);
    }
  }

  public PairOfSameType<HRegion> execute(final Server server,
      final RegionServerServices services) throws IOException {
    if (User.isHBaseSecurityEnabled(parent.getBaseConf())) {
      LOG.warn("Should use execute(Server, RegionServerServices, User)");
    }
    return execute(server, services, null);
  }

  /**
   * Run the transaction.
   * @param server Hosting server instance.  Can be null when testing (won't try
   * and update in zk if a null server)
   * @param services Used to online/offline regions.
   * @param user the effective user
   * @throws IOException If thrown, transaction failed.
   * Call {@link #rollback(Server, RegionServerServices)}
   * @return Regions created
   * @see #rollback(Server, RegionServerServices)
   */
  public PairOfSameType<HRegion> execute(final Server server,
      final RegionServerServices services, User user) throws IOException {
    useZKForAssignment = server == null ? true :
        ConfigUtil.useZKForAssignment(server.getConfiguration());
    PairOfSameType<HRegion> regions = createDaughters(server, services, user);
    if (this.parent.getCoprocessorHost() != null) {
      if (user == null) {
        parent.getCoprocessorHost().preSplitAfterPONR();
      } else {
        try {
          user.getUGI().doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
              parent.getCoprocessorHost().preSplitAfterPONR();
              return null;
            }
          });
        } catch (InterruptedException ie) {
          InterruptedIOException iioe = new InterruptedIOException();
          iioe.initCause(ie);
          throw iioe;
        }
      }
    }
    return stepsAfterPONR(server, services, regions, user);
  }
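  // Flow summary (not in the original source): execute() = createDaughters()
  // (everything up to and including the meta edit at the PONR), then the
  // preSplitAfterPONR coprocessor hook, then stepsAfterPONR(), which opens the
  // daughters and finishes the znode handshake.  Failures before the PONR can
  // be undone with rollback(); after it, the regionserver must crash out and
  // let the master's shutdown handling complete the split.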
  @Deprecated
  public PairOfSameType<HRegion> stepsAfterPONR(final Server server,
      final RegionServerServices services, final PairOfSameType<HRegion> regions)
      throws IOException {
    return stepsAfterPONR(server, services, regions, null);
  }

  public PairOfSameType<HRegion> stepsAfterPONR(final Server server,
      final RegionServerServices services, final PairOfSameType<HRegion> regions,
      User user) throws IOException {
    openDaughters(server, services, regions.getFirst(), regions.getSecond());
    if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
      transitionZKNode(server, services, regions.getFirst(), regions.getSecond());
    }
    journal.add(new JournalEntry(JournalEntryType.BEFORE_POST_SPLIT_HOOK));
    // Coprocessor callback
    if (this.parent.getCoprocessorHost() != null) {
      if (user == null) {
        this.parent.getCoprocessorHost().postSplit(regions.getFirst(), regions.getSecond());
      } else {
        try {
          user.getUGI().doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
              parent.getCoprocessorHost().postSplit(regions.getFirst(), regions.getSecond());
              return null;
            }
          });
        } catch (InterruptedException ie) {
          InterruptedIOException iioe = new InterruptedIOException();
          iioe.initCause(ie);
          throw iioe;
        }
      }
    }
    journal.add(new JournalEntry(JournalEntryType.AFTER_POST_SPLIT_HOOK));
    return regions;
  }

  private void offlineParentInMetaAndputMetaEntries(CatalogTracker catalogTracker,
      HRegionInfo parent, HRegionInfo splitA, HRegionInfo splitB,
      ServerName serverName, List<Mutation> metaEntries) throws IOException {
    List<Mutation> mutations = metaEntries;
    HRegionInfo copyOfParent = new HRegionInfo(parent);
    copyOfParent.setOffline(true);
    copyOfParent.setSplit(true);

    // Put for parent
    Put putParent = MetaEditor.makePutFromRegionInfo(copyOfParent);
    MetaEditor.addDaughtersToPut(putParent, splitA, splitB);
    mutations.add(putParent);

    // Puts for daughters
    Put putA = MetaEditor.makePutFromRegionInfo(splitA);
    Put putB = MetaEditor.makePutFromRegionInfo(splitB);

    addLocation(putA, serverName, 1); // these are new regions, openSeqNum = 1 is fine.
    addLocation(putB, serverName, 1);
    mutations.add(putA);
    mutations.add(putB);
    MetaEditor.mutateMetaTable(catalogTracker, mutations);
  }

  public Put addLocation(final Put p, final ServerName sn, long openSeqNum) {
    p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
        Bytes.toBytes(sn.getHostAndPort()));
    p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
        Bytes.toBytes(sn.getStartcode()));
    p.addImmutable(HConstants.CATALOG_FAMILY, HConstants.SEQNUM_QUALIFIER,
        Bytes.toBytes(openSeqNum));
    return p;
  }
  /*
   * Open daughter region in its own thread.
   * If we fail, abort this hosting server.
   */
  class DaughterOpener extends HasThread {
    private final Server server;
    private final HRegion r;
    private Throwable t = null;

    DaughterOpener(final Server s, final HRegion r) {
      super((s == null ? "null-services" : s.getServerName()) +
          "-daughterOpener=" + r.getRegionInfo().getEncodedName());
      setDaemon(true);
      this.server = s;
      this.r = r;
    }

    /**
     * @return Null if open succeeded else exception that caused us to fail
     * the open. Call it after this thread exits else you may get a wrong view
     * on the result.
     */
    Throwable getException() {
      return this.t;
    }

    @Override
    public void run() {
      try {
        openDaughterRegion(this.server, r);
      } catch (Throwable t) {
        this.t = t;
      }
    }
  }

  /**
   * Open daughter regions, add them to online list and update meta.
   * @param server
   * @param daughter
   * @throws IOException
   * @throws KeeperException
   */
  void openDaughterRegion(final Server server, final HRegion daughter)
      throws IOException, KeeperException {
    HRegionInfo hri = daughter.getRegionInfo();
    LoggingProgressable reporter = server == null ? null :
        new LoggingProgressable(hri, server.getConfiguration().getLong(
            "hbase.regionserver.split.daughter.open.log.interval", 10000));
    daughter.openHRegion(reporter);
  }

  static class LoggingProgressable implements CancelableProgressable {
    private final HRegionInfo hri;
    private long lastLog = -1;
    private final long interval;

    LoggingProgressable(final HRegionInfo hri, final long interval) {
      this.hri = hri;
      this.interval = interval;
    }

    @Override
    public boolean progress() {
      long now = EnvironmentEdgeManager.currentTimeMillis();
      if (now - lastLog > this.interval) {
        LOG.info("Opening " + this.hri.getRegionNameAsString());
        this.lastLog = now;
      }
      return true;
    }
  }

  /**
   * Creates reference files for top and bottom half of the split.
   * @param hstoreFilesToSplit map of store files to create half file references for.
   * @return the number of reference files that were created.
   * @throws IOException
   */
  private Pair<Integer, Integer> splitStoreFiles(
      final Map<byte[], List<StoreFile>> hstoreFilesToSplit)
      throws IOException {
    if (hstoreFilesToSplit == null) {
      // Could be null because close didn't succeed -- for now consider it fatal
      throw new IOException("Close returned empty list of StoreFiles");
    }
    // The following code sets up a thread pool executor with as many slots as
    // there are files to split.  It then fires up everything, waits for
    // completion and finally checks for any exception
    int nbFiles = 0;
    for (Map.Entry<byte[], List<StoreFile>> entry: hstoreFilesToSplit.entrySet()) {
      nbFiles += entry.getValue().size();
    }
    if (nbFiles == 0) {
      // no file needs to be split.
      return new Pair<Integer, Integer>(0, 0);
    }
    // Default max #threads to use is the smaller of table's configured number of blocking store
    // files or the available number of logical cores.
    int defMaxThreads = Math.min(parent.conf.getInt(HStore.BLOCKING_STOREFILES_KEY,
        HStore.DEFAULT_BLOCKING_STOREFILE_COUNT),
        Runtime.getRuntime().availableProcessors());
    // Max #threads is the smaller of the number of storefiles or the default max determined above.
    int maxThreads = Math.min(parent.conf.getInt(HConstants.REGION_SPLIT_THREADS_MAX,
        defMaxThreads), nbFiles);
    LOG.info("Preparing to split " + nbFiles + " storefiles for region " + this.parent +
        " using " + maxThreads + " threads");
    ThreadFactoryBuilder builder = new ThreadFactoryBuilder();
    builder.setNameFormat("StoreFileSplitter-%1$d");
    ThreadFactory factory = builder.build();
    ThreadPoolExecutor threadPool =
        (ThreadPoolExecutor) Executors.newFixedThreadPool(maxThreads, factory);
    List<Future<Pair<Path, Path>>> futures =
        new ArrayList<Future<Pair<Path, Path>>>(nbFiles);

    // Split each store file.
    for (Map.Entry<byte[], List<StoreFile>> entry: hstoreFilesToSplit.entrySet()) {
      for (StoreFile sf: entry.getValue()) {
        StoreFileSplitter sfs = new StoreFileSplitter(entry.getKey(), sf);
        futures.add(threadPool.submit(sfs));
      }
    }
    // Shutdown the pool
    threadPool.shutdown();

    // Wait for all the tasks to finish
    try {
      boolean stillRunning = !threadPool.awaitTermination(
          this.fileSplitTimeout, TimeUnit.MILLISECONDS);
      if (stillRunning) {
        threadPool.shutdownNow();
        // wait for the thread to shutdown completely.
        while (!threadPool.isTerminated()) {
          Thread.sleep(50);
        }
        throw new IOException("Took too long to split the" +
            " files and create the references, aborting split");
      }
    } catch (InterruptedException e) {
      throw (InterruptedIOException)new InterruptedIOException().initCause(e);
    }

    int created_a = 0;
    int created_b = 0;
    // Look for any exception
    for (Future<Pair<Path, Path>> future : futures) {
      try {
        Pair<Path, Path> p = future.get();
        created_a += p.getFirst() != null ? 1 : 0;
        created_b += p.getSecond() != null ? 1 : 0;
      } catch (InterruptedException e) {
        throw (InterruptedIOException) new InterruptedIOException().initCause(e);
      } catch (ExecutionException e) {
        throw new IOException(e);
      }
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Split storefiles for region " + this.parent + " Daughter A: " + created_a
          + " storefiles, Daughter B: " + created_b + " storefiles.");
    }
    return new Pair<Integer, Integer>(created_a, created_b);
  }
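  // Worked example of the pool sizing above (illustrative only): with 12 store
  // files, hbase.hstore.blockingStoreFiles = 7, 8 logical cores, and
  // HConstants.REGION_SPLIT_THREADS_MAX left unset, defMaxThreads =
  // min(7, 8) = 7 and maxThreads = min(7, 12) = 7, so the 12 reference-writing
  // tasks share a 7-thread pool bounded by the fileSplitTimeout.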
  private Pair<Path, Path> splitStoreFile(final byte[] family, final StoreFile sf)
      throws IOException {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Splitting started for store file: " + sf.getPath() +
          " for region: " + this.parent);
    }
    HRegionFileSystem fs = this.parent.getRegionFileSystem();
    String familyName = Bytes.toString(family);
    Path path_a = fs.splitStoreFile(this.hri_a, familyName, sf, this.splitrow,
        false, this.parent.getSplitPolicy());
    Path path_b = fs.splitStoreFile(this.hri_b, familyName, sf, this.splitrow,
        true, this.parent.getSplitPolicy());
    if (LOG.isDebugEnabled()) {
      LOG.debug("Splitting complete for store file: " + sf.getPath() +
          " for region: " + this.parent);
    }
    return new Pair<Path, Path>(path_a, path_b);
  }

  /**
   * Utility class used to do the file splitting / reference writing
   * in parallel instead of sequentially.
   */
  class StoreFileSplitter implements Callable<Pair<Path, Path>> {
    private final byte[] family;
    private final StoreFile sf;

    /**
     * Constructor that takes what it needs to split
     * @param family Family that contains the store file
     * @param sf which file
     */
    public StoreFileSplitter(final byte[] family, final StoreFile sf) {
      this.sf = sf;
      this.family = family;
    }

    public Pair<Path, Path> call() throws IOException {
      return splitStoreFile(family, sf);
    }
  }

  public boolean rollback(final Server server, final RegionServerServices services)
      throws IOException {
    if (User.isHBaseSecurityEnabled(parent.getBaseConf())) {
      LOG.warn("Should use rollback(Server, RegionServerServices, User)");
    }
    return rollback(server, services, null);
  }

  /**
   * @param server Hosting server instance (May be null when testing).
   * @param services Used to online/offline regions.
   * @param user the effective user
   * @throws IOException If thrown, rollback failed.  Take drastic action.
   * @return True if we successfully rolled back, false if we got to the point
   * of no return and so now need to abort the server to minimize damage.
   */
  @SuppressWarnings("deprecation")
  public boolean rollback(final Server server, final RegionServerServices services,
      User user) throws IOException {
    // Coprocessor callback
    if (this.parent.getCoprocessorHost() != null) {
      if (user == null) {
        this.parent.getCoprocessorHost().preRollBackSplit();
      } else {
        try {
          user.getUGI().doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
              parent.getCoprocessorHost().preRollBackSplit();
              return null;
            }
          });
        } catch (InterruptedException ie) {
          InterruptedIOException iioe = new InterruptedIOException();
          iioe.initCause(ie);
          throw iioe;
        }
      }
    }

    boolean result = true;
    ListIterator<JournalEntry> iterator =
        this.journal.listIterator(this.journal.size());
    // Iterate in reverse.
    while (iterator.hasPrevious()) {
      JournalEntry je = iterator.previous();
      switch (je.type) {
      case SET_SPLITTING_IN_ZK:
        if (server != null && server.getZooKeeper() != null && useZKForAssignment) {
          cleanZK(server, this.parent.getRegionInfo());
        } else if (services != null && !useZKForAssignment
            && !services.reportRegionStateTransition(TransitionCode.SPLIT_REVERTED,
                parent.getRegionInfo(), hri_a, hri_b)) {
          return false;
        }
        break;

      case CREATE_SPLIT_DIR:
        this.parent.writestate.writesEnabled = true;
        this.parent.getRegionFileSystem().cleanupSplitsDir();
        break;

      case CLOSED_PARENT_REGION:
        try {
          // So, this returns a seqid but if we just closed and then reopened, we
          // should be ok. On close, we flushed using sequenceid obtained from
          // hosting regionserver so no need to propagate the sequenceid returned
          // out of initialize below up into regionserver as we normally do.
          // TODO: Verify.
          this.parent.initialize();
        } catch (IOException e) {
          LOG.error("Failed rolling back CLOSED_PARENT_REGION of region " +
              this.parent.getRegionNameAsString(), e);
          throw new RuntimeException(e);
        }
        break;

      case STARTED_REGION_A_CREATION:
        this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_a);
        break;

      case STARTED_REGION_B_CREATION:
        this.parent.getRegionFileSystem().cleanupDaughterRegion(this.hri_b);
        break;

      case OFFLINED_PARENT:
        if (services != null) services.addToOnlineRegions(this.parent);
        break;

      case PONR:
        // We got to the point-of-no-return so we need to just abort. Return
        // immediately.  Do not clean up created daughter regions.  They need
        // to be in place so we don't delete the parent region mistakenly.
        // See HBASE-3872.
        return false;

      // Informational only cases
      case STARTED:
      case PREPARED:
      case BEFORE_PRE_SPLIT_HOOK:
      case AFTER_PRE_SPLIT_HOOK:
      case BEFORE_POST_SPLIT_HOOK:
      case AFTER_POST_SPLIT_HOOK:
      case OPENED_REGION_A:
      case OPENED_REGION_B:
        break;

      default:
        throw new RuntimeException("Unhandled journal entry: " + je);
      }
    }
    // Coprocessor callback
    if (this.parent.getCoprocessorHost() != null) {
      if (user == null) {
        this.parent.getCoprocessorHost().postRollBackSplit();
      } else {
        try {
          user.getUGI().doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
              parent.getCoprocessorHost().postRollBackSplit();
              return null;
            }
          });
        } catch (InterruptedException ie) {
          InterruptedIOException iioe = new InterruptedIOException();
          iioe.initCause(ie);
          throw iioe;
        }
      }
    }
    return result;
  }
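  // Example rollback walk (illustrative, not in the original source): if the
  // split failed right after the parent was closed, the journal might read
  //   [STARTED, PREPARED, BEFORE_PRE_SPLIT_HOOK, AFTER_PRE_SPLIT_HOOK,
  //    SET_SPLITTING_IN_ZK, CREATE_SPLIT_DIR, CLOSED_PARENT_REGION]
  // and the reverse iteration above reopens the parent via initialize(), wipes
  // the temporary splits dir, and deletes the SPLITTING znode, returning true.
  // Had the journal reached PONR, rollback() would return false instead.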
  HRegionInfo getFirstDaughter() {
    return hri_a;
  }

  HRegionInfo getSecondDaughter() {
    return hri_b;
  }

  private static void cleanZK(final Server server, final HRegionInfo hri) {
    try {
      // Only delete if it's in the expected state; could have been hijacked.
      if (!ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
          RS_ZK_REQUEST_REGION_SPLIT, server.getServerName())) {
        ZKAssign.deleteNode(server.getZooKeeper(), hri.getEncodedName(),
            RS_ZK_REGION_SPLITTING, server.getServerName());
      }
    } catch (KeeperException.NoNodeException e) {
      LOG.info("Failed cleanup zk node of " + hri.getRegionNameAsString(), e);
    } catch (KeeperException e) {
      server.abort("Failed cleanup of " + hri.getRegionNameAsString(), e);
    }
  }

  /**
   * Creates a new ephemeral node in the PENDING_SPLIT state for the specified region.
   * Create it ephemeral in case regionserver dies mid-split.
   *
   * <p>Does not transition nodes from other states. If a node already exists
   * for this region, a {@link NodeExistsException} will be thrown.
   *
   * @param zkw zk reference
   * @param region region to be created as offline
   * @param serverName server event originates from
   * @throws KeeperException
   * @throws IOException
   */
  public static void createNodeSplitting(final ZooKeeperWatcher zkw, final HRegionInfo region,
      final ServerName serverName, final HRegionInfo a,
      final HRegionInfo b) throws KeeperException, IOException {
    LOG.debug(zkw.prefix("Creating ephemeral node for " +
        region.getEncodedName() + " in PENDING_SPLIT state"));
    byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
    RegionTransition rt = RegionTransition.createRegionTransition(
        RS_ZK_REQUEST_REGION_SPLIT, region.getRegionName(), serverName, payload);
    String node = ZKAssign.getNodeName(zkw, region.getEncodedName());
    if (!ZKUtil.createEphemeralNodeAndWatch(zkw, node, rt.toByteArray())) {
      throw new IOException("Failed create of ephemeral " + node);
    }
  }
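  // Note (illustrative, not in the original source): the znode payload written
  // above and expected by transitionSplittingNode() below is simply the two
  // daughter HRegionInfos serialized back-to-back via
  // HRegionInfo.toDelimitedByteArray(a, b); getZKNode() parses it back with
  // HRegionInfo.parseDelimitedFrom() to verify the daughters match.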
  /**
   * Transitions an existing ephemeral node for the specified region which is
   * currently in the begin state to be in the end state. Master cleans up the
   * final SPLIT znode when it reads it (or if we crash, zk will clean it up).
   *
   * <p>Does not transition nodes from other states. If for some reason the
   * node could not be transitioned, the method returns -1. If the transition
   * is successful, the version of the node after transition is returned.
   *
   * <p>This method can fail and return -1 for three different reasons:
   * <ul>
   * <li>Node for this region does not exist</li>
   * <li>Node for this region is not in the begin state</li>
   * <li>After verifying the begin state, update fails because of wrong version
   * (this should never actually happen since an RS only does this transition
   * following a transition to the begin state. If two RS are conflicting, one would
   * fail the original transition to the begin state and not this transition)</li>
   * </ul>
   *
   * <p>Does not set any watches.
   *
   * <p>This method should only be used by a RegionServer when splitting a region.
   *
   * @param zkw zk reference
   * @param parent region to be transitioned to opened
   * @param a Daughter a of split
   * @param b Daughter b of split
   * @param serverName server event originates from
   * @param znodeVersion expected version of data before modification
   * @param beginState the expected current state the znode should be
   * @param endState the state to transition to
   * @return version of node after transition, -1 if unsuccessful transition
   * @throws KeeperException if unexpected zookeeper exception
   * @throws IOException
   */
  public static int transitionSplittingNode(ZooKeeperWatcher zkw,
      HRegionInfo parent, HRegionInfo a, HRegionInfo b, ServerName serverName,
      final int znodeVersion, final EventType beginState,
      final EventType endState) throws KeeperException, IOException {
    byte [] payload = HRegionInfo.toDelimitedByteArray(a, b);
    return ZKAssign.transitionNode(zkw, parent, serverName,
        beginState, endState, znodeVersion, payload);
  }

  List<JournalEntry> getJournal() {
    return journal;
  }
}




