org.apache.hadoop.ha.HAAdmin Maven / Gradle / Ivy
The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ha;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
/**
* A command-line tool for making calls in the HAServiceProtocol.
* For example, this can be used to force a service to standby or active
* mode, or to trigger a health-check.
*/
@InterfaceAudience.Private
public abstract class HAAdmin extends Configured implements Tool {
private static final String FORCEFENCE = "forcefence";
private static final String FORCEACTIVE = "forceactive";
/**
* Undocumented flag which allows an administrator to use manual failover
* state transitions even when auto-failover is enabled. This is an unsafe
* operation, which is why it is not documented in the usage below.
*/
private static final String FORCEMANUAL = "forcemanual";
private static final Log LOG = LogFactory.getLog(HAAdmin.class);
private int rpcTimeoutForChecks = -1;
protected final static Map USAGE =
ImmutableMap.builder()
.put("-transitionToActive",
new UsageInfo("[--"+FORCEACTIVE+"] ", "Transitions the service into Active state"))
.put("-transitionToStandby",
new UsageInfo("", "Transitions the service into Standby state"))
.put("-failover",
new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] ",
"Failover from the first service to the second.\n" +
"Unconditionally fence services if the --"+FORCEFENCE+" option is used.\n" +
"Try to failover to the target service even if it is not ready if the " +
"--" + FORCEACTIVE + " option is used."))
.put("-getServiceState",
new UsageInfo("", "Returns the state of the service"))
.put("-checkHealth",
new UsageInfo("",
"Requests that the service perform a health check.\n" +
"The HAAdmin tool will exit with a non-zero exit code\n" +
"if the check fails."))
.put("-help",
new UsageInfo("", "Displays help on the specified command"))
.build();
/** Output stream for errors, for use in tests */
protected PrintStream errOut = System.err;
protected PrintStream out = System.out;
private RequestSource requestSource = RequestSource.REQUEST_BY_USER;
protected HAAdmin() {
super();
}
protected HAAdmin(Configuration conf) {
super(conf);
}
protected abstract HAServiceTarget resolveTarget(String string);
protected Collection getTargetIds(String targetNodeToActivate) {
return new ArrayList(
Arrays.asList(new String[]{targetNodeToActivate}));
}
protected String getUsageString() {
return "Usage: HAAdmin";
}
protected void printUsage(PrintStream errOut) {
errOut.println(getUsageString());
for (Map.Entry e : USAGE.entrySet()) {
String cmd = e.getKey();
UsageInfo usage = e.getValue();
errOut.println(" [" + cmd + " " + usage.args + "]");
}
errOut.println();
ToolRunner.printGenericCommandUsage(errOut);
}
private void printUsage(PrintStream errOut, String cmd) {
UsageInfo usage = USAGE.get(cmd);
if (usage == null) {
throw new RuntimeException("No usage for cmd " + cmd);
}
errOut.println(getUsageString() + " [" + cmd + " " + usage.args + "]");
}
private int transitionToActive(final CommandLine cmd)
throws IOException, ServiceFailedException {
String[] argv = cmd.getArgs();
if (argv.length != 1) {
errOut.println("transitionToActive: incorrect number of arguments");
printUsage(errOut, "-transitionToActive");
return -1;
}
/* returns true if other target node is active or some exception occurred
and forceActive was not set */
if(!cmd.hasOption(FORCEACTIVE)) {
if(isOtherTargetNodeActive(argv[0], cmd.hasOption(FORCEACTIVE))) {
return -1;
}
}
HAServiceTarget target = resolveTarget(argv[0]);
if (!checkManualStateManagementOK(target)) {
return -1;
}
HAServiceProtocol proto = target.getProxy(
getConf(), 0);
HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
return 0;
}
/**
* Checks whether other target node is active or not
* @param targetNodeToActivate
* @return true if other target node is active or some other exception
* occurred and forceActive was set otherwise false
* @throws IOException
*/
private boolean isOtherTargetNodeActive(String targetNodeToActivate, boolean forceActive)
throws IOException {
Collection targetIds = getTargetIds(targetNodeToActivate);
targetIds.remove(targetNodeToActivate);
for(String targetId : targetIds) {
HAServiceTarget target = resolveTarget(targetId);
if (!checkManualStateManagementOK(target)) {
return true;
}
try {
HAServiceProtocol proto = target.getProxy(getConf(), 5000);
if(proto.getServiceStatus().getState() == HAServiceState.ACTIVE) {
errOut.println("transitionToActive: Node " + targetId +" is already active");
printUsage(errOut, "-transitionToActive");
return true;
}
} catch (Exception e) {
//If forceActive switch is false then return true
if(!forceActive) {
errOut.println("Unexpected error occurred " + e.getMessage());
printUsage(errOut, "-transitionToActive");
return true;
}
}
}
return false;
}
private int transitionToStandby(final CommandLine cmd)
throws IOException, ServiceFailedException {
String[] argv = cmd.getArgs();
if (argv.length != 1) {
errOut.println("transitionToStandby: incorrect number of arguments");
printUsage(errOut, "-transitionToStandby");
return -1;
}
HAServiceTarget target = resolveTarget(argv[0]);
if (!checkManualStateManagementOK(target)) {
return -1;
}
HAServiceProtocol proto = target.getProxy(
getConf(), 0);
HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
return 0;
}
/**
* Ensure that we are allowed to manually manage the HA state of the target
* service. If automatic failover is configured, then the automatic
* failover controllers should be doing state management, and it is generally
* an error to use the HAAdmin command line to do so.
*
* @param target the target to check
* @return true if manual state management is allowed
*/
private boolean checkManualStateManagementOK(HAServiceTarget target) {
if (target.isAutoFailoverEnabled()) {
if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) {
errOut.println(
"Automatic failover is enabled for " + target + "\n" +
"Refusing to manually manage HA state, since it may cause\n" +
"a split-brain scenario or other incorrect state.\n" +
"If you are very sure you know what you are doing, please \n" +
"specify the --" + FORCEMANUAL + " flag.");
return false;
} else {
LOG.warn("Proceeding with manual HA state management even though\n" +
"automatic failover is enabled for " + target);
return true;
}
}
return true;
}
private StateChangeRequestInfo createReqInfo() {
return new StateChangeRequestInfo(requestSource);
}
private int failover(CommandLine cmd)
throws IOException, ServiceFailedException {
boolean forceFence = cmd.hasOption(FORCEFENCE);
boolean forceActive = cmd.hasOption(FORCEACTIVE);
int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
final String[] args = cmd.getArgs();
if (numOpts > 3 || args.length != 2) {
errOut.println("failover: incorrect arguments");
printUsage(errOut, "-failover");
return -1;
}
HAServiceTarget fromNode = resolveTarget(args[0]);
HAServiceTarget toNode = resolveTarget(args[1]);
// Check that auto-failover is consistently configured for both nodes.
Preconditions.checkState(
fromNode.isAutoFailoverEnabled() ==
toNode.isAutoFailoverEnabled(),
"Inconsistent auto-failover configs between %s and %s!",
fromNode, toNode);
if (fromNode.isAutoFailoverEnabled()) {
if (forceFence || forceActive) {
// -forceActive doesn't make sense with auto-HA, since, if the node
// is not healthy, then its ZKFC will immediately quit the election
// again the next time a health check runs.
//
// -forceFence doesn't seem to have any real use cases with auto-HA
// so it isn't implemented.
errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
"supported with auto-failover enabled.");
return -1;
}
try {
return gracefulFailoverThroughZKFCs(toNode);
} catch (UnsupportedOperationException e){
errOut.println("Failover command is not supported with " +
"auto-failover enabled: " + e.getLocalizedMessage());
return -1;
}
}
FailoverController fc = new FailoverController(getConf(),
requestSource);
try {
fc.failover(fromNode, toNode, forceFence, forceActive);
out.println("Failover from "+args[0]+" to "+args[1]+" successful");
} catch (FailoverFailedException ffe) {
errOut.println("Failover failed: " + ffe.getLocalizedMessage());
return -1;
}
return 0;
}
/**
* Initiate a graceful failover by talking to the target node's ZKFC.
* This sends an RPC to the ZKFC, which coordinates the failover.
*
* @param toNode the node to fail to
* @return status code (0 for success)
* @throws IOException if failover does not succeed
*/
private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
throws IOException {
int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
try {
proxy.gracefulFailover();
out.println("Failover to " + toNode + " successful");
} catch (ServiceFailedException sfe) {
errOut.println("Failover failed: " + sfe.getLocalizedMessage());
return -1;
}
return 0;
}
private int checkHealth(final CommandLine cmd)
throws IOException, ServiceFailedException {
String[] argv = cmd.getArgs();
if (argv.length != 1) {
errOut.println("checkHealth: incorrect number of arguments");
printUsage(errOut, "-checkHealth");
return -1;
}
HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
getConf(), rpcTimeoutForChecks);
try {
HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
} catch (HealthCheckFailedException e) {
errOut.println("Health check failed: " + e.getLocalizedMessage());
return -1;
}
return 0;
}
private int getServiceState(final CommandLine cmd)
throws IOException, ServiceFailedException {
String[] argv = cmd.getArgs();
if (argv.length != 1) {
errOut.println("getServiceState: incorrect number of arguments");
printUsage(errOut, "-getServiceState");
return -1;
}
HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
getConf(), rpcTimeoutForChecks);
out.println(proto.getServiceStatus().getState());
return 0;
}
/**
* Return the serviceId as is, we are assuming it was
* given as a service address of form .
*/
protected String getServiceAddr(String serviceId) {
return serviceId;
}
@Override
public void setConf(Configuration conf) {
super.setConf(conf);
if (conf != null) {
rpcTimeoutForChecks = conf.getInt(
CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
}
}
@Override
public int run(String[] argv) throws Exception {
try {
return runCmd(argv);
} catch (IllegalArgumentException iae) {
errOut.println("Illegal argument: " + iae.getLocalizedMessage());
return -1;
} catch (IOException ioe) {
errOut.println("Operation failed: " + ioe.getLocalizedMessage());
if (LOG.isDebugEnabled()) {
LOG.debug("Operation failed", ioe);
}
return -1;
}
}
protected int runCmd(String[] argv) throws Exception {
if (argv.length < 1) {
printUsage(errOut);
return -1;
}
String cmd = argv[0];
if (!cmd.startsWith("-")) {
errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
printUsage(errOut);
return -1;
}
if (!USAGE.containsKey(cmd)) {
errOut.println(cmd.substring(1) + ": Unknown command");
printUsage(errOut);
return -1;
}
Options opts = new Options();
// Add command-specific options
if ("-failover".equals(cmd)) {
addFailoverCliOpts(opts);
}
if("-transitionToActive".equals(cmd)) {
addTransitionToActiveCliOpts(opts);
}
// Mutative commands take FORCEMANUAL option
if ("-transitionToActive".equals(cmd) ||
"-transitionToStandby".equals(cmd) ||
"-failover".equals(cmd)) {
opts.addOption(FORCEMANUAL, false,
"force manual control even if auto-failover is enabled");
}
CommandLine cmdLine = parseOpts(cmd, opts, argv);
if (cmdLine == null) {
// error already printed
return -1;
}
if (cmdLine.hasOption(FORCEMANUAL)) {
if (!confirmForceManual()) {
LOG.fatal("Aborted");
return -1;
}
// Instruct the NNs to honor this request even if they're
// configured for manual failover.
requestSource = RequestSource.REQUEST_BY_USER_FORCED;
}
if ("-transitionToActive".equals(cmd)) {
return transitionToActive(cmdLine);
} else if ("-transitionToStandby".equals(cmd)) {
return transitionToStandby(cmdLine);
} else if ("-failover".equals(cmd)) {
return failover(cmdLine);
} else if ("-getServiceState".equals(cmd)) {
return getServiceState(cmdLine);
} else if ("-checkHealth".equals(cmd)) {
return checkHealth(cmdLine);
} else if ("-help".equals(cmd)) {
return help(argv);
} else {
// we already checked command validity above, so getting here
// would be a coding error
throw new AssertionError("Should not get here, command: " + cmd);
}
}
private boolean confirmForceManual() throws IOException {
return ToolRunner.confirmPrompt(
"You have specified the --" + FORCEMANUAL + " flag. This flag is " +
"dangerous, as it can induce a split-brain scenario that WILL " +
"CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
"\n" +
"It is recommended not to use this flag, but instead to shut down the " +
"cluster and disable automatic failover if you prefer to manually " +
"manage your HA state.\n" +
"\n" +
"You may abort safely by answering 'n' or hitting ^C now.\n" +
"\n" +
"Are you sure you want to continue?");
}
/**
* Add CLI options which are specific to the failover command and no
* others.
*/
private void addFailoverCliOpts(Options failoverOpts) {
failoverOpts.addOption(FORCEFENCE, false, "force fencing");
failoverOpts.addOption(FORCEACTIVE, false, "force failover");
// Don't add FORCEMANUAL, since that's added separately for all commands
// that change state.
}
/**
* Add CLI options which are specific to the transitionToActive command and
* no others.
*/
private void addTransitionToActiveCliOpts(Options transitionToActiveCliOpts) {
transitionToActiveCliOpts.addOption(FORCEACTIVE, false, "force active");
}
private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
try {
// Strip off the first arg, since that's just the command name
argv = Arrays.copyOfRange(argv, 1, argv.length);
return new GnuParser().parse(opts, argv);
} catch (ParseException pe) {
errOut.println(cmdName.substring(1) +
": incorrect arguments");
printUsage(errOut, cmdName);
return null;
}
}
private int help(String[] argv) {
if (argv.length == 1) { // only -help
printUsage(out);
return 0;
} else if (argv.length != 2) {
printUsage(errOut, "-help");
return -1;
}
String cmd = argv[1];
if (!cmd.startsWith("-")) {
cmd = "-" + cmd;
}
UsageInfo usageInfo = USAGE.get(cmd);
if (usageInfo == null) {
errOut.println(cmd + ": Unknown command");
printUsage(errOut);
return -1;
}
out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
return 0;
}
protected static class UsageInfo {
public final String args;
public final String help;
public UsageInfo(String args, String help) {
this.args = args;
this.help = help;
}
}
}