All Downloads are FREE. Search and download functionalities are using the official Maven repository.

oracle.kv.util.Ping Maven / Gradle / Ivy

/*-
 * Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle NoSQL
 * Database made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle NoSQL Database for a copy of the license and
 * additional information.
 */

package oracle.kv.util;

import static oracle.kv.impl.util.JsonUtils.createObjectNode;
import static oracle.kv.impl.util.JsonUtils.createWriter;
import static oracle.kv.impl.util.JsonUtils.getArray;

import java.io.IOException;
import java.io.PrintStream;
import java.rmi.NotBoundException;
import java.rmi.RemoteException;
import java.rmi.registry.Registry;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import oracle.kv.KVSecurityException;
import oracle.kv.KVStoreException;
import oracle.kv.LoginCredentials;
import oracle.kv.impl.admin.AdminStatus;
import oracle.kv.impl.admin.CommandJsonUtils;
import oracle.kv.impl.admin.CommandResult;
import oracle.kv.impl.admin.CommandServiceAPI;
import oracle.kv.impl.admin.param.AdminParams;
import oracle.kv.impl.admin.param.GlobalParams;
import oracle.kv.impl.admin.param.Parameters;
import oracle.kv.impl.admin.param.StorageNodeParams;
import oracle.kv.impl.arb.ArbNodeStatus;
import oracle.kv.impl.monitor.views.ServiceChange;
import oracle.kv.impl.rep.RepNodeStatus;
import oracle.kv.impl.security.login.LoginManager;
import oracle.kv.impl.security.util.KVStoreLogin;
import oracle.kv.impl.sna.StorageNodeStatus;
import oracle.kv.impl.topo.AdminId;
import oracle.kv.impl.topo.ArbNode;
import oracle.kv.impl.topo.ArbNodeId;
import oracle.kv.impl.topo.Datacenter;
import oracle.kv.impl.topo.RepGroup;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.RepNode;
import oracle.kv.impl.topo.RepNodeId;
import oracle.kv.impl.topo.ResourceId;
import oracle.kv.impl.topo.StorageNode;
import oracle.kv.impl.topo.StorageNodeId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.util.CommandParser;
import oracle.kv.impl.util.ConfigurableService.ServiceStatus;
import oracle.kv.impl.util.FormatUtils;
import oracle.kv.impl.util.HostPort;
import oracle.kv.impl.util.JsonUtils;
import oracle.kv.impl.util.TopologyLocator;
import oracle.kv.impl.util.registry.RegistryUtils;
import oracle.kv.util.PingCollector.AdminInfo;
import oracle.kv.util.shell.ShellCommandResult;

import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectWriter;
import org.codehaus.jackson.node.ArrayNode;
import org.codehaus.jackson.node.ObjectNode;

/**
 * Pings all the RNs, SNs, and admins associated with a KVS. It is provided a
 * single storage node's host:port or a list of host:port pairs, and uses those
 * to find an RMI Registry and uses that to locate a service (RepNode or Admin)
 * that can provide a store topology.  The topology is then used to obtain
 * knowledge about all the other nodes in the store.  If an admin can be found,
 * then it is used to obtain parameters which are used to find the other
 * admins.
 *
 * The Ping utility also supports pinging a specific shard, in both the admin
 * CLI and the top level versions. In that case, status is displayed for the
 * SNs, RNs and Arbiters associated with the specified shard.
 * 
 * Description of flags:
 *  (from CommandParser):
 *  -host -- single host to use.  Must not be used if -helper-hosts is used.
 *  -port -- port to use on the single host.  Must not be used if
 *    -helper-hosts is used.
 *  -helper-hosts -- a comma-separated list of host:port pairs.  This allows
 *    the call to be more resilient to the case where an individual host may
 *    not be accessible.  Must not be used if -host and -port are specified.
 *  -shard -- Used to get status information specific to particular shard.
 *
 *
 * <p>Ping also provides utility methods to other functionality that needs to
 * ping remote services.
 *
 * <p>Here is an annotated example of the output of the ping command in JSON
 * format:
 *
 * {
 *   // Topology overview
 *   "topology" : {
 *     "storeName" : "mykvstore",
 *     "sequenceNumber" : 21,
 *     "numPartitions" : 10,
 *     "numStorageNodes" : 2,
 *     "time" : 1425658621417,
 *     "version" : "12.1.3.2.15"
 *   },
 *
 *   // Overview of status of all shards in the store
 *   "shardStatus" : {
 *     "healthy" : 2,           // Shards with all RNs/ANs active
 *     "writable-degraded" : 0, // Some inactive RNs/ANs but with quorum
 *     "read-only" : 0,         // Lost quorum but with some active RNs/ANs
 *     "offline" : 0            // No active RNs/ANs
 *   },
 *
 *   // Overview of the status of the admin, one of "healthy",
 *   // "writable-degraded", "read-only", or "offline"
 *   "adminStatus" : "healthy",
 *
 *   // Status of each zone
 *   "zoneStatus" : [ {
 *     "resourceId" : "zn1",
 *     "name" : "Boston",
 *     "type" : "PRIMARY",
 *     "rnSummaryStatus" : {
 *       "online" : 6,          // RNs in the zone that are online
 *       "offline" : 0,         // RNs in the zone that are offline
 *       "hasReplicas" : true,  // Whether the zone has non-master replicas
 *
 *       // The maximum over all replicas in the zone of the estimated delay,
 *       // in milliseconds, between when a transaction was committed on the
 *       // master and when the master learned that the transaction was
 *       // processed on a replica.
 *       "maxDelayMillis" : 1,
 *
 *       // The maximum over all replicas in the zone of the estimated time, in
 *       // seconds, until a replica eliminates its delay with the master. Uses
 *       // the maximum positive long value for replicas that are remaining
 *       // behind without catching up or falling further behind.  For replicas
 *       // that are falling behind, uses a negative value whose absolute value
 *       // is the estimated time until the delay doubles. If there are negative
 *       // values, then the maximum is the negative value nearest to zero,
 *       // which represents the replica that is falling behind most quickly.
 *       "maxCatchupTimeSecs" : 0
 *     },
 *     "anSummaryStatus" : {
 *     "online" : 2,
 *     "offline" : 0
 *   }
 *   } ],
 *
 *   // Status of each storage node
 *   "snStatus" : [ {
 *     "resourceId" : "sn1",
 *     "hostname" : "localhost",
 *     "registryPort" : 5001,
 *     "zone" : {
 *       "resourceId" : "zn1",
 *       "name" : "Boston",
 *       "type" : "PRIMARY"     // PRIMARY or SECONDARY
 *     },
 *
 *     // The service status of an admin or replication node, with values from
 *     // the constant values of the ConfigurableService.ServiceStatus enum.
 *     // One of "STARTING", "WAITING_FOR_DEPLOY", "RUNNING", "STOPPING",
 *     // "STOPPED", "ERROR_RESTARTING", "ERROR_NO_RESTART", or "UNREACHABLE".
 *     "serviceStatus" : "RUNNING",
 *
 *     "version" :
 *     "12cR1.3.2.15 2015-03-06 04:16:48 UTC  Build id: 65a27a9d8a1b+",
 *
 *     // Status of the admin node on this storage node, missing if no admin
 *     "adminStatus" : {
 *       "resourceId" : "admin1",
 *       "status" : "RUNNING",
 *
 *       // The replication state of the node, with values from the constant
 *       // values of the ReplicatedEnvironment.State enum.  One of "DETACHED",
 *       // "UNKNOWN", "MASTER", or "REPLICA".
 *       "state" : "MASTER",
 *
 *       // If the node is the master, whether the node is known to be the
 *       // authoritative master: one that is in active contact with a quorum
 *       // of replicas
 *       "authoritativeMaster" : true
 *     },
 *
 *     // Status of each replication node
 *     "rnStatus" : [ {
 *       "resourceId" : "rg1-rn1",
 *       "status" : "RUNNING",
 *       "state" : "MASTER",
 *       "authoritativeMaster" : true,
 *       "sequenceNumber" : 37,
 *       "haPort" : 5003
 *     }, {
 *       "resourceId" : "rg1-rn3",
 *       "status" : "RUNNING",
 *       "state" : "REPLICA",
 *       "sequenceNumber" : 37,
 *       "haPort" : 5005,
 *
 *       // Whether the node is performing a network restore
 *       "networkRestoreUnderway" : false,
 *
 *       // The estimated delay, in milliseconds, between when a transaction
 *       // was committed on the master and when the master learned that the
 *       // transaction was processed on the replica.  Missing if this node is
 *       // a master or if the value is not known.
 *       "delayMillis" : 0,
 *
 *       // The estimated time, in seconds, until the replica eliminates
 *       // its delay with the master. Set to the maximum positive long value
 *       // if the replica is not catching up. For replicas that are falling
 *       // behind, set to a negative value whose absolute value is the
 *       // estimated time until the delay doubles.  Missing if this node is a
 *       // master or if the value is not known.
 *       "catchupTimeSecs" : 0,
 *
 *       // The estimated rate, in milliseconds per minute, that the replica is
 *       // reducing its delay with the master.  Set to a negative value if the
 *       // replica is falling behind.  Missing if this node is a master or if
 *       // the value is not known.
 *       "catchupRateMillisPerMinute" : 0
 *     },
 *     // ...
 *   },
 *   //...
 *       } ],
 *     "anStatus" : [ {
 *     "resourceId" : "rg1-an1",
 *     "status" : "RUNNING",
 *     "state" : "REPLICA",
 *     "sequenceNumber" : 0,
 *     "haPort" : "5021"
 *   }
 *   ]
 *
 *
 *   // Status of each arbiter node
 *    {
 *    "resourceId" : "rg3-an1",
 *    "status" : "RUNNING",
 *    "state" : "REPLICA",
 *   "sequenceNumber" : 0,
 *   "haPort" : "5042"
 * }
 *
 *   // Exit code and result code for command is displayed in the json
 *   // report as well as being used as the process exit code. See a
 *   // description of the exit values below
 *    "operation" : "ping",
 *    "return_code" : 5000,
 *    "description" : "No errors found",
 *    "exit_code" : 0
 *   }
 * }
 * 
*

 * The process exit code can be used as a diagnostic to determine the state of
 * the store. Each exit code is paired with a NoSQL error code, as defined in
 * the NoSQL error messages catalog. TBW - write up the exit codes.
 * The exit codes have the following meaning:
 *
 * 0 (EXIT_OK) -- all services in the store could be located and are in a
 * known, good state (e.g. RUNNING).
 *
 * 1 (EXIT_OPERATIONAL) -- one or more services in the store could not be
 * reached, or are in an unknown or not usable state. In this case the store
 * should support all operations across all shards, as well as for the admin,
 * but may be in a state of degraded performance. Some action should be
 * taken to find and fix the problem before part of the store becomes
 * unavailable.
 *
 * 2 (EXIT_NO_ADMIN_QUORUM) -- the admin replication group does not have
 * quorum or is not available at all. The store supports all normal data
 * operations despite the loss of admin quorum, but this state requires
 * immediate attention to restore full store capabilities.
 *
 * 3 (EXIT_NO_SHARD_QUORUM) -- one or more of the shards does not have quorum
 * and either cannot accept write requests, or is completely unavailable.
 * This state requires immediate attention to restore store capabilities.
 * This exit code takes precedence over EXIT_NO_ADMIN_QUORUM, so if this
 * exit code is used it is possible that the admin capabilities are also
 * reduced or unavailable.
 *
 * Any of the exit codes 1-3 above could be indicative of a network
 * connectivity issue, and that should be checked first, before concluding
 * that any services have a problem.
 *
 * 100 (EXIT_USAGE) -- a usage error
 *
 * 101 (EXIT_TOPOLOGY_FAILURE) -- Ping was unable to find a
 * Topology in order to operate. This could be a store problem, a network
 * problem, or it could be a usage problem with the parameters passed to
 * Ping (e.g. the host and port or helper-hosts list are not part of a store).
 *
* 102 (EXIT_UNEXPECTED) -- the utility has experienced an unexpected error. * * Note that each Ping instance accumulates state after each ping call * and should be used a single time. */ public class Ping { private static final int MAX_N_THREADS = 10; /* External commands, for "java -jar" usage. */ public static final String COMMAND_NAME = "ping"; public static final String COMMAND_DESC = "attempts to contact a store to get status of running services"; private static final String HELPER_HOSTS_FLAG = "-helper-hosts"; public static final String COMMAND_ARGS = CommandParser.getHostUsage() + " " + CommandParser.getPortUsage() + " or\n\t" + HELPER_HOSTS_FLAG + " \n\t" + CommandParser.getUserUsage() + "\n\t" + CommandParser.getSecurityUsage() + "\n\t" + CommandParser.getRegOpenTimeoutUsage() + "\n\t" + CommandParser.getRegReadTimeoutUsage() + "\n\t" + CommandParser.optional(CommandParser.JSON_FLAG); static final String EXIT_CODE_FIELD_V1 = "exit_code"; static final String EXIT_CODE_FIELD = "exitCode"; /* * The possible return codes for the Ping utility, obeying the Unix * convention that a non-zero value is an abnormal termination. */ public static enum ExitCode { EXIT_OK(0, ErrorMessage.NOSQL_5000, "No errors found"), EXIT_OPERATIONAL(1, ErrorMessage.NOSQL_5301, "Store is operational but some services are " + "unavailable"), EXIT_NO_ADMIN_QUORUM(2, ErrorMessage.NOSQL_5302, "All data operations are full available but " + "administrative changes are disabled"), EXIT_NO_SHARD_QUORUM(3, ErrorMessage.NOSQL_5303, "One or more shards cannot accept write " + "operations"), EXIT_USAGE(100, ErrorMessage.NOSQL_5100, "Usage error"), EXIT_TOPOLOGY_FAILURE(101, ErrorMessage.NOSQL_5304, "Topology cannot be found"), EXIT_UNEXPECTED(102, ErrorMessage.NOSQL_5500, "Internal error"); /* A value that is a valid Unix return code, in the range of 0-127 */ private final int returnCode; /* * A value that is an Oracle NoSQL error code, which maps into the * error message catalog. 
*/ private final ErrorMessage errorCode; private final String description; ExitCode(int returnCode, ErrorMessage errorCode, String description) { this.returnCode = returnCode; this.errorCode = errorCode; this.description = description; } public int value(){ return this.returnCode; } public ErrorMessage getErrorCode() { return errorCode; } public String getDescription() { return description; } } /** Returns the status for an Admin, or null if not known. */ /* TODO: Replace with Function in Java 8 */ public interface AdminStatusFunction { AdminStatus get(AdminId adminId); } /** Returns the status for an RN, or null if not known. */ /* TODO: Replace with Function in Java 8 */ public interface RepNodeStatusFunction { RepNodeStatus get(RepNode rn); } /** Returns the status for an AN, or null if not known. */ public interface ArbNodeStatusFunction { ArbNodeStatus get(ArbNode an); } /* Ping will find a topology and params to direct its searches */ private Topology topo; private final Parameters params; private final boolean showHidden; private final int jsonVersion; private final PrintStream std; private final PrintStream err; /* * 1. When Ping is used in admin CLI, internal login manager will be * passed in. No need to use login credentials, it will be null. * 2. When Ping is used in ping collector service, internal login manager * will be passed in. No need to use login credentials, it will be null. * 3. When Ping is used by Java command line, login credentials will be * passed in, use the login credentials to create login manager for proper * access. */ private final LoginManager loginManager; private final LoginCredentials loginCreds; private PingCollector collector; /* * The problem report stores information that can be used for follow-on * action. For example, if a component in the store couldn't be contacted, * the problem report lists enough information so the caller can run a * network connectivity check against that hostport. 
*/ private final List problemReport = Collections.synchronizedList(new ArrayList()); /* * exitCode holds the Unix system exitcode that will be returned by * Ping. It acts as a summary, or signal, of what happened in the call, * giving the user a basic green (ok), yellow (something wrong), red (big * problem) type health check. ExitCode should be set via any path that * leads to an exit. Initialize it here to indicate an unexpected, internal * error. If all goes as it should, the return code will be set to a proper * value by follow on processing. If there is a bug in the utility, and * the exitCode is not set, this initialization will point out the * problem. */ private ExitCode exitCode = ExitCode.EXIT_UNEXPECTED; /** * {@literal * Usage: java -jar KVHOME/lib/kvstore.jar ping * -host -port or * -helper-hosts * -username * -security * [-json] [-shard rgX] * } */ public static void main(String[] args) { class PingParser extends CommandParser { /* * This is an internal flag used by tests to allow them to * call Ping.main() without exiting. */ private final String DONT_EXIT_FLAG = "-no-exit"; private String helperHosts = null; private boolean dontExit = false; private boolean showHidden = false; private final String SHARD_EXIST_FLAG = "-shard"; private RepGroupId shard = null; PingParser(String[] args1) { super(args1); } @Override public void usage(String errorMsg) { /* * Note that you can't really test illegal arguments in a * threaded unit test -- the call to exit(..) when * dontExit is false doesn't kill the process, and the error * message gets lost. Still worth using dontExit so the * unit test process doesn't die, but unit testing of bad * arg handling has to happen with a process. 
*/ if (!getJson()) { if (errorMsg != null) { System.err.println(errorMsg); } System.err.println(KVSTORE_USAGE_PREFIX + COMMAND_NAME + "\n\t" + COMMAND_ARGS); } exit(dontExit, errorMsg, ExitCode.EXIT_USAGE, getJsonVersion(), System.err); } @Override protected boolean checkArg(String arg) { if (arg.equals(HELPER_HOSTS_FLAG)) { helperHosts = nextArg(arg); return true; } if (arg.equals(DONT_EXIT_FLAG)) { dontExit = true; return true; } if (arg.equals(HIDDEN_FLAG)) { showHidden = true; return true; } if (arg.equals(SHARD_EXIST_FLAG)) { shard = RepGroupId.parse(nextArg(arg)); return true; } return false; } private String getHelperHosts() { return helperHosts; } private boolean getDontExit() { return dontExit; } private RepGroupId getShard() { return shard; } @Override protected void verifyArgs() { /* Check that one or more helper hosts are supplied */ if (getHelperHosts() != null && (getHostname() != null || (getRegistryPort() != 0))) { usage("Only one of either " + HELPER_HOSTS_FLAG + " or " + HOST_FLAG + " plus " + PORT_FLAG + "may be specified"); } if (getHelperHosts() == null) { if (getHostname() == null) { missingArg(HOST_FLAG); } if (getRegistryPort() == 0) { missingArg(PORT_FLAG); } } else { /* * Helper hosts have been supplied - validate the * argument. */ try { validateHelperHosts(getHelperHosts()); } catch (IllegalArgumentException e) { usage("Illegal value for " + HELPER_HOSTS_FLAG ); } } } /** * Validate that each helper host entry in the form * : */ private void validateHelperHosts(String helperHostVal) throws IllegalArgumentException { if (helperHostVal == null) { throw new IllegalArgumentException ("helper hosts cannot be null"); } String[] hosts = helperHostVal.split(","); HostPort.parse(hosts); } /** * Return a list of hostport strings. Assumes that an argument * to helperHosts has already been validated. 
*/ List createHostPortList() { String[] hosts = null; if (helperHosts != null) { hosts = helperHosts.split(","); } else { hosts = new String[1]; hosts[0] = getHostname() + ":" + getRegistryPort(); } HostPort[] hps = HostPort.parse(hosts); List hpList = new ArrayList(); for (HostPort hp : hps) { hpList.add(hp.toString()); } return hpList; } } PingParser pp = new PingParser(args); try { pp.parseArgs(); } catch (Exception e) { exit(pp.getDontExit(), "Argument error: " + e.getMessage(), ExitCode.EXIT_USAGE, CommandParser.getJsonVersion(args), System.err); return; } final KVStoreLogin storeLogin = new KVStoreLogin(pp.getUserName(), pp.getSecurityFile()); try { storeLogin.loadSecurityProperties(); } catch (IllegalArgumentException iae) { exit(pp.getDontExit(), iae.getMessage(), ExitCode.EXIT_USAGE, pp.getJsonVersion(), System.err); return; } LoginCredentials loginCreds = null; if (storeLogin.foundTransportSettings()) { storeLogin.prepareRegistryCSF(pp.getRegistryOpenTimeout(), pp.getRegistryReadTimeout()); try { loginCreds = storeLogin.makeShellLoginCredentials(); } catch (IOException ioe) { exit(pp.getDontExit(), "Failed to get login credentials: " + ioe.getMessage(), ExitCode.EXIT_USAGE, pp.getJsonVersion(), System.err); return; } catch (IllegalArgumentException iae) { exit(pp.getDontExit(), iae.getMessage(), ExitCode.EXIT_USAGE, pp.getJsonVersion(), System.err); return; } } Ping ping = null; List hostports = pp.createHostPortList(); try { ping = new Ping(hostports, pp.showHidden, pp.getJsonVersion(), System.out, System.err, loginCreds); } catch (KVStoreException e) { /* * Has problems getting the topology or params. * For backward compatibility, see if we should generate the old * no-SNA-exists but isn't deployed yet message. Remove this when * the SNA status command is implemented that will give the user a * way to get a more understandable message that says the SNA * exists, but isn't yet deployed. */ if (!pp.getJson()) { /* Remove this when SNA status exists! 
*/ checkIfSNAIsDeployed(hostports.get(0)); } exit(pp.getDontExit(), "Can't find store topology: " + e.getMessage(), ExitCode.EXIT_TOPOLOGY_FAILURE, pp.getJsonVersion(), System.err); return; } catch (KVSecurityException e) { /* Couldn't authenticate to a secure store. */ exit(pp.getDontExit(), "Access issue: " + e.getMessage(), ExitCode.EXIT_USAGE, pp.getJsonVersion(), System.err); return; } /* Verify if specified shard exists in topology */ if (pp.getShard() != null && ping.topo.get(pp.getShard()) == null) { throw new IllegalArgumentException( "Shard " + pp.getShard() + " does not exist. " + "Use show topology to find the available shards."); } /* * Ping all the components in the store, analyze the status, and * display the results. * * If shard flag is specified then ping SNs, RNs and Arbiter associated * with shard, analyze the status and display the results. */ ping.pingTopology(pp.getShard()); /* * Be sure not to specify an exit message here - all information * should have been displayed via pingTopology. */ exitNoDisplay(pp.getDontExit(), ping.exitCode); } /** * For use cases where the caller has a topology in hand already. * @param loginManager can be RepNodeLoginManager or InternalLoginManager */ private Ping(Topology topo, Parameters params, boolean showHidden, int jsonVersion, PrintStream ps, LoginManager loginManager) { this.topo = topo; this.params = params; this.showHidden = showHidden; this.jsonVersion = jsonVersion; this.std = ps; this.err = ps; this.loginManager = loginManager; this.loginCreds = null; } /** * For use cases where the utility must find a topology. 
* @throws KVStoreException */ public Ping(List hostPorts, boolean showHidden, int jsonVersion, PrintStream std, PrintStream err, LoginCredentials loginCreds) throws KVStoreException { this.showHidden = showHidden; this.jsonVersion = jsonVersion; this.std = std; this.err = err; /* Search available SNs for a topology */ String[] hostPortsArray = new String[hostPorts.size()]; hostPortsArray = hostPorts.toArray(hostPortsArray); if (loginCreds == null) { loginManager = null; } else { loginManager = KVStoreLogin.getRepNodeLoginMgr(hostPortsArray, loginCreds, null); } this.loginCreds = loginCreds; topo = findTopo(hostPortsArray); params = findParams(); } /** * For use cases where the utility must find a topology. * @param loginManager can be RepNodeLoginManager or InternalLoginManager * @throws KVStoreException */ public Ping(List hostPorts, boolean showHidden, int jsonVersion, PrintStream ps, LoginManager loginManager) throws KVStoreException { this.showHidden = showHidden; this.jsonVersion = jsonVersion; this.std = ps; this.err = ps; this.loginManager = loginManager; this.loginCreds = null; /* Search available SNs for a topology */ String[] hostPortsArray = new String[hostPorts.size()]; hostPortsArray = hostPorts.toArray(hostPortsArray); topo = findTopo(hostPortsArray); params = findParams(); } /** * For callers who prefer to use Ping over TopologyLocator, because Ping * searches both RNs and Admin Services in search of a topology. * @throws KVStoreException if a topology can't be found. */ public static Topology findTopology(String hostname, int port) throws KVStoreException { List helpers = new ArrayList(); helpers.add(new HostPort(hostname, port).toString()); Ping ping = new Ping(helpers, false, // showHidden -1, System.out, System.err, (LoginCredentials) null); return ping.getTopology(); } public Topology getTopology() { return topo; } public PingCollector getPingCollector() { return collector; } /** * Used by PingCommand, invoked within the Admin CLI. 
A topology is * provided since it's already connected to an Admin, no need to find one. * * Providing a static convenience method reinforces the fact that since * each Ping instance accumulates state, each Ping instance should be * used for a single Ping call. * @param loginManager can be RepNodeLoginManager or InternalLoginManager * @param shard if non-null, shows particular shard specific information */ public static void pingTopology(Topology topo, Parameters params, boolean showHidden, int jsonVersion, PrintStream ps, LoginManager loginManager, RepGroupId shard) { Ping p = new Ping(topo, params, showHidden, jsonVersion, ps, loginManager); p.pingTopology(shard); } /** * Ping all the SNs, RNs, and optionally admins, that make up the topology, * and print results to the specified PrintStream. If params is non-null, * it will be used to discover admins, otherwise admins will not be * included. * * If shard has been specified then ping SNs, RNs and Arbiter associated * with specific shard and and print results to the specified PrintStream. */ public void pingTopology(RepGroupId shard) { if (topo == null) { return; } /* TODO: Modify PingCollector to filter by shard, for efficiency. */ /* Request status from each service in the store. */ collector = new PingCollector(topo, params, loginManager); /* * Analyze the service status and generate an exist code. The exit code * acts as a summary of the findings, which indicates whether the * caller needs to take action. */ exitCode = analyzeStatus(); PrintStream ps = std; if (exitCode.value() != 0) { ps = err; } /* * Create a JSON node that displays the ping results. The conversion to * json is not just a direct translation into a display format, it also * aggregates the information and creates overviews and summaries. 
*/ ObjectNode jsonTop; jsonTop = convertStatusToJson(shard); if (jsonVersion == CommandParser.JSON_V2) { final PingResult result = new PingResult(exitCode, null); final ShellCommandResult scr = ShellCommandResult.getDefault("ping"); scr.setReturnCode(result.getErrorCode()); scr.setDescription(result.getDescription()); jsonTop.put(EXIT_CODE_FIELD, result.getExitCode().value()); if (showHidden) { problemReport.addAll(collector.getProblems()); addProblemReport(jsonTop); } scr.setReturnValue(jsonTop); try { ps.println(scr.convertToJson()); } catch (IOException e) { err.println(scr.getConversionErrorJsonResult(e)); } } else if (jsonVersion == CommandParser.JSON_V1) { /* * Json mode only - display the exit code and any prescriptive * information. This is meant for scripting support, and therefore * is only available in json. * * Add information about the return code in the standard json * format for all CLIs. */ createResultsJson(jsonTop, new PingResult(exitCode, null)); /* * Add information about any problems found. This feature is * currently undocumented, only available via json, and is * currently governed by hidden. 
*/ if (showHidden) { problemReport.addAll(collector.getProblems()); addProblemReport(jsonTop); } final ObjectWriter writer = createWriter(true /* pretty */); try { ps.println(writer.writeValueAsString(jsonTop)); } catch (IOException e) { err.println(e); } } else { ps.print("Pinging components of "); ps.println(PingDisplay.displayTopologyOverview(jsonTop)); if (shard == null) { ps.println(PingDisplay.displayShardOverview(jsonTop)); } else { ps.println(PingDisplay.displaySpecificShardOverview(jsonTop)); } final String adminOverview = PingDisplay.displayAdminOverview(jsonTop); if (!"".equals(adminOverview)) { ps.println(adminOverview); } for (JsonNode jsonZone : getArray(jsonTop, "zoneStatus")) { ps.println(PingDisplay.displayZoneOverview(jsonZone)); } for (JsonNode jsonSN : getArray(jsonTop, "snStatus")) { ps.println(PingDisplay.displayStorageNode(jsonSN)); final JsonNode jsonAdmin = jsonSN.get("adminStatus"); if (jsonAdmin != null) { ps.println(PingDisplay.displayAdmin(jsonAdmin)); } for (JsonNode jsonRN : getArray(jsonSN, "rnStatus")) { ps.println(PingDisplay.displayRepNode(jsonRN)); } for (JsonNode jsonAN : getArray(jsonSN, "anStatus")) { ps.println(PingDisplay.displayArbNode(jsonAN)); } } } } /** * Displays the status values collected from the cluster components as * JSON. In some cases, status values are simply formatted and displayed, * in others, status values are aggregated or summarized. 
*/ private ObjectNode convertStatusToJson(RepGroupId shard) { Map snMap = collector.getSNMap(); /* Group output by Storage Node */ List sns = new ArrayList(snMap.keySet()); Collections.sort(sns, new Comparator() { @Override public int compare(StorageNode o1, StorageNode o2) { return o1.getStorageNodeId().getStorageNodeId() - o2.getStorageNodeId().getStorageNodeId(); }}); /* Using the collected RN statuses, extract RN master status info */ final Map rnMap = collector.getRNMap(); final Map masterStatusMap = new HashMap(); for (Entry e : rnMap.entrySet()) { final RepNodeStatus status = e.getValue(); if ((status != null) && status.getReplicationState().isMaster()) { final RepNode rn = e.getKey(); masterStatusMap.put(rn.getRepGroupId(), status); } } /* * Create a JSON object and construct the ping status display. * Start by summarizing topology */ final ObjectNode jsonTop = createObjectNode(); PingDisplay.topologyOverviewToJson(topo, shard, jsonTop); /* Add admin overview to the display */ final Map adminMap = collector.getAdminMap(); if (params != null) { final AdminStatusFunction adminStatusFunc = new AdminStatusFunction() { @Override public AdminStatus get(AdminId adminId) { final AdminInfo adminInfo = adminMap.get(adminId); return adminInfo.adminStatus; } }; PingDisplay.adminOverviewToJson(params, adminStatusFunc, jsonTop); } /* * Define a RN status function to extract info for shards and zones * overviews. */ final RepNodeStatusFunction rnfunc = new RepNodeStatusFunction() { @Override public RepNodeStatus get(RepNode rn) { return rnMap.get(rn); } }; final Map anMap = collector.getANMap(); final ArbNodeStatusFunction anfunc = new ArbNodeStatusFunction() { @Override public ArbNodeStatus get(ArbNode an) { return anMap.get(an); } }; /* Add a shard overview */ PingDisplay.shardOverviewToJson(topo, rnfunc, anfunc, shard, jsonTop); /* Add zone overviews. 
*/ final ArrayNode jsonZones = jsonTop.putArray("zoneStatus"); for (final Datacenter dc : topo.getSortedDatacenters()) { jsonZones.add( PingDisplay.zoneOverviewToJson(topo, dc, rnfunc, anfunc, shard)); } /* Add SN, Admin, andRN status in SN order. */ final ArrayNode jsonSNs = jsonTop.putArray("snStatus"); for (StorageNode sn : sns) { /* * If shard is non-null then only check storage node if having * RNs associated with specified shard */ if (shard != null) { final StorageNodeId snId = sn.getStorageNodeId(); boolean rnInShard = false; for (final RepNodeId rnId : topo.getHostedRepNodeIds(snId)) { if (shard.sameGroup(rnId)) { rnInShard = true; break; } } if (!rnInShard) { continue; } } StorageNodeStatus status = snMap.get(sn); final ObjectNode jsonSN = PingDisplay.storageNodeToJson(topo, sn, status); jsonSNs.add(jsonSN); for (Entry aentry : adminMap.entrySet()) { final AdminInfo info = aentry.getValue(); if ((info != null) && sn.getStorageNodeId().equals(info.snId) && shard == null) { jsonSN.put("adminStatus", PingDisplay.adminToJson(aentry.getKey(), info.adminStatus)); break; } } final ArrayNode jsonRNs = jsonSN.putArray("rnStatus"); for (Entry rentry : rnMap.entrySet()) { final RepNode rn = rentry.getKey(); if (sn.getStorageNodeId().equals(rn.getStorageNodeId()) && (shard == null || shard.sameGroup(rn.getResourceId()))) { jsonRNs.add(PingDisplay.repNodeToJson (rn, rentry.getValue(), masterStatusMap.get(rn.getRepGroupId()), null /* expectedStatus */)); } } final ArrayNode jsonANs = jsonSN.putArray("anStatus"); for (Entry rentry : anMap.entrySet()) { final ArbNode an = rentry.getKey(); if (sn.getStorageNodeId().equals(an.getStorageNodeId()) && (shard == null || an.getRepGroupId().equals(shard))) { jsonANs.add(PingDisplay.arbNodeToJson (an, rentry.getValue(), null /* expectedStatus */)); } } } return jsonTop; } private Topology findTopo(String[] hostPortsArray) throws KVStoreException { /* Search available SNs for a topology */ Topology newtopo = null; /* * The search 
for a new topo is confined to SNs that host RNs. If * Admins live on SNs which don't host RNs, we'll be delayed in * seeing a new topo; we'd have to wait for that to be propagated to * the RNs. That's ok; by design, the system will propagate topos to * RNs in a timely fashion, and it's not worth adding complications * for the unusual case of an Admin-only SN. */ try { newtopo = TopologyLocator.get(hostPortsArray, 0, loginManager, null); } catch (KVStoreException topoLocEx) { /* had a problem getting a topology - try using the Admins */ newtopo = searchAdminsForTopo(hostPortsArray); /* Still can't find a topology */ if (newtopo == null) { throw topoLocEx; } } return newtopo; } /** * Using the topology to find SNs, find an AdminService to get some Params */ private Parameters findParams() { if (topo == null) { return null; } /* Look for admins to get parameters */ ExecutorService executor = Executors.newFixedThreadPool(MAX_N_THREADS); Collection> tasks = new ArrayList>(); for (final StorageNode sn : topo.getStorageNodeMap().getAll()) { tasks.add(new Callable() { @Override public Parameters call() throws Exception { try { final CommandServiceAPI admin = getAdmin(sn.getHostname(), sn.getRegistryPort()); return admin.getParameters(); } catch (RemoteException e) { /* * Note the problem - an Admin is registered on this SN, * but it couldn't be accessed. */ problemReport.add (new Problem(sn.getResourceId(), sn.getHostname(), sn.getRegistryPort(), "Admin Service exists on this SN " + "but ping couldn't contact it: ", e)); /* * Throw out all Exceptions to tell this task failed to * get admin parameters. */ throw e; } } }); } try { /* * Returns the admin parameter result got by the first completed * task. */ return executor.invokeAny(tasks); } catch (Exception e) { /* * If it throws Exception, that means all task failed. * Can't find any Admins, there should be some in the list. 
*/ problemReport.add(new Problem("Can't contact any Admin services " + "in the store")); return null; } finally { executor.shutdownNow(); } } /** * Given a set of SNs, find an AdminService to find a topology */ private Topology searchAdminsForTopo(String[] hostPortStrings) { final HostPort[] targetHPs = HostPort.parse(hostPortStrings); /* Look for admins to get parameters */ ExecutorService executor = Executors.newFixedThreadPool(MAX_N_THREADS); Collection> tasks = new ArrayList>(); for (final HostPort hp : targetHPs) { tasks.add(new Callable() { @Override public Topology call() throws Exception { try { final CommandServiceAPI admin = getAdmin(hp.hostname(), hp.port()); return admin.getTopology(); } catch (RemoteException e) { /* * Note the problem - an Admin is registered on this SN, * but it couldn't be accessed. */ problemReport.add (new Problem(hp.hostname(), hp.port(), "Admin Service exists on this SN " + "but ping couldn't contact it: ", e)); /* * Throw out all Exceptions to tell this task failed to * get topology. */ throw e; } } }); } try { /* * Returns the topology result got by the first completed task. */ return executor.invokeAny(tasks); } catch (Exception e) { /* * If it throws Exception, that means all task failed. * Can't find any Admins, there should be some in the list. */ problemReport.add(new Problem("Searching for topology, can't "+ "contact any Admin services in the "+ "store")); return null; } finally { executor.shutdownNow(); } } /** * Get the CommandService on this particular SN. */ private CommandServiceAPI getAdmin(String snHostname, int snRegistryPort) throws NotBoundException, RemoteException { /* * Use login manager first, if it is available. */ if (loginManager != null) { return RegistryUtils.getAdmin( snHostname, snRegistryPort, loginManager); } /* * Use login credentials to build admin login manager. 
*/ if (loginCreds != null) { return RegistryUtils.getAdmin(snHostname, snRegistryPort, KVStoreLogin.getAdminLoginMgr( snHostname, snRegistryPort, loginCreds)); } /* * Non-secure case. */ return RegistryUtils.getAdmin(snHostname, snRegistryPort, null); } /** * Look at the status information collected from the components of the * store, and analyze it in the context of how the component fits within * the topology. * * Walk the topology, looking for matching entries in the service maps. * If a service is not found or not in an expected "good" state, make * note of it. This isn't implemented to be efficient. It does a lot * of walking/scanning of structures. * * This function tracks: * - the total number of services not in a good state * - quorum for replicated services. * If a replication group (RN or admin) does not have quorum the global * noRNQuorum counter is incremented, indicating a situation that requires * immediate attention because the cluster may be partially inaccessible. * The quorum computation includes arbiters. */ private ExitCode analyzeStatus() { if (topo == null) { return ExitCode.EXIT_TOPOLOGY_FAILURE; } final Set failedAdmins = new HashSet(); final TreeMap failedShards = new TreeMap(); int numFail = 0; /* Look for failed storageNodes */ List sns = topo.getSortedStorageNodes(); final Map snMap = collector.getSNMap(); for (StorageNode sn : sns) { StorageNodeStatus snStatus = snMap.get(sn); ServiceStatus status = (snStatus == null ? ServiceStatus.UNREACHABLE : snStatus.getServiceStatus()); if (!status.equals(ServiceStatus.RUNNING)) { problemReport.add(new Problem(sn.getResourceId(), sn.getHostname(), sn.getRegistryPort(), "Unexpected status " + status)); numFail++; } } /* Does each shard have quorum? 
*/ final Map rnMap = collector.getRNMap(); final Map monitoredChanges = collector.getMonitoredChanges(); int noRNQuorum = 0; for (RepGroup rg: topo.getRepGroupMap().getAll()) { /* Figure out what the quorum size is for this shard */ Collection rns = rg.getRepNodes(); int rf = rns.size(); int numNeeded = (rf/2 + 1); int numBadInShard = 0; for (RepNode rn: rns) { RepNodeStatus rnStatus = rnMap.get(rn); ServiceStatus status = (rnStatus == null ? ServiceStatus.UNREACHABLE : rnStatus.getServiceStatus()); if (!status.equals(ServiceStatus.RUNNING)) { RepNodeId rid = rn.getResourceId(); StringBuilder sb = new StringBuilder(); sb.append("RN is not running: ").append(status); if (rnStatus == null) { ServiceChange change = monitoredChanges.get(rid); sb.append(", last known status is "); if (change == null) { sb.append("UNKNOWN"); } else { String reportTime = FormatUtils.formatDateAndTime (change.getChangeTime()); sb.append(change.getStatus()) .append(", reported at ").append(reportTime); } } StorageNode sn = topo.get(rn.getStorageNodeId()); problemReport.add(new Problem(rid, sn.getHostname(), sn.getRegistryPort(), sb.toString())); numFail++; numBadInShard++; } } int numFailAN = 0; final Map anMap = collector.getANMap(); Collection ans = rg.getArbNodes(); for (ArbNode an : ans) { ArbNodeStatus anStatus = anMap.get(an); ServiceStatus status = (anStatus == null ? 
ServiceStatus.UNREACHABLE : anStatus.getServiceStatus()); if (!status.equals(ServiceStatus.RUNNING)) { ArbNodeId aid = an.getResourceId(); StringBuilder sb = new StringBuilder(); sb.append("AN is not running: ").append(status); if (anStatus == null) { ServiceChange change = monitoredChanges.get(aid); sb.append(", last known status is "); if (change == null) { sb.append("UNKNOWN"); } else { String reportTime = FormatUtils.formatDateAndTime (change.getChangeTime()); sb.append(change.getStatus()) .append(", reported at ").append(reportTime); } } StorageNode sn = topo.get(an.getStorageNodeId()); problemReport.add(new Problem(aid, sn.getHostname(), sn.getRegistryPort(), sb.toString())); numFailAN++; numBadInShard++; } } if (numBadInShard > 0) { failedShards.put(rg.getResourceId(), numBadInShard); /* check if using arbiters */ if (rf == 2 && !ans.isEmpty()) { /* Using arbiters. */ if (numBadInShard + numFailAN > 1) { noRNQuorum++; } } else if (numBadInShard >= numNeeded) { noRNQuorum++; } } } boolean noAdminQuorum = false; if (params != null) { Map adminMap = collector.getAdminMap(); int numAdmins = params.getAdminIds().size(); int numNeeded = (numAdmins/2 + 1); for (final AdminId aid : params.getAdminIds()) { AdminInfo ainfo = adminMap.get(aid); AdminStatus adminStatus = null; if (ainfo != null) { adminStatus = ainfo.adminStatus; } ServiceStatus status = (adminStatus == null ? ServiceStatus.UNREACHABLE : adminStatus.getServiceStatus()); if (!status.equals(ServiceStatus.RUNNING)) { /* Find the host/port for this Admin. */ AdminParams ap = params.get(aid); StorageNodeId snId = ap.getStorageNodeId(); StorageNodeParams snp = params.get(snId); problemReport.add(new Problem(aid, snp.getHostname(), snp.getRegistryPort(), "Admin is not running: " + status)); failedAdmins.add(aid); numFail++; } } if (failedAdmins.size() >= numNeeded) { noAdminQuorum = true; } } else { /* * Cannot establish the health of the admins. This is a failure. 
*/ numFail++; } /* Use the generated information to figure out an exit code. */ if (noRNQuorum > 0) { return ExitCode.EXIT_NO_SHARD_QUORUM; } else if (noAdminQuorum) { return ExitCode.EXIT_NO_ADMIN_QUORUM; } else if (numFail > 0) { return ExitCode.EXIT_OPERATIONAL; } else { return ExitCode.EXIT_OK; } } /* --------- Json parsing, display support ---------- */ /** * TODO: refactor this with CommandJsonUtils.updateNodeWithResult when we * unify the admin automation work with scripting support for SNA commands * and non-admin utilities like ping. */ private static void createResultsJson(ObjectNode on, PingResult result) { if (result == null) { return; } on.put(CommandJsonUtils.FIELD_OPERATION, "ping"); on.put(CommandJsonUtils.FIELD_RETURN_CODE, result.getErrorCode()); on.put(CommandJsonUtils.FIELD_DESCRIPTION, result.getDescription()); on.put(EXIT_CODE_FIELD_V1, result.getExitCode().value()); return; } private static void displayExitJsonV1(PrintStream ps, ExitCode exitCode, String errorMsg) { /* Package up the exit results in a json node */ final ObjectNode exitStatus = createObjectNode(); PingResult pingResult = new PingResult(exitCode, errorMsg); createResultsJson(exitStatus, pingResult); /* print the json node. */ final ObjectWriter writer = createWriter(true /* pretty */); try { ps.println(writer.writeValueAsString(exitStatus)); } catch (IOException e) { ps.println(e); } } private static void displayExitJson(PrintStream ps, ExitCode exitCode, String errorMsg) { final PingResult pingResult = new PingResult(exitCode, errorMsg); final ShellCommandResult scr = ShellCommandResult.getDefault("ping"); scr.setReturnCode(pingResult.getErrorCode()); scr.setDescription(pingResult.getDescription()); final ObjectNode on = JsonUtils.createObjectNode(); on.put(EXIT_CODE_FIELD, pingResult.getExitCode().value()); scr.setReturnValue(on); try { ps.println(scr.convertToJson()); } catch (IOException e) { ps.println(e); } } /** * Create a json array containing each problem. 
Currently unadvertised. * "problems" : [ { * "component" : null, * "hostname" : null, * "port" : 0, * "description" : "Can't contact any Admin services in the store" * }, { * "component" : "sn1", * "hostname" : "localhost", * "port" : 5001, * "description" : "Unexpected status UNREACHABLE" * }, { * "component" : "sn1", * "hostname" : "localhost", * "port" : 5001, * "description" : "No RMI service for SN kvtest-oracle.kv.util.PingTest-testPing:sn1:MAIN" * } ] */ private void addProblemReport(ObjectNode resultsJson) { if (problemReport.isEmpty()) { return; } /* * Sort the display of problems; there may be more than one per * component. */ Problem[] problemArray = new Problem[problemReport.size()]; problemArray = problemReport.toArray(problemArray); Arrays.sort(problemArray); ArrayNode problemJson = resultsJson.putArray("problems"); for (Problem p : problemArray) { p.addToArrayNode(problemJson); } } private static class PingResult implements CommandResult { private final ExitCode exitCode; private final String errorMsg; PingResult(ExitCode exitCode, String errorMsg) { this.exitCode = exitCode; this.errorMsg = errorMsg; } @Override public String getReturnValue() { /* No values are returned by this command */ return null; } ExitCode getExitCode() { return exitCode; } @Override public String getDescription() { if (errorMsg == null) { return exitCode.getDescription(); } return exitCode.getDescription() + " - " + errorMsg; } @Override public int getErrorCode() { return exitCode.getErrorCode().getValue(); } @Override public String[] getCleanupJobs() { return null; } } /** * This is purely a backward compatibility method to supply the same * output for a running but not-deployed SNA that currently exists. * Replace and remove this method when the SNA status command has been * implemented and becomes the advised way to check on an individual * SNA's status. 
*/ private static void checkIfSNAIsDeployed(String hostPort) { HostPort hp = HostPort.parse(hostPort); Registry snRegistry; try { snRegistry = RegistryUtils.getRegistry(hp.hostname(), hp.port(), null /* storeName */); final List serviceNames = new ArrayList(); Collections.addAll(serviceNames, snRegistry.list()); if (serviceNames.contains(GlobalParams.SNA_SERVICE_NAME)) { /* not yet registered. */ System.err.println ("SNA at hostname: " + hp.hostname() + ", registry port: " + hp.port() + " is not registered." + "\n\tNo further information is available"); return; } } catch (RemoteException e) { System.err.println("Could not connect to registry at " + hostPort + " " + e.getMessage()); } } /** * Exit the process with the appropriate exit code, generating the * appropriate message. * @param dontExit if false, don't exit if ping is being called from a * Junit unit test, as that will kill the test. */ private static void exit(boolean dontExit, String msg, ExitCode exitCode, int jsonVersion, PrintStream ps) { if ((msg != null) && (ps != null)) { if (jsonVersion == CommandParser.JSON_V2) { displayExitJson(ps, exitCode, msg); } else if (jsonVersion == CommandParser.JSON_V1) { displayExitJsonV1(ps, exitCode, msg); } else { ps.println(msg); } } if (!dontExit) { System.exit(exitCode.value()); } } /** * Exit, but no need to generate an exit message. */ private static void exitNoDisplay(boolean dontExit, ExitCode exitCode) { exit(dontExit, null, exitCode, -1, null); } /* A class to encapsulate problem reports, keyed by a component id. 
*/
    static class Problem implements Comparable<Problem> {

        /* Name of the affected component, or null if not tied to one. */
        private final String componentName;
        private final String description;
        private final String hostname;
        private final int port;

        /** A problem with a known component at a known host and port. */
        Problem(ResourceId resourceId,
                String hostname,
                int port,
                String description) {
            this.componentName = resourceId.toString();
            this.description = description;
            this.hostname = hostname;
            this.port = port;
        }

        /**
         * A problem with a known component, with the exception's message
         * appended to the description after a space.
         */
        Problem(ResourceId resourceId,
                String hostname,
                int port,
                String description,
                Exception e) {
            this.componentName = resourceId.toString();
            this.description = description + " " + e.getMessage();
            this.hostname = hostname;
            this.port = port;
        }

        /** A store-wide problem: no component, no hostname, port 0. */
        Problem(String description) {
            this.componentName = null;
            this.description = description;
            this.hostname = null;
            this.port = 0;
        }

        /**
         * A problem at a host and port with no component name, with the
         * exception's message appended to the description after a space.
         */
        public Problem(String hostname,
                       int port,
                       String description,
                       RemoteException e) {
            this.componentName = null;
            this.description = description + " " + e.getMessage();
            this.hostname = hostname;
            this.port = port;
        }

        /** Append this problem to the given array as a new json object. */
        private void addToArrayNode(ArrayNode problemList) {
            final ObjectNode on = problemList.addObject();
            on.put("component", componentName);
            on.put("hostname", hostname);
            on.put("port", port);
            on.put("description", description);
        }

        /**
         * Order by component name. Problems without a component name sort
         * before those with one and compare as equal to each other, which
         * keeps the ordering antisymmetric as sorting requires. A null
         * argument sorts before this problem.
         *
         * Note that this ordering looks only at the component name, so it is
         * not consistent with equals.
         */
        @Override
        public int compareTo(Problem o) {
            if (o == null) {
                return 1;
            }
            if (componentName == null) {
                return (o.componentName == null) ? 0 : -1;
            }
            if (o.componentName == null) {
                return 1;
            }
            return componentName.compareTo(o.componentName);
        }
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy