All Downloads are FREE. Search and download functionalities are using the official Maven repository.

oracle.kv.impl.admin.TopologyCheck Maven / Gradle / Ivy

Go to download

NoSQL Database Server - supplies build and runtime support for the server (store) side of the Oracle NoSQL Database.

The newest version!
/*-
 * Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle NoSQL
 * Database made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle NoSQL Database for a copy of the license and
 * additional information.
 */

package oracle.kv.impl.admin;

import static oracle.kv.impl.util.ObjectUtil.checkNull;

import java.net.InetSocketAddress;
import java.rmi.NotBoundException;
import java.rmi.RemoteException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import oracle.kv.impl.admin.TopologyCheckUtils.SNServices;
import oracle.kv.impl.admin.VerifyConfiguration.CompareParamsResult;
import oracle.kv.impl.admin.VerifyConfiguration.Problem;
import oracle.kv.impl.admin.param.AdminParams;
import oracle.kv.impl.admin.param.ArbNodeParams;
import oracle.kv.impl.admin.param.GroupNodeParams;
import oracle.kv.impl.admin.param.Parameters;
import oracle.kv.impl.admin.param.RepNodeParams;
import oracle.kv.impl.admin.plan.AbstractPlan;
import oracle.kv.impl.admin.plan.PortTracker;
import oracle.kv.impl.admin.plan.task.ChangeServiceAddresses;
import oracle.kv.impl.admin.plan.task.RelocateAN;
import oracle.kv.impl.admin.plan.task.RelocateRN;
import oracle.kv.impl.admin.plan.task.Task.State;
import oracle.kv.impl.admin.plan.task.UpdateAdminParams;
import oracle.kv.impl.admin.plan.task.UpdateRepNodeParams;
import oracle.kv.impl.admin.plan.task.Utils;
import oracle.kv.impl.admin.topo.StorageDirectory;
import oracle.kv.impl.admin.topo.Validations.InsufficientRNs;
import oracle.kv.impl.arb.admin.ArbNodeAdminAPI;
import oracle.kv.impl.fault.CommandFaultException;
import oracle.kv.impl.fault.OperationFaultException;
import oracle.kv.impl.param.LoadParameters;
import oracle.kv.impl.param.ParameterMap;
import oracle.kv.impl.param.ParameterState;
import oracle.kv.impl.param.ParameterUtils;
import oracle.kv.impl.rep.admin.RepNodeAdminAPI;
import oracle.kv.impl.security.login.LoginManager;
import oracle.kv.impl.sna.StorageNodeAgentAPI;
import oracle.kv.impl.tif.TextIndexFeederManager;
import oracle.kv.impl.topo.AdminId;
import oracle.kv.impl.topo.ArbNode;
import oracle.kv.impl.topo.ArbNodeId;
import oracle.kv.impl.topo.RepGroup;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.RepNode;
import oracle.kv.impl.topo.RepNodeId;
import oracle.kv.impl.topo.ResourceId;
import oracle.kv.impl.topo.ResourceId.ResourceType;
import oracle.kv.impl.topo.StorageNodeId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.util.ConfigurableService.ServiceStatus;
import oracle.kv.impl.util.registry.RegistryUtils;
import oracle.kv.util.ErrorMessage;

import com.sleepycat.je.rep.ReplicationNetworkConfig;
import com.sleepycat.je.rep.ReplicationNode;
import com.sleepycat.je.rep.utilint.HostPortPair;

/**
 * TopologyCheck is used by VerifyConfiguration, by RepairPlan and by other
 * plan tasks that modify topology. It checks that the three representations of
 * layout metadata (the topology, the remote SN config files, and the JEHA
 * group db) are consistent with each other.
 *
 * Both RNs, ANs and Admins are checked.
 *
 * The class provides repair methods that can fix some inconsistencies.
 */
public class TopologyCheck {

    /* TODO: use kvstore param */
    private static final int JE_HA_TIMEOUT_MS = 5000;

    /*
     * The remedies for RN topology mismatches are embodied in this static map.
     * Setup the map of remedies for detected problems.
     */
    private static final Map RN_REMEDIES =
        new HashMap<>();
    static {
        initRNRemedies();
    }

    private static final Map AN_REMEDIES =
        new HashMap<>();
    static {
        initANRemedies();
    }

    /*
     * For efficiency, TopoChecker saves the config.xml and information
     * derived from the topology for use across checking various services.
     */

    /* Information about hosted services, as derived from the SN's config.xml*/
    private final Map snRemoteParams;

    /*
     * A collection of topology resource ids, grouped by hostingSN, generated
     * from the AdminDb's topology and params
     */
    private final Map topoGroupedBySN;

    /**
     * Component doing the check.
     */
    private final String who;

    private final Logger logger;

    /**
     * The topology and params are used at construction time to reorganize
     * topology information for mismatch detection.
     */
    public TopologyCheck(String who,
                         Logger logger,
                         Topology topo,
                         Parameters params) {
        this.who = who + " TopologyCheck";
        this.logger = logger;
        snRemoteParams = new HashMap<>();
        topoGroupedBySN = TopologyCheckUtils.groupServicesBySN(topo, params);
    }

    /**
     * A remote config.xml was obtained from a SN. Save for future use.
     */
    void saveSNRemoteParams(StorageNodeId snId, LoadParameters lp) {
        snRemoteParams.put(snId, processRemoteInfo(snId, lp));
    }

    /**
     * Return the union of RNs that are referenced by the SN's config.xml and
     * those referenced by the AdminDB/topology.
     */
    Set getPossibleRNs(StorageNodeId snId) {
        Set rnsToCheck = new HashSet<>();
        rnsToCheck.addAll(snRemoteParams.get(snId).getAllRNs());
        rnsToCheck.addAll(topoGroupedBySN.get(snId).getAllRepNodeIds());
        return rnsToCheck;
    }

    /**
     * Return the union of ANs that are referenced by the SN's config.xml and
     * those referenced by the AdminDB/topology.
     */
    public Set getPossibleANs(StorageNodeId snId) {
        Set ansToCheck = new HashSet<>();
        ansToCheck.addAll(snRemoteParams.get(snId).getAllARBs());
        ansToCheck.addAll(topoGroupedBySN.get(snId).getAllARBs());
        return ansToCheck;
    }

    /**
     * Checks whether the specified arbiter node should be on this SN, and
     * optionally should be enabled. For further details see comment for
     * the general checkLocation.
     */
    public Remedy checkLocation(Admin admin,
                                StorageNodeId snId,
                                ArbNodeId resId,
                                boolean calledByDeployNewRN,
                                boolean makeRNEnabled,
                                StorageNodeId oldSNId)
        throws RemoteException, NotBoundException {
        return checkLocation(admin,
                                snId,
                                resId,
                                calledByDeployNewRN,
                                makeRNEnabled,
                                oldSNId,
                                null   /* storageDirectory */ );
    }

    /**
     * Checks whether the specified replication or arbiter node should be on
     * this SN, and optionally should be enabled. Recommend a fix if
     * - the RN's location information is inconsistent, or
     * - the RN's location information is consistent, but the RN needs to be
     * re-enabled, and this optional behavior is requested.
     *
     * Assumes that the checker has been constructed with the most up to date
     * topology and params. If SN remote config params are loaded with
     * saveSNRemoteParams, assume that's also up to date.
     *
     * @param admin the admin
     * @param snId the ID of the storage node to check
     * @param resId the ID of the RN or AN to check
     * @param calledByDeployNewRN if true, the caller was deploying a new RN,
     * so if we don't see the RN in the JE HA info we know it does not appear
     * anywhere
     * @param makeRNEnabled if true, return a remedy if the RN is not enabled,
     * otherwise ignore this issue
     * @param oldSNId if not null, provides the ID of the SN that used to host
     * this RN; used to find the original SN when reverting an RN relocation
     * @param storageDirectory if not null, provides the storage directory
     * for the RN if fixing things up for an RN that has been partially
     * relocated
     * @throws NotBoundException
     * @throws RemoteException
     */
    public Remedy checkLocation(Admin admin,
                                StorageNodeId snId,
                                ResourceId resId,
                                boolean calledByDeployNewRN,
                                boolean makeRNEnabled,
                                StorageNodeId oldSNId,
                                StorageDirectory storageDirectory)
       throws RemoteException, NotBoundException {

        final boolean isRN;
        if (resId.getType().isRepNode()) {
            isRN = true;
        } else if (resId.getType().isArbNode()) {
            isRN = false;
        } else {
            throw new IllegalArgumentException("Unexpected resource ID: " +
                                               resId);
        }

        /*
         * Check that the topo, JE HA, and SN are consistent about the RN's
         * location. Assemble our three inputs:
         *  a. topo
         *  b. remote SN config
         *  c. JEHA groupDB
         */

        /* a. Get the topo's viewpoint */
        TOPO_STATUS topoStatus = TOPO_STATUS.GONE;
        SNServices servicesOnThisSN = topoGroupedBySN.get(snId);
        if (servicesOnThisSN != null) {
            if (isRN)  {
                if (servicesOnThisSN.getAllRepNodeIds().contains(resId)) {
                    topoStatus = TOPO_STATUS.HERE;
                }
            } else {
                if (servicesOnThisSN.getAllARBs().contains(resId)) {
                    topoStatus = TOPO_STATUS.HERE;
                }
            }
        }

        /*
         * b. Get the remote SN config. If we can't reach the SN, we can't do
         * any kind of check. If there are problems reaching the SN, this will
         * throw an exception.
         */
        if (snRemoteParams.get(snId) == null) {
            Topology current = admin.getCurrentTopology();
            RegistryUtils regUtils =
                new RegistryUtils(current, admin.getLoginManager());
            StorageNodeAgentAPI sna = regUtils.getStorageNodeAgent(snId);
            LoadParameters remoteParams = sna.getParams();
            snRemoteParams.put(snId, processRemoteInfo(snId, remoteParams));
        }

        /* Does the SN say that the RN is present? */
        CONFIG_STATUS configStatus = CONFIG_STATUS.GONE;
        if (snRemoteParams.get(snId).contains(resId)) {
            configStatus = CONFIG_STATUS.HERE;
        }

        /* c. Get JEHA group metadata */
        JEHAInfo jeHAInfo = getJEHAInfo(admin, resId);

        /*
         * Now that all inputs are assembled, generate a RNLocationInput that
         * serves to lookup the required fix.
         */
        final RNLocationInput remedyKey;
        if (jeHAInfo == null) {
            /* We don't have JE HA info */
            if (calledByDeployNewRN) {
                /* We know that this RN can't exist anywhere else. */
                remedyKey = new RNLocationInput(topoStatus, configStatus,
                                                OTHERSN_STATUS.GONE);
            } else {
                /*
                 * No JEHA info, so check the information gathered from the
                 * config.xmls for other SN. We may be able to conjecture
                 * whether this RN exists anywhere else. If it does show
                 * up on other SNs, we have some info.
                 */
                if (readAllSNRemoteParams(admin.getCurrentTopology(),
                                          admin.getLoginManager())) {
                    boolean found = searchOtherSNs(snId, resId);
                    OTHERSN_STATUS otherSNStatus;
                    if (found) {
                        otherSNStatus = OTHERSN_STATUS.HERE;
                    } else {
                        otherSNStatus = OTHERSN_STATUS.GONE;
                    }

                    remedyKey = new RNLocationInput(topoStatus, configStatus,
                                                  otherSNStatus);
                } else {
                    /*
                     * Couldn't reach all SNs, so there's no proxy for JEHA
                     * info. Any repairs would have to be made solely on the
                     * topology and config status.
                     */
                    remedyKey = new RNLocationInput(topoStatus, configStatus);
                }
            }
        } else {
            /*
             * Great, we have definitive JE HA status. Use that to drive the
             * repairs to make topo and config match JE HA.
             */
            JEHA_STATUS jeHAStatus = JEHA_STATUS.GONE;
            if (jeHAInfo.getSNId().equals(snId)) {
                jeHAStatus = JEHA_STATUS.HERE;
            }

            remedyKey = new RNLocationInput(topoStatus, configStatus,
                                            jeHAStatus);
        }

        /*
         * Look up a remedy. If all seems okay, we'll get an
         * OkayRemedy.FACTORY.
         */
        RNRemedyFactory remedyFactory;

        if (isRN) {
            remedyFactory = RN_REMEDIES.get(remedyKey);
        } else {
            /* Must be an AN */
            remedyFactory = AN_REMEDIES.get(remedyKey);
        }

        /* We have a problem but no remedy -- there's nothing we can do */
        if (remedyFactory == null) {
            remedyFactory = NoFixRemedy.FACTORY;
        }

        /*
         * Even if the RN's information is consistent, we may still need to
         * generate a recommendation for repair. If the RN is disabled,
         * re-enable and restart the RN.
         */
        if (remedyFactory.equals(OkayRemedy.FACTORY) &&
            topoStatus.equals(TOPO_STATUS.HERE) &&
            makeRNEnabled) {
            Parameters params = admin.getCurrentParameters();
            boolean isDisabled =
                isRN ?
                params.get((RepNodeId)resId).isDisabled() :
                params.get((ArbNodeId)resId).isDisabled();

            if (isDisabled) {
                remedyFactory = CreateRNRemedy.FACTORY;
            }
        }

        return remedyFactory.createRemedy(
            this, remedyKey, snId, resId, jeHAInfo, oldSNId, storageDirectory);
    }

    /**
     * Check whether it is possible to move this Admin from oldSN to newSN.
     * Base this decision on the Admin's JE HA group information.
     */
    public Remedy checkAdminMove(Admin admin,
                                 AdminId adminId,
                                 StorageNodeId oldSN,
                                 StorageNodeId newSN) {

        /*
         * Find all the Admins and ask them for the JEHA Group info.
         */
        JEHAInfo jeHAInfo = getAdminJEHAInfo(admin, adminId);
        if (jeHAInfo == null) {
            /*
             * We were not able to get JE HA info. This doesn't make sense,
             * because this code runs on the Admin as part of deploying a
             * topology change, so the Admin it is running on must be an Admin
             * master, and the JEHAGroup info should be available.
             */
            throw new NonfatalAssertionException
                ("Attempting to check location for admin " + adminId +
                 " but could not obtain JE HA group db info");

        }

        Parameters params = admin.getCurrentParameters();
        Topology topo = admin.getCurrentTopology();

        /*
         * Compare the
         *  1. jeHA snId for this admin
         *  2. the Admin params snid
         *  3. the remote config.xml for the new SN
         *
         * These conditions should be true:
         *  a. jeHASNId is either oldSN or newSN
         *  b. adminParamsSNID is either oldSN or newSN
         *  c. the newSN's config.xml has either no admin or this admin.
         */
        StorageNodeId jeHASNId = jeHAInfo.getSNId();
        StorageNodeId adminParamsSNId = params.get(adminId).getStorageNodeId();
        SNServices remoteInfo = readOneSNRemoteParams(topo, newSN,
                                                      admin.getLoginManager());
        boolean remoteNewSNCorrect = false;
        if (remoteInfo != null) {
            AdminId remoteAdminId = remoteInfo.getAdminId();
            remoteNewSNCorrect = ((remoteAdminId == null) ||
                                  (adminId.equals(remoteAdminId)));
        }

        if (!((jeHASNId.equals(oldSN) || jeHASNId.equals(newSN)) &&
              (adminParamsSNId.equals(oldSN) || adminParamsSNId.equals(newSN)) &&
              remoteNewSNCorrect)) {

            /* Unexpectedly, conditions a, b and c are not fulfilled. */
            return new RunRepairRemedy(adminId);
        }

        return new OkayRemedy(adminId, jeHAInfo);
    }

    /**
     * Check that the Admin's JE HA group matches the AdminDB.
     *
     * Since Admins are
     *  1. only moved as a result of migrate-sn
     *  2. by definition, quorum exists
     *  3. the source SN is shut down
     * the AdminDB is always seen as source of truth. Unlike RNs, we never
     * need to revert an Admin location back to the old SN. If the Admin's
     * JE HA group does not match the AdminDB, update its HA address.
     */
    public Remedy checkAdminLocation(Admin admin,
                                     AdminId adminId) {

        /*
         * Find all the Admins and ask them for the JEHA Group info.
         */
        JEHAInfo jeHAInfo = getAdminJEHAInfo(admin, adminId);
        if (jeHAInfo == null) {
            /* There isn't an Admin master */
            return new NoFixRemedy(adminId);
        }

        Parameters params = admin.getCurrentParameters();
        Topology topo = admin.getCurrentTopology();

        /*
         * Compare the
         *  1. jeHA snId for this admin
         *  2. the Admin params snid
         *  3. the remote config.xml for the new SN
         *
         * These conditions should be true:
         *  a. jeHASNId == adminParamsSNID
         *  b. the remote SN has this Admin in its config file.
         */
        StorageNodeId jeHASNId = jeHAInfo.getSNId();
        StorageNodeId adminParamsSNId = params.get(adminId).getStorageNodeId();
        SNServices remoteInfo = readOneSNRemoteParams(topo, adminParamsSNId,
                                                      admin.getLoginManager());
        boolean remoteSNCorrect = false;
        if ((remoteInfo != null) &&
            (adminId.equals(remoteInfo.getAdminId()))) {
            remoteSNCorrect = true;
        }

        AdminLocationInput adminLoc =
            new AdminLocationInput(jeHASNId, adminParamsSNId, remoteSNCorrect);

        if (!(jeHASNId.equals(adminParamsSNId) && remoteSNCorrect)) {

            /* Conditions a and b are not fulfilled. */
            return new FixAdminRemedy(this, adminLoc, adminId, jeHAInfo);
        }

        return new OkayRemedy(adminId, jeHAInfo);
    }

    /**
     * @return true if this RN/AN is in the config file of an SN other than this
     * one.
     */
    private boolean searchOtherSNs(StorageNodeId snId, ResourceId resId) {
        for (Map.Entry e :
                 snRemoteParams.entrySet()) {

            /* This is this SN, skip it */
            if (e.getKey().equals(snId)) {
                continue;
            }

            if (e.getValue().contains(resId)) {
                /* Found a different SN that has this RN/AN */
                return true;
            }
        }

        /* Didn't find this RN/AN on any other SN */
        return false;
    }

    /**
     * Try to get the JE HA repgroup db information for this shard.
     */
    private JEHAInfo getJEHAInfo(Admin admin, ResourceId resId) {

        RepGroupId rgId =
            new RepGroupId(Utils.getRepGroupId(resId).getGroupId());
        /*
         * Assemble a set of sockets to query by adding all the helper
         * hosts and nodehostports  from the rep node params for each
         * member of the shard.
         */
        Topology topo = admin.getCurrentTopology();
        Parameters params = admin.getCurrentParameters();
        RepGroup rg = topo.get(rgId);
        if (rg == null) {

            /*
             * Something is quite inconsistent; there's a RN in the SN that is
             * not in the topology. Give up on trying to get JE HA info.
             */
            return null;
        }

        final Set helperSockets = new HashSet<>();

        /*
         * Find the set of SNs that the topo thinks owns the RN, in order
         * to optimize the translation. The translation will look at those
         * first.
         */
        Set snCheckSet = new HashSet<>();
        for (RepNode member : rg.getRepNodes()) {
            RepNodeParams rnp = params.get(member.getResourceId());
            snCheckSet.add(rnp.getStorageNodeId());
            helperSockets.addAll
                (HostPortPair.getSockets(rnp.getJEHelperHosts()));
            helperSockets.add
                (HostPortPair.getSocket(rnp.getJENodeHostPort()));
        }

        ReplicationNetworkConfig repNetConfig = admin.getRepNetConfig();
        /* Armed with the helper sockets, see if we can find a JE Master */
        Set group =
            TopologyCheckUtils.getJEHAGroup(rgId.getGroupName(),
                                            JE_HA_TIMEOUT_MS,
                                            logger,
                                            helperSockets,
                                            repNetConfig);

        StringBuilder helperHosts = new StringBuilder();
        for (ReplicationNode rNode : group) {
            if (helperHosts.length() > 0) {
                helperHosts.append(",");
            }
            helperHosts.append(HostPortPair.getString(rNode.getHostName(),
                                                      rNode.getPort()));
        }

        for (ReplicationNode rNode : group) {
            StorageNodeId foundSNId = TopologyCheckUtils.translateToSNId
                (topo, params, snCheckSet, rNode.getHostName(),
                 rNode.getPort());

            if (foundSNId == null) {
                /*
                 * Not expected -- why is the SN referred to by the RN not
                 * in the topology?
                 */
                logger.log(Level.SEVERE, "{0} couldn''t find SN for {1}:{2}",
                       new Object[]{who, rNode.getHostName(), rNode.getPort()});
            } else if (TextIndexFeederManager.isTIFNode(rNode.getName())) {
                /* skip parse the node name if a TIF node*/
                continue;
            } else {
                ResourceId rid = null;
                try {
                    rid = RepNodeId.parse(rNode.getName());
                } catch (IllegalArgumentException ignore) {
                    rid = ArbNodeId.parse(rNode.getName());
                }

                if (rid.equals(resId)) {
                    return new JEHAInfo(foundSNId,
                                        rNode,
                                        helperHosts.toString());
                }
            }
        }
        return null;
    }

    /**
     * Try to get the JE HA repgroup db information for the Admin group.
     */
    private JEHAInfo getAdminJEHAInfo(Admin admin, AdminId adminId) {

        /*
         * Assemble a set of sockets to query by adding all the helper
         * hosts and nodehostports from the admin params.
         */
        Parameters params = admin.getCurrentParameters();

        final Set helperSockets = new HashSet<>();

        /*
         * Find the set of SNs that the topo thinks owns admins, in order to
         * optimize the translation of nodeHostPort to SN. The translation will
         * look at those first.
         */
        Set snCheckSet = new HashSet<>();
        for (AdminParams ap: params.getAdminParams()) {
            snCheckSet.add(ap.getStorageNodeId());
            helperSockets.addAll
                (HostPortPair.getSockets(ap.getHelperHosts()));
            helperSockets.add
                (HostPortPair.getSocket(ap.getNodeHostPort()));
        }

        String kvstoreName =
                admin.getCurrentParameters().getGlobalParams().getKVStoreName();
        String groupName = Admin.getAdminRepGroupName(kvstoreName);
        ReplicationNetworkConfig repNetConfig = admin.getRepNetConfig();
        /* Armed with the helper sockets, see if we can find a JE Master */
        Set group =
            TopologyCheckUtils.getJEHAGroup(groupName,
                                            JE_HA_TIMEOUT_MS,
                                            logger,
                                            helperSockets,
                                            repNetConfig);

        StringBuilder helperHosts = new StringBuilder();
        for (ReplicationNode rNode : group) {
            if (helperHosts.length() > 0) {
                helperHosts.append(",");
            }
            helperHosts.append(HostPortPair.getString(rNode.getHostName(),
                                                      rNode.getPort()));
        }

        Topology topo = admin.getCurrentTopology();
        for (ReplicationNode rNode : group) {
            StorageNodeId foundSNId = TopologyCheckUtils.translateToSNId
                (topo, params, snCheckSet, rNode.getHostName(),
                 rNode.getPort());

            if (foundSNId == null) {
                /*
                 * Not expected -- why is the SN referred to by the
                 * ReplicationNode not in the topology?
                 */
                logger.log(Level.SEVERE, "{0} couldn''t find SN for {1}:{2}",
                       new Object[]{who, rNode.getHostName(), rNode.getPort()});
            } else {
                AdminId aId = AdminId.parse(rNode.getName());
                if (aId.equals(adminId)) {
                    return new JEHAInfo(foundSNId,
                                        rNode,
                                        helperHosts.toString());
                }
            }
        }
        return null;
    }

    /**
     * Try to get remote params for all SNs in the topology. Used when we are
     * are trying to deduce what has happened without JE HA rep group db info.
     * @param loginManager
     * @return true if all SNs are found.
     */
    private boolean readAllSNRemoteParams(Topology topo,
                                          LoginManager loginManager) {

        /* Make sure each SN has a copy of its remote params fetched. */
        List allSNs = topo.getStorageNodeIds();
        for (StorageNodeId snId : allSNs) {
            if (readOneSNRemoteParams(topo, snId, loginManager) == null) {
                /* Give up, we can't guarantee that we will find all SN info */
                return false;
            }
        }
        return true;
    }

    /**
     * Read one SN's remote config file and save it in the snRemoteParams map.
     * @return the newly generated information.
     */
    private SNServices readOneSNRemoteParams(Topology topo,
                                             StorageNodeId snId,
                                             LoginManager loginManager) {
        RegistryUtils regUtils = new RegistryUtils(topo, loginManager);

        SNServices remoteInfo = snRemoteParams.get(snId);
        if (remoteInfo != null) {
            return remoteInfo;
        }

        /* Try to get params from the remote SN */
        LoadParameters remoteParams;
        try {
            StorageNodeAgentAPI sna = regUtils.getStorageNodeAgent(snId);
            remoteParams = sna.getParams();
            remoteInfo = processRemoteInfo(snId, remoteParams);
            snRemoteParams.put(snId, remoteInfo);
            logger.log(Level.INFO, "{0} loaded remote params for {1}",
                       new Object[]{who, snId});
            return remoteInfo;
        } catch (NotBoundException | RemoteException re) {
            logger.log(Level.INFO,
                      "{0} failed to reach {1} to load SN params: {2}",
                      new Object[]{who, snId, re});
        }
        return null;
    }

    /**
     * Make a table of recommendations for how to repair RN inconsistencies.
     * For context, here are the steps taken when a RN is first created by
     * DeployNewRN:
     * 1. update AdminDB
     * 2. create on new SN w/sna.createRepNode
     * 3. that ends up updating the JE HA rep group
     *
     * RNRelocate steps
     * 1. disable RN on old SN
     *    a. update AdminDB w/disable bit
     *    b. remote call to update SN config w/disable bit
     * 2. update AdminDB to move RN to new SN
     * 3. update JE HA rep group
     * 4. create RN on new SN
     * 5. delete RN on old SN
     */
    private static void initRNRemedies() {

        /*
         * When JEHA groupdb is available, give that status highest priority.
         * If it is available, change other locations to match.
         *
         *   RN is on SN      should be
         * topo|config|JEHA|  on thisSN failed task    |  remedy
         * --------------------------------------------------------------
         *  T     T     T      no problems    none
         *  T     T     F      DeployNewRN    clear from AdminDb and config
         *  T     F     T      RelocateRN     call sna.createRepNode
         *  T     F     F      DeployNewRN/   clear from AdminDB or
         *                     RelocateRN     (change AdminDB back to SN
         *                                  indicated by JE HA
         *  F     T     T      RelocateRN     re-add to AdminDB, reenable on SN
         *  F     T     F      RelocateRN     remove from this SN, call delete
         *  F     F     T     --- can't happen ---- check migrateSN
         *  F     F     F     no problems    none
         *
         * When JE HA info is not available, we can still reason what might
         * have happened by looking in the config files of other SNs.
         *
         * topo|cnfg|in other
         *     |    | SN    |
         *     |    |configs|  failed task    |  remedy
         * ----------------------------------------------------------------
         *  T     T     T        RelocateRN     eventually need to disable on
         *                                        other SN
         *  T     T     F        none           the user should re-start the
         *                                        RN
         *  T     F     T        RelocateRN     unknown, we know that
         *                                        task failed, not sure if
         *                                        after 2 or 3
         *  T     F     F    DeployNewRN or     remove from topo
         *  F     T     T        RelocateRN     eventually need to disable,
         *                                        but not safe to decide now
         *  F     T     F        RelocateRN     can't happen
         *  F     F     n/a
         */

        /* All three inputs (AdminDB, SN config, JE HA) are available */
        addRNRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.HERE,
                                        JEHA_STATUS.HERE),
                    OkayRemedy.FACTORY);
        addRNRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.HERE,
                                        JEHA_STATUS.GONE),
                    ClearAdminConfigRemedy.FACTORY);
        addRNRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.GONE,
                                        JEHA_STATUS.HERE),
                    CreateRNRemedy.FACTORY);
        addRNRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.GONE,
                                        JEHA_STATUS.GONE),
                    RevertRNRemedy.FACTORY);
        addRNRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.HERE,
                                        JEHA_STATUS.HERE),
                    RevertRNRemedy.FACTORY);
        addRNRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.HERE,
                                        JEHA_STATUS.GONE),
                    RemoveRNRemedy.FACTORY);
        addRNRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.GONE,
                                        JEHA_STATUS.HERE),
                    NoFixRemedy.FACTORY);
        addRNRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.GONE,
                                        JEHA_STATUS.GONE),
                    OkayRemedy.FACTORY);

        /*
         * JE HA GroupDB not available, but there is information about what
         * is in other remote SN config files. For these situations, try to
         * move forward, don't revert, because we don't know what the JE HA
         * situation is.
         */
        /* Relocate, needs to be disabled on the other SN */
        addRNRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.HERE,
                                        OTHERSN_STATUS.HERE),
                    DisableRNRemedy.FACTORY);

        addRNRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.HERE,
                                        OTHERSN_STATUS.GONE),
                    CreateRNRemedy.FACTORY);

        /* Don't know what to do */
        addRNRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.GONE,
                                        OTHERSN_STATUS.HERE),
                    NoFixRemedy.FACTORY);

        /* Deploy didn't finish */
        addRNRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.GONE,
                                        OTHERSN_STATUS.GONE),
                  ClearAdminConfigRemedy.FACTORY);

        /* Relocate failed, need remove this SN if disabled. */
        addRNRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.HERE,
                                        OTHERSN_STATUS.HERE),
                    NoFixRemedy.FACTORY);
        addRNRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.HERE,
                                        OTHERSN_STATUS.GONE),
                    NoFixRemedy.FACTORY); // TODO, figure out what to do
        addRNRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.GONE,
                                        OTHERSN_STATUS.GONE),
                    OkayRemedy.FACTORY);
        /* TODO: The chart above doesn't say if this case is OK -- is it? */
        addRNRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.GONE,
                                        OTHERSN_STATUS.HERE),
                    OkayRemedy.FACTORY);
    }

    /**
     * Make a table of recommendations for how to repair AN inconsistencies.
     */
    private static void initANRemedies() {

        /*
         * The Admin database is given the highest priority.
         *
         *   AN is on SN
         * topo|config|JEHA|  remedy
         * --------------------------------------------------------------
         *  T     T     T     no problems
         *  T     T     F                     Arbiter will add itself to the JE
         *                                    HA group automatically on start
         *  T     F     T                     call sna.createArbNode
         *  T     F     F                     call sna.createArbNode
         *  F     T     T                     remove from SN
         *  F     T     F          remove from this SN, call delete
         *  F     F     T
         *  F     F     T                    Arbiter needs to be removed from
         *                                   JE HA group by topology repair
         *  F     F     F     no problems    none
         */

        /* All three inputs (AdminDB, SN config, JE HA) are available */
        addANRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.HERE,
                                        JEHA_STATUS.HERE),
                    OkayRemedy.FACTORY);
        /* AN has not joined the group yet. */
        addANRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.HERE,
                                        JEHA_STATUS.GONE),
                    OkayRemedy.FACTORY);
        addANRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.GONE,
                                        JEHA_STATUS.HERE),
                    CreateRNRemedy.FACTORY);
        addANRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.GONE,
                                        JEHA_STATUS.GONE),
                    CreateRNRemedy.FACTORY);
        addANRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.HERE,
                                        JEHA_STATUS.HERE),
                    RemoveRNRemedy.FACTORY);
        addANRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.HERE,
                                        JEHA_STATUS.GONE),
                    RemoveRNRemedy.FACTORY);
        /* This should not happen in normal circumstances because JEHA
         * entry is removed before it is removed from topology.
         */
        addANRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.GONE,
                                        JEHA_STATUS.HERE),
                    NoFixRemedy.FACTORY);
        addANRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.GONE,
                                        JEHA_STATUS.GONE),
                    OkayRemedy.FACTORY);

        /* Needs to be disabled on the other SN */
        addANRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.HERE,
                                        OTHERSN_STATUS.HERE),
                    DisableRNRemedy.FACTORY);

        addANRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.HERE,
                                        OTHERSN_STATUS.GONE),
                    OkayRemedy.FACTORY);

        addANRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.GONE,
                                        OTHERSN_STATUS.HERE),
                    CreateRNRemedy.FACTORY);

        addANRemedy(new RNLocationInput(TOPO_STATUS.HERE,
                                        CONFIG_STATUS.GONE,
                                        OTHERSN_STATUS.GONE),
                  CreateRNRemedy.FACTORY);

        addANRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.HERE,
                                        OTHERSN_STATUS.HERE),
                    RemoveRNRemedy.FACTORY);
        addANRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.HERE,
                                        OTHERSN_STATUS.GONE),
                    RemoveRNRemedy.FACTORY);
        addANRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.GONE,
                                        OTHERSN_STATUS.GONE),
                    OkayRemedy.FACTORY);
        addANRemedy(new RNLocationInput(TOPO_STATUS.GONE,
                                        CONFIG_STATUS.GONE,
                                        OTHERSN_STATUS.HERE),
                    OkayRemedy.FACTORY);
    }


    /**
     * Use this method to add remedies to the RN remedies map, guarding against
     * two remedies for the same set of inputs.
     */
    private static void addRNRemedy(RNLocationInput key,
                                    RNRemedyFactory factory) {
        final RNRemedyFactory oldFactory = RN_REMEDIES.put(key, factory);
        if (oldFactory != null) {
            throw new IllegalStateException("Tried to overwrite remedy " +
                                            key + "/" + oldFactory +
                                            " with " + factory);
        }
    }

    private static void addANRemedy(RNLocationInput key,
                                    RNRemedyFactory factory) {
        final RNRemedyFactory oldFactory = AN_REMEDIES.put(key, factory);
        if (oldFactory != null) {
            throw new IllegalStateException("Tried to overwrite remedy " +
                                            key + "/" + oldFactory +
                                            " with " + factory);
        }
    }

   /**
     * Return all violations which are of a certain type, and are for a
     * specified kind of topology component.
     */
    private  Set filterViolations
                       (VerifyResults results, Class problemClass) {

        Set found = new HashSet<>();
        for (Problem p : results.getViolations()) {
            if (p.getClass().equals(problemClass)) {
                found.add(problemClass.cast(p));
            }
        }
        return found;
    }

    /** Apply all remedies in the list */
    public void applyRemedies(List repairs, AbstractPlan plan) {
        for (Remedy r: repairs) {
            applyRemedy(r, plan);
        }
    }

    /**
     * Given a remedy type, apply a fix.
     * @return true if fix was completed.
     */
    public boolean applyRemedy(Remedy remedy, AbstractPlan plan) {

        logger.log(Level.INFO, "{0} applying {1}", new Object[]{who, remedy});
        if (!remedy.isOkay() && !remedy.canFix()) {
            /* We should have had a way to fix this! */
            logger.log(Level.INFO, "{0} did not act upon {1}",
                       new Object[]{who, this});
            throw new UnsupportedOperationException();
        }
        return remedy.apply(plan);
    }

    /**
     * Fix Admin issues.
     */
    private boolean repairAdmin(FixAdminRemedy remedy, AbstractPlan plan) {
        Admin admin = plan.getAdmin();
        Parameters params = admin.getCurrentParameters();
        Topology topo = admin.getCurrentTopology();
        final AdminId targetId = remedy.getAdminId();
        final AdminParams ap = params.get(targetId);
        try {
            /*
             * Make sure that the AdminDB's params for a given admin are also
             * correctly reflected by the admin group's jeHAGroupDB, and also
             * that the Admin is started up.
             */
            ChangeServiceAddresses.changeAdminHAAddress
                (plan,
                 "repair Admin location for " + targetId,
                 params,
                 targetId);

            LoginManager loginMgr = admin.getLoginManager();
            RegistryUtils regUtils = new RegistryUtils(topo, loginMgr);
            StorageNodeAgentAPI sna =
                regUtils.getStorageNodeAgent(ap.getStorageNodeId());
            sna.createAdmin(ap.getMap());
            return true;
        } catch (OperationFaultException | RemoteException |
                 NotBoundException e) {
            logger.log(Level.INFO, "{0} repair of Admin saw {1}",
                       new Object[]{who, e});
            return false;
        }
    }

    /**
     * Update the admin parameters on the admin associated with the remedy if
     * they differ from the ones in the admin database or if the admin type
     * needs to be changed to match the datacenter.  Does not correct SN or
     * global parameters.
     */
    private boolean repairAdminParams(UpdateAdminParamsRemedy remedy,
                                      AbstractPlan plan) {
        final AdminId targetId = remedy.getAdminId();
        try {
            final State result = UpdateAdminParams.update(plan, null /* task */,
                                                          targetId);
            return result == State.SUCCEEDED;
        } catch (Exception e) {
            logger.log(Level.INFO, "{0} could not update admin params: {1}",
                       new Object[]{who, e});
            return false;
        }
    }

    /**
     * Remove the RN from the admin db and config.xml of this SN.
     */
    private boolean repairWithClearRN(ClearAdminConfigRemedy remedy,
                                      AbstractPlan plan) {
        Admin admin = plan.getAdmin();
        StorageNodeId snId = remedy.getSNId();
        RepNodeId rnId = remedy.getRNId();
        Topology topo = admin.getCurrentTopology();
        if (remedy.getRNLocationInput().presentInSNConfig) {
            logger.log(Level.INFO, "{0} trying to remove {1} from {2} config",
                       new Object[]{who, rnId, snId});
            RegistryUtils regUtils = new RegistryUtils(topo,
                                                       admin.getLoginManager());
            try {
                StorageNodeAgentAPI sna = regUtils.getStorageNodeAgent(snId);
                sna.destroyRepNode(rnId, true /*deleteData*/);
            } catch (NotBoundException | RemoteException re) {
                logger.log(Level.INFO,
                           "{0} couldn''t reach {1} to remove {2} {3}",
                           new Object[]{who, snId, rnId, re});
                return false;
            }
        }

        if (remedy.getRNLocationInput().presentInTopo) {
            topo.remove(rnId);
            admin.saveTopoAndRemoveRN(topo, plan.getDeployedInfo(),
                                      rnId, plan);
            logger.log(Level.INFO,
                      "{0} trying to remove {1} from topo and params",
                      new Object[]{who, rnId});
        }

        return true;
    }

    /**
     * Remove the AN from the admin db and config.xml of this SN.
     */
    private boolean repairWithClearAN(ClearAdminConfigRemedy remedy,
                                      AbstractPlan plan) {
        Admin admin = plan.getAdmin();
        StorageNodeId snId = remedy.getSNId();
        ArbNodeId anId = remedy.getANId();
        Topology topo = admin.getCurrentTopology();
        RepGroupId rgId = new RepGroupId(anId.getGroupId());
        if (remedy.getRNLocationInput().presentInSNConfig) {
            logger.log(Level.INFO, "{0} trying to remove {1} from {2} config",
                       new Object[]{who, anId, snId});
            RegistryUtils regUtils = new RegistryUtils(topo,
                                                       admin.getLoginManager());
            try {
                StorageNodeAgentAPI sna = regUtils.getStorageNodeAgent(snId);
                sna.destroyArbNode(anId, true /*deleteData*/);
            } catch (NotBoundException | RemoteException re) {
                logger.log(Level.INFO,
                           "{0} couldn''t reach {1} to remove {2} {3}",
                           new Object[]{who, snId, anId, re});
                return false;
            }
        }

        if (remedy.getRNLocationInput().presentInTopo) {

            /* Update helper hosts on peers. */
            Parameters params = admin.getCurrentParameters();
            String helpers =
                Utils.findHelpers(anId, params, topo);
            /* See if any peer RNs need their helper hosts updated */
            Set needsUpdate = new HashSet();
            RepGroup rg = topo.get(rgId);
            for (RepNode rn : rg.getRepNodes()) {
                if (!rn.getStorageNodeId().equals(snId)) {
                     RepNodeParams rnp = params.get(rn.getResourceId());
                     if (helperMismatch(rnp.getJEHelperHosts(), helpers)) {
                         RepNodeParams newrnp = new RepNodeParams(rnp);
                         newrnp.setJEHelperHosts(helpers);
                         needsUpdate.add(newrnp);
                     }
                }
            }

            topo.remove(anId);
            logger.log(Level.INFO,
                       "{0} trying to remove {1} from topo and params",
                       new Object[]{who, anId});
            admin.saveTopoAndRemoveAN(topo, plan.getDeployedInfo(),
                                      anId, plan);
            topo = admin.getCurrentTopology();
            if (needsUpdate.size() > 0) {
                admin.saveParams(needsUpdate,
                                 Collections.emptySet(),
                                 Collections.emptySet());
            }

            /* Send topology changes to all nodes.*/
            try {
                if (!Utils.broadcastTopoChangesToRNs(logger,
                                                     topo,
                                                     "remove AN repair " + anId,
                                                     admin.getParams().
                                                     getAdminParams(),
                                                     plan)) {
                    return false;
                }
            } catch (InterruptedException e) {
                return false;
            }
            logger.log(Level.INFO, "{0} removed AN {1}",
                       new Object[]{who, anId});
        }

        return true;
    }

    /**
     * Change the admin db to make this RN refer to the SN which JE HA thinks
     * is correct.  Note that the remedy's oldSNId may be null if we don't know
     * what to revert it to.
     */
    private boolean repairRevert(RevertRNRemedy remedy, AbstractPlan plan) {

        Admin admin = plan.getAdmin();

        /* Change the topology back to the "old" SN */
        final RepNodeId rnId = remedy.getRNId();
        final StorageNodeId oldSNId = remedy.getOldSNId();
        final ArbNodeId anId = remedy.getANId();
        final ResourceId resId = remedy.getResourceId();

        StorageNodeId correctSNId = null;
        String correctJEHAHostPort = null;
        String correctHelpers = null;
        JEHAInfo jeHAInfo = remedy.getJEHAInfo();
        Topology topo = admin.getCurrentTopology();
        Parameters params = admin.getCurrentParameters();

        if (jeHAInfo != null) {
            /*
             * We got a definitive statement from the JEHA repgroup db to
             * determine where the RN should live.
             */
            correctSNId = jeHAInfo.getSNId();
            correctHelpers = jeHAInfo.getHelpers();
            correctJEHAHostPort = jeHAInfo.getHostPort();
        } else if (oldSNId != null) {

            /* We had some other means of determining the proper SN */
            correctSNId = oldSNId;
            PortTracker portTracker = new PortTracker(topo, params, oldSNId);
            int haPort = portTracker.getNextPort(oldSNId);
            String haHostname = params.get(oldSNId).getHAHostname();
            correctJEHAHostPort = HostPortPair.getString(haHostname, haPort);
            correctHelpers = Utils.findHelpers(resId, params, topo);
            correctHelpers += "," + correctJEHAHostPort;
        }

        /* No known correct location - bail */
        if (correctSNId == null) {
            logger.log(Level.INFO, "{0} could not find correct owning SN {1}",
                       new Object[]{who, remedy});
            return false;
        }

        ChangedParams updated = null;
        boolean topoUpdated = false;
        if (rnId != null) {
            RepNode rn = topo.get(rnId);
            if (!rn.getStorageNodeId().equals(correctSNId)) {
                logger.log(Level.INFO, "{0} updating topology so {1} owns {2}",
                           new Object[]{who, correctSNId, rnId});
                RepNode updatedRN = new RepNode(correctSNId);
                RepGroup rg = topo.get(rn.getRepGroupId());
                rg.update(rn.getResourceId(),updatedRN);
                topoUpdated = true;
            }

             /* Revert the RN's params, and any peer params */
            updated =
                correctRNParams(topo,
                                params,
                                rnId,
                                correctSNId,
                                correctJEHAHostPort,
                                correctHelpers,
                                remedy.getNewStorageDir(),
                                correctSNId.equals(remedy.getSNId()),
                                admin.getLoginManager());
        } else {
            /* Work on AN */
            ArbNode an = topo.get(anId);
            if (!an.getStorageNodeId().equals(correctSNId)) {
                logger.log(Level.INFO, "{0} updating topology so {1} owns {2}",
                           new Object[]{who, correctSNId, anId});
                ArbNode updatedAN = new ArbNode(correctSNId);
                RepGroup rg = topo.get(an.getRepGroupId());
                rg.update(an.getResourceId(), updatedAN);
                topoUpdated = true;
            }

            updated =
                correctANParams(topo,
                                params,
                                anId,
                                correctSNId,
                                correctJEHAHostPort,
                                correctHelpers);

        }
        /* See which RNs might need to be prodded to refresh their params */
        boolean peersNeedUpdate = false;
        boolean rnNeedsUpdate = false;
        Set needsUpdate = updated.getRNP();
        for (RepNodeParams updatedRNP : needsUpdate) {
            if (updatedRNP.getRepNodeId().equals(rnId)) {
                rnNeedsUpdate = true;
            } else {
                peersNeedUpdate = true;
            }
        }

        Set anUpdate = updated.getANP();
        boolean anNeedsUpdate = false;
        for (ArbNodeParams updatedANP : anUpdate) {
            if (updatedANP.getArbNodeId().equals(anId)) {
                anNeedsUpdate = true;
            } else {
                peersNeedUpdate = true;
            }
        }

        /* Write the changes to the AdminDB */
        if (topoUpdated) {
            admin.saveTopoAndParams(topo, plan.getDeployedInfo(),
                                    needsUpdate,
                                    Collections.emptySet(),
                                    anUpdate, plan);
            try {
                Utils.broadcastTopoChangesToRNs
                    (logger, admin.getCurrentTopology(),
                     who + " updating topo",
                     admin.getParams().getAdminParams(), plan);
            } catch (InterruptedException e) {
                logger.log(Level.INFO, "{0} couldn''t update topo: {1}",
                           new Object[]{who, e});
                return false;
            }
        } else {
            if (!needsUpdate.isEmpty()) {
                plan.getAdmin().saveParams(
                    needsUpdate, Collections.emptySet(),
                    anUpdate);
            }
        }

        /*
         * Restart the RN on the correct SN, make sure it houses the RN with
         * the correct params.
         */
        if (rnNeedsUpdate) {
            try {
                RelocateRN.startRN(plan, correctSNId, rnId);
            } catch (Exception e) {
                logger.log(Level.INFO, "{0} couldn''t start {1}",
                           new Object[]{who, rnId});
            }
        }

        /*
         * Restart the AN on the correct SN, make sure it houses the AN with
         * the correct params.
         */
        if (anNeedsUpdate) {
            try {
                RelocateAN.createStartAN(plan, correctSNId, anId);
            } catch (Exception e) {
                logger.log(Level.INFO, "{0} couldn''t start {1}",
                           new Object[]{who, anId});
            }
        }

        /* Update params at peers, if needed */
        if (peersNeedUpdate) {
            try {
                Utils.refreshParamsOnPeers(plan, resId);
            } catch (Exception e) {
                logger.log(Level.INFO,
                           "{0} couldn''t update helper hosts at peers", who);
                return false;
            }
        }

        return true;
    }

    /**
     * Start the RN on this SN.
     */
    private boolean repairStartRN(CreateRNRemedy remedy, AbstractPlan plan) {

        try {
            RelocateRN.startRN(plan, remedy.getSNId(), remedy.getRNId());
        } catch (Exception e) {
            return false;
        }
        return true;
    }

    /**
     * Start the AN on this SN.
     */
    private boolean repairStartAN(CreateRNRemedy remedy, AbstractPlan plan) {

        try {
            RelocateAN.createStartAN(plan, remedy.getSNId(), remedy.getANId());
        } catch (Exception e) {
            return false;
        }
        return true;
    }

    /**
     * Remove the RN or AN from this SN.
     */
    private boolean repairRemove(RemoveRNRemedy remedy, AbstractPlan plan) {

        boolean retstatus = false;
        try {
            if (remedy.getRNId() != null) {
                retstatus =
                    Utils.destroyRepNode(plan,
                                         System.currentTimeMillis(),
                                         remedy.getSNId(),
                                         remedy.getRNId());
            } else {
                Admin admin = plan.getAdmin();
                retstatus =
                    Utils.destroyArbNode(admin,
                                         plan,
                                         remedy.getSNId(),
                                         remedy.getANId());
                /* Remove AN from JEHA group */
                if (remedy.getRNLocationInput().presentInJEHA) {
                    JEHAInfo jehainfo = getJEHAInfo(admin, remedy.getANId());
                    if (jehainfo != null &&
                        jehainfo.getSNId().equals(remedy.getSNId())) {
                        Utils.removeHAAddress(
                            admin.getCurrentTopology(),
                            admin.getParams().getAdminParams(),
                            remedy.getANId(), remedy.getSNId(), plan,
                            new RepGroupId(remedy.repGroupId),
                            remedy.getJEHAInfo().groupWideHelperHosts,
                            logger);
                    }
                }
            }
        } catch (InterruptedException e) {
            ResourceId rid =
                remedy.getRNId() != null ? remedy.getRNId() : remedy.getANId();
            logger.log(Level.INFO,
                       "{0} couldn''t remove {1} from {2} because of {3}",
                       new Object[]{who, rid, remedy.getSNId(), e});
        }
        return retstatus;
    }

    /**
     * Update the RN's parameters.
     */
    private boolean repairRNParams(UpdateRNParamsRemedy remedy,
                                   AbstractPlan plan) {
        try {
            final State result =
                UpdateRepNodeParams.update(plan, null /* task */,
                                           remedy.getRNId(), 
                                           true /* do Node type processing */, 
                                           true /* do health check */);
            return result == State.SUCCEEDED;
        } catch (Exception e) {
            logger.log(Level.INFO, "{0} could not update RN params: {1}",
                       new Object[]{who, e});
            return false;
        }
    }

    /**
     * Repair the AN parameters to correct inconsistencies.
     */
    private boolean repairANParams(AbstractPlan plan, ArbNodeId anId) {
        try {
            return repairANParamsInternal(plan, anId);

        } catch (NotBoundException | RemoteException e) {
            logger.log(Level.INFO,
                       "{0} couldn''t correct parameters for{1} because of {2}",
                       new Object[]{who, anId, e});
            return false;
        }
    }

    private boolean repairANParamsInternal(AbstractPlan plan,
                                          ArbNodeId anId)
        throws NotBoundException, RemoteException {

        /* Get admin DB parameters */
        final Admin admin = plan.getAdmin();
        final Parameters dbParams = admin.getCurrentParameters();
        final ArbNodeParams anDbParams = dbParams.get(anId);
        final Topology topo = admin.getCurrentTopology();
        final ArbNode thisRn = topo.get(anId);
        final StorageNodeId snId = thisRn.getStorageNodeId();

        /* Get SN configuration parameters */
        final RegistryUtils regUtils =
            new RegistryUtils(topo, plan.getLoginManager());
        final StorageNodeAgentAPI sna = regUtils.getStorageNodeAgent(snId);
        final LoadParameters configParams = sna.getParams();
        final CompareParamsResult snCompare =
            VerifyConfiguration.compareParams(configParams,
                                              anDbParams.getMap());

        /* Get in-memory parameters from the AN */
        LoadParameters serviceParams = null;
        try {
            final ArbNodeAdminAPI ana = regUtils.getArbNodeAdmin(anId);
            serviceParams = ana.getParams();
        } catch (RemoteException | NotBoundException e) {
            logger.log(Level.INFO, "{0} problem calling {1}: {2}",
                       new Object[]{who, anId, e});
        }

        /*
         * Check if parameters file needs to be updated, if the AN needs to
         * read them, and if the AN needs to be restarted.
         */
        final CompareParamsResult serviceCompare;
        final CompareParamsResult combinedCompare;
        if (serviceParams == null) {
            serviceCompare = CompareParamsResult.NO_DIFFS;
            combinedCompare = snCompare;
        } else {
            serviceCompare = VerifyConfiguration.compareServiceParams(
                snId, anId, serviceParams, dbParams);
            combinedCompare = VerifyConfiguration.combineCompareParamsResults(
                snCompare, serviceCompare);
        }
        if (combinedCompare == CompareParamsResult.MISSING) {
            logger.log(Level.INFO,
                       "{0} couldn''t update parameters for {1} " +
                       "because some parameters were missing",
                       new Object[] {who, anId});
            return false;
        }

        if (combinedCompare == CompareParamsResult.NO_DIFFS) {
            return true;
        }

        if (snCompare != CompareParamsResult.NO_DIFFS) {
            logger.log(Level.INFO, "{0} updating AN config parameters", who);
            sna.newArbNodeParameters(anDbParams.getMap());
        }
        if (serviceCompare == CompareParamsResult.DIFFS) {
            logger.log(Level.INFO, "{0} notify AN of new parameters", who);
            regUtils.getArbNodeAdmin(anId).newParameters();
        } else {

            /* Stop running node in preparation for restarting it */
            if (serviceCompare == CompareParamsResult.DIFFS_RESTART) {

                try {
                    Utils.disableAndStopAN(plan, snId, anId);
                } catch (OperationFaultException e) {
                    throw new CommandFaultException(
                        e.getMessage(), e, ErrorMessage.NOSQL_5400,
                        CommandResult.PLAN_CANCEL);
                }
            }

            /*
             * Restart the node, or start it if it was not running and is
             * not disabled
             */
            if ((serviceCompare == CompareParamsResult.DIFFS_RESTART) ||
                ((serviceParams == null) && !anDbParams.isDisabled())) {
                try {
                    Utils.startAN(plan, snId, anId);
                    Utils.waitForNodeState(plan, anId, ServiceStatus.RUNNING);
                } catch (Exception e) {
                    throw new CommandFaultException(
                        e.getMessage(), e, ErrorMessage.NOSQL_5400,
                        CommandResult.PLAN_CANCEL);
                }
            }
        }
        return true;
    }

    /**
     *
     * @param plan
     * @param resId AN or RN identifier
     * @return true if the fix was applied or not needed, and false if the
     * fix failed or could not be applied
     */
    private boolean repairHelpers(AbstractPlan plan,
                                  ResourceId resId) {
        /* Get admin DB parameters */
        final Admin admin = plan.getAdmin();
        final Parameters dbParams = admin.getCurrentParameters();
        final Topology topo = admin.getCurrentTopology();
        final String topoHelpersAsString =
            Utils.findHelpers(resId, dbParams, topo);
        boolean retStatus = true;
        String oldHelpers = null;

        if (resId instanceof ArbNodeId) {
            ArbNodeId anId = (ArbNodeId)resId;
            ArbNodeParams anp = dbParams.get(anId);
            if (anp == null) {
                return retStatus;
            }
            retStatus =
                Utils.updateHelperHost(admin, topo,
                                       anId, logger);

            try {
                /* Have the AN notice its new params */
                RegistryUtils registry =
                    new RegistryUtils(topo, admin.getLoginManager());
                ArbNodeAdminAPI anAdmin = registry.getArbNodeAdmin(anId);
                anAdmin.newParameters();
            } catch (Exception e) {

            }
        } else {
            RepNodeId rnId = (RepNodeId)resId;
            RepNodeParams rnp = dbParams.get(rnId);
            if (rnp == null) {
                return retStatus;
            }
            retStatus =
                Utils.updateHelperHost(admin, topo,
                                       rnId, logger);
            try {
                /* Have the RN notice its new params */
                RegistryUtils registry =
                    new RegistryUtils(topo, admin.getLoginManager());
                RepNodeAdminAPI rnAdmin = registry.getRepNodeAdmin(rnId);
                rnAdmin.newParameters();
            } catch (Exception e) {
            }
        }
        logger.log(Level.INFO,
                   "{0} repair of helper hosts for {1} old helpers {2} " +
                   "new helpers {3}",
                   new Object[]{who, resId, oldHelpers, topoHelpersAsString});
        return retStatus;
    }

    /**
     * Generate a set of correct RN params for all nodes of this shard. Set the
     * heap/cache, storage directory, and helper hosts correctly.
     * @param loginManager
     */
    private ChangedParams correctRNParams(Topology topo,
                                          Parameters params,
                                          RepNodeId rnId,
                                          StorageNodeId correctSNId,
                                          String correctJEHAHostPort,
                                          String correctHelpers,
                                          StorageDirectory newStorageDir,
                                          boolean correctSNIsNewSN,
                                          LoginManager loginManager) {
        /*
         * Do the params point at the right SN? If not, make a copy of the
         * RepNodeParams and fix its snId, and other attributes.
         */
        RepNodeParams rnp = params.get(rnId);
        RepNodeParams fixedRNP = new RepNodeParams(rnp);
        boolean addFixedParams = false;

        if (!rnp.getStorageNodeId().equals(correctSNId)) {
            fixedRNP.setStorageNodeId(correctSNId);
            Utils.setRNPHeapCacheGC(params.copyPolicies(),
                                    params.get(correctSNId),
                                    fixedRNP,
                                    topo);

            if (correctSNIsNewSN) {
                if (newStorageDir == null) {
                    fixedRNP.setStorageDirectory(null, 0L);
                } else {
                    fixedRNP.setStorageDirectory(newStorageDir.getPath(),
                                                 newStorageDir.getSize());
                }
            } else {
                /* Look in the remote SN config file for the storage dir info*/
                SNServices remoteInfo =
                        readOneSNRemoteParams(topo, correctSNId, loginManager);
                LoadParameters lp = remoteInfo.remoteParams;
                ParameterMap rMap = lp.getMap(rnId.getFullName(),
                                              ParameterState.REPNODE_TYPE);
                RepNodeParams remoteRNP = new RepNodeParams(rMap);
                fixedRNP.setStorageDirectory(remoteRNP.getStorageDirectoryPath(),
                                       remoteRNP.getStorageDirectorySize());
            }
            addFixedParams = true;

            logger.log(Level.INFO,
                       "{0} repair of repNodeParams for {1}/{2} set " +
                       "storagedir {3}",
                       new Object[]{who, correctSNId, rnId,
                                    fixedRNP.getStorageDirectoryPath()});
        }

        return correctCommonParams(params, rnp, addFixedParams, fixedRNP, rnId,
                                   topo, correctJEHAHostPort, correctHelpers);
    }

    private ChangedParams correctCommonParams(Parameters params,
                                              GroupNodeParams commonParams,
                                              boolean addFixedParams,
                                              GroupNodeParams fixedParams,
                                              ResourceId resId,
                                              Topology topo,
                                              String correctJEHAHostPort,
                                              String correctHelpers) {

        Set needUpdate = new HashSet<>();
        Set arbNeedUpdate = new HashSet<>();

        /* Is its HA address correct? */
        if (!commonParams.getJENodeHostPort().equals(correctJEHAHostPort)) {
            fixedParams.setJENodeHostPort(correctJEHAHostPort);
            addFixedParams = true;
        }

        /* Are the helpers correct? */
        if (helperMismatch(commonParams.getJEHelperHosts(), correctHelpers)) {
            fixedParams.setJEHelperHosts(correctHelpers);
            addFixedParams = true;
        }

        /* Note that we always assume that this RN should be enabled */
        if (commonParams.isDisabled()) {
            fixedParams.setDisabled(false);
            addFixedParams = true;
        }

        /* Get the rep group id */
        RepGroupId rgId;
        if (resId.getType() == ResourceType.REP_NODE) {
            rgId = topo.get((RepNodeId)resId).getRepGroupId();
            if (addFixedParams) {
                needUpdate.add((RepNodeParams)fixedParams);
            }
        } else {
            rgId = topo.get((ArbNodeId)resId).getRepGroupId();
            if (addFixedParams) {
                arbNeedUpdate.add((ArbNodeParams)fixedParams);
            }
        }

        /* See if any peer RNs need their helper hosts updated */
        for (RepNode peer : topo.get(rgId).getRepNodes()) {
            if (peer.getResourceId().equals(resId)) {
                continue;
            }
            RepNodeParams peerRNP = params.get(peer.getResourceId());
            if (helperMismatch(peerRNP.getJEHelperHosts(),
                               correctHelpers)) {
                RepNodeParams newRNP = new RepNodeParams(peerRNP);
                newRNP.setJEHelperHosts(correctHelpers);
                needUpdate.add(newRNP);
            }
        }

        /* See if any peer ANs need their helper hosts updated */
        for (ArbNode peer : topo.get(rgId).getArbNodes()) {
            if (peer.getResourceId().equals(resId)) {
                continue;
            }
            ArbNodeParams peerANP = params.get(peer.getResourceId());
            if (helperMismatch(peerANP.getJEHelperHosts(),
                               correctHelpers)) {
                ArbNodeParams newANP = new ArbNodeParams(peerANP);
                newANP.setJEHelperHosts(correctHelpers);
                arbNeedUpdate.add(newANP);
            }
        }

        return new ChangedParams(arbNeedUpdate, needUpdate);
    }

    /**
     * Generate a set of correct AN params for all nodes of this shard.
     */
    private ChangedParams correctANParams(Topology topo,
                                          Parameters params,
                                          ArbNodeId anId,
                                          StorageNodeId correctSNId,
                                          String correctJEHAHostPort,
                                          String correctHelpers) {
        /*
         * Do the params point at the right SN? If not, make a copy of the
         * params and fix its snId, and other attributes.
         */
        ArbNodeParams anp = params.get(anId);
        ArbNodeParams fixedANP = new ArbNodeParams(anp);
        boolean addFixedParams = false;
        if (!anp.getStorageNodeId().equals(correctSNId)) {
            fixedANP.setStorageNodeId(correctSNId);
            addFixedParams = true;

            logger.log(Level.INFO, "{0} repair of arbNodeParams for {1}/{2}",
                       new Object[]{who, correctSNId, anId});
        }

        return correctCommonParams(params, anp, addFixedParams, fixedANP, anId,
                                   topo, correctJEHAHostPort, correctHelpers);
    }

    /**
     * return true if the two helper host lists don't match
     */
    private boolean helperMismatch(String helperListA, String helperListB) {

        List helpersA = ParameterUtils.helpersAsList(helperListA);
        List helpersB = ParameterUtils.helpersAsList(helperListB);

        if (!helpersA.containsAll(helpersB)) {
            /* mismatch */
            return true;
        }

        if (!helpersB.containsAll(helpersA)) {
            /* mismatch */
            return true;
        }

        return false;
    }

    /**
     * Check whether the Admin DB's copy of RepNodeParams and the RN's version
     * match for the given shard, and update the RN if needed. Ignore any
     * connectivity issues; this method should succeed if possible, but should
     * not cause an error if not possible.
     *
     * Since this considers the Admin DB's copy to be authoritative, this
     * should only be used when we are sure that the Admin DB has been
     * previously validated and repaired if required.
     */
    private void ensureAdminDBAndRNParams(AbstractPlan plan,
                                          RepGroupId rgId) {
        Admin admin = plan.getAdmin();
        Topology topo = admin.getCurrentTopology();
        RegistryUtils regUtils = new RegistryUtils(topo,
                                                   admin.getLoginManager());
        Parameters currentParams = admin.getCurrentParameters();

        /* Check all the RNs of the shard */
        for (RepNode rn : topo.get(rgId).getRepNodes()) {
            RepNodeId rnId = rn.getResourceId();
            StorageNodeId snId = rn.getStorageNodeId();
            RepNodeParams rnp = currentParams.get(rnId);
            try {
                RepNodeAdminAPI rna = regUtils.getRepNodeAdmin(rnId);
                LoadParameters remoteParams = rna.getParams();
                if (remoteParams == null) {
                    logger.log(Level.INFO,
                               "{0} admin/rn param check for {1} did not " +
                               "find remote params for {2}",
                               new Object[]{who, plan, rnId});
                    continue;
                }
                ParameterMap remoteCopy =
                    remoteParams.getMapByType(ParameterState.REPNODE_TYPE);
                if (remoteCopy.equals(rnp.getMap())) {
                    /* Nothing to do, they match */
                    continue;
                }
                /* Write new params to the SN */
                StorageNodeAgentAPI sna = regUtils.getStorageNodeAgent(snId);
                sna.newRepNodeParameters(rnp.getMap());

                /* Notify the RN that there are new params. */
                RepNodeAdminAPI rnAdmin = regUtils.getRepNodeAdmin(rnId);
                rnAdmin.newParameters();
            } catch (RemoteException | NotBoundException ignore) {
                logger.log(Level.INFO,
                        "{0} failed to reach {1}/{2} to ensure admin/rn params",
                        new Object[]{who, snId, rnId});
            }
        }
    }

   /**
     * Remove empty shards if this topology has no RNs whatsoever. Used
     * when the initial deploy topology has failed before any RN or
     * partitions have been made.
     * TODO: what if the initial deploy fails because not all RNS could be
     * made? Then verify needs the number of partitions/the target topo
     * to do some fixing. Or just use topo rebalance?
     */
    public void repairInitialEmptyShards(VerifyResults results,
                                         AbstractPlan plan) {
        Set insufficientRNs =
            filterViolations(results, InsufficientRNs.class);
        logger.log(Level.FINE,
                   "{0} : RemoveInitialEmptyShards: insufficientRNs = {1}",
                   new Object[] {who, insufficientRNs});
        if (insufficientRNs.isEmpty()) {
            return;
        }

        /*
         * This is not an initial deployment; some RNs exist. Use topo
         * rebalance to fix the problem.
         */
        Topology currentTopo = plan.getAdmin().getCurrentTopology();
        if (!currentTopo.getRepNodeIds().isEmpty()) {
            logger.log(Level.FINE,
                       "{0} : RemoveInitialEmptyShards: {1} RNs exist, " +
                       "try another repair approach",
                       new Object[] {who, insufficientRNs});
            return;
        }

        /*
         * In general, an insufficient number of RNs means that the rebalance
         * command should be rerun. If there are no RNS at all, then the
         * initial deployment failed, and we can safely assume that there
         * are no underlying JE HA groups anywhere.
         */
        Set shardIds = currentTopo.getRepGroupIds();
        boolean shardsRemoved = false;
        for (RepGroupId rgId : shardIds) {
            currentTopo.remove(rgId);
            shardsRemoved = true;
        }

        if (shardsRemoved) {
            logger.log(Level.INFO, "{0} for {1} removed empty shards {2}",
                       new Object[]{who, plan, shardIds});
            plan.getAdmin().saveTopo(currentTopo,  plan.getDeployedInfo(),
                                     plan);
        }
    }

    /**
     * Encapsulate the information used to choose an Admin fix.
     */
    public static class AdminLocationInput {

        /* The SN which houses this Admin, based on JEHA */
        private final StorageNodeId jeHASNId;

        /* The SN which houses this Admin, based on its AdminParams */
        private final StorageNodeId adminParamsSNId;

        /* true if the adminParamsSNId also has the Admin in its config.xm */
        private final Boolean remoteNewSNCorrect;

        public AdminLocationInput(StorageNodeId jEHASNId,
                                  StorageNodeId adminParamsSNId,
                                  Boolean remoteNewSNCorrect) {
            this.jeHASNId = jEHASNId;
            this.adminParamsSNId = adminParamsSNId;
            this.remoteNewSNCorrect = remoteNewSNCorrect;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime
                    * result
                    + ((adminParamsSNId == null) ? 0
                            : adminParamsSNId.hashCode());
            result = prime * result
                    + ((jeHASNId == null) ? 0 : jeHASNId.hashCode());
            result = prime
                    * result
                    + ((remoteNewSNCorrect == null) ? 0
                            : remoteNewSNCorrect.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null) {
                return false;
            }
            if (!(obj instanceof AdminLocationInput)) {
                return false;
            }
            AdminLocationInput other = (AdminLocationInput) obj;
            if (adminParamsSNId == null) {
                if (other.adminParamsSNId != null) {
                    return false;
                }
            } else if (!adminParamsSNId.equals(other.adminParamsSNId)) {
                return false;
            }
            if (jeHASNId == null) {
                if (other.jeHASNId != null) {
                    return false;
                }
            } else if (!jeHASNId.equals(other.jeHASNId)) {
                return false;
            }
            if (remoteNewSNCorrect == null) {
                if (other.remoteNewSNCorrect != null) {
                    return false;
                }
            } else if (!remoteNewSNCorrect.equals(other.remoteNewSNCorrect)) {
                return false;
            }

            return true;
        }

        @Override
        public String toString() {
            return "AdminLocationInput [jeHASNId=" + jeHASNId
                    + ", adminParamsSNId=" + adminParamsSNId
                    + ", remoteJEHASNCorrect=" + remoteNewSNCorrect + "]";
        }
    }

    /*
     * Use these enums for the input to the RNLocationInput, to avoid confusion
     * from mixing up booleans.
     */
    enum TOPO_STATUS {HERE, GONE}
    enum CONFIG_STATUS {HERE, GONE}
    enum JEHA_STATUS {HERE, GONE}
    enum OTHERSN_STATUS{HERE, GONE}

    /**
     * Encapsulate the information used to choose a RN fix.
     */
    public static class RNLocationInput {

        /* if true, this service is on this SN, according to the topo. */
        private final boolean presentInTopo;
        /* if true, this service is on this SN, according to the config.xml. */
        private final boolean presentInSNConfig;

        /*
         * if we are able to get groupDB info, then jeHAKnown is true. If it's
         * false, then presentInJE HA has no meaning.
         */
        private final boolean jeHAKnown;
        /* if true, this service is on this SN, according to the JEHAGroupDB */
        private final boolean presentInJEHA;

        /* if true, this service is present in another SN config file */
        private final boolean otherSNKnown;
        private final boolean presentInOtherSNConfig;

        /*
         * Use when you know neither the JE HA group info nor what
         * other SNs hold.
         */
        RNLocationInput(TOPO_STATUS topoStatus,
                        CONFIG_STATUS configStatus) {
            this(topoStatus, configStatus, false, JEHA_STATUS.GONE, false,
                 OTHERSN_STATUS.GONE);
        }

        /* Use when you know the JE HA group info. */
        RNLocationInput(TOPO_STATUS topoStatus,
                        CONFIG_STATUS configStatus,
                        JEHA_STATUS jeHAStatus) {
            this(topoStatus, configStatus, true, jeHAStatus, false,
                 OTHERSN_STATUS.GONE);
        }

        /*
         * Use when you don't know the JE HA group info, but know what the
         * other SNs hold.
         */
        RNLocationInput(TOPO_STATUS topoStatus,
                        CONFIG_STATUS configStatus,
                        OTHERSN_STATUS otherSNStatus) {
            this(topoStatus, configStatus, false, JEHA_STATUS.GONE, true,
                 otherSNStatus);
        }

        RNLocationInput(TOPO_STATUS topoStatus,
                        CONFIG_STATUS configStatus,
                        boolean jeHAKnown,
                        JEHA_STATUS jeHAStatus,
                        boolean otherSNKnown,
                        OTHERSN_STATUS otherConfigStatus) {

            this.presentInTopo = topoStatus.equals(TOPO_STATUS.HERE);
            this.presentInSNConfig = configStatus.equals(CONFIG_STATUS.HERE);
            this.jeHAKnown = jeHAKnown;
            this.presentInJEHA = jeHAStatus.equals(JEHA_STATUS.HERE);
            this.otherSNKnown = otherSNKnown;
            this.presentInOtherSNConfig =
                otherConfigStatus.equals(OTHERSN_STATUS.HERE);

        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + (jeHAKnown ? 1231 : 1237);
            result = prime * result + (otherSNKnown ? 1231 : 1237);
            result = prime * result + (presentInJEHA ? 1231 : 1237);
            result = prime * result + (presentInOtherSNConfig ? 1231 : 1237);
            result = prime * result + (presentInSNConfig ? 1231 : 1237);
            result = prime * result + (presentInTopo ? 1231 : 1237);
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null) {
                return false;
            }
            if (!(obj instanceof RNLocationInput)) {
                return false;
            }
            RNLocationInput other = (RNLocationInput) obj;
            if (jeHAKnown != other.jeHAKnown) {
                return false;
            }
            if (otherSNKnown != other.otherSNKnown) {
                return false;
            }
            if (presentInJEHA != other.presentInJEHA) {
                return false;
            }
            if (presentInOtherSNConfig != other.presentInOtherSNConfig) {
                return false;
            }
            if (presentInSNConfig != other.presentInSNConfig) {
                return false;
            }
            if (presentInTopo != other.presentInTopo) {
                return false;
            }
            return true;
        }

        @Override
        public String toString() {
            return "LocationInput [presentInTopo=" + presentInTopo
                + ", presentInSNConfig=" + presentInSNConfig
                + ", jeHAKnown=" + jeHAKnown + ", presentInJEHA="
                + presentInJEHA + ", otherSNKnown=" + otherSNKnown
                + ", presentInOtherSNConfig=" + presentInOtherSNConfig
                + "]";
        }
    }

    /** Provides a remedy for a problem. */
    public abstract static class Remedy {

        Remedy() { }

        /** Returns the resource associated with this remedy. */
        abstract ResourceId getResourceId();

        /** Describes this remedy. */
        abstract String problemDescription();

        /**
         * Returns whether the situation associated with this remedy is OK and
         * there was actually no problem.
         */
        public boolean isOkay() { return false; }

        /**
         * Returns whether there was a problem that can be fixed automatically.
         * This method returns false if the was no problem.
         */
        boolean canFix() { return false; }

        /**
         * Fixes the problem.
         *
         * @param plan the plan performing the fix
         * @return true if the fix was applied or not needed, and false if the
         * fix failed or could not be applied
         * @throws UnsupportedOperationException if this is a problem that
         * cannot be fixed automatically
         */
        abstract boolean apply(AbstractPlan plan);

        /**
         * Adds information to the toString result about additional subclass
         * fields.
         */
        abstract void toStringInternal(StringBuilder builder);

        /** Returns a simple name for the type of the remedy. */
        String remedyType() { return getClass().getSimpleName(); }

        @Override
        public final String toString() {
            final StringBuilder builder = new StringBuilder();
            builder.append("Remedy [");
            builder.append("remedyType=").append(remedyType());
            toStringInternal(builder);
            builder.append("]");
            return builder.toString();
        }
    }

    /** A factory for creating remedies for RN or AN problems. */
    abstract static class RNRemedyFactory {
        abstract Remedy createRemedy(
            TopologyCheck topoCheck, RNLocationInput rnLocationInput,
            StorageNodeId snId, ResourceId resId, JEHAInfo jeHAInfo,
            StorageNodeId oldSNId, StorageDirectory storageDirectory);
    }

    /** Situation is OK, no fix is needed. */
    public static class OkayRemedy extends Remedy {
        static final RNRemedyFactory FACTORY = new RNRemedyFactory() {
            @Override
            OkayRemedy createRemedy(
                TopologyCheck topoCheck, RNLocationInput rnLocationInput,
                StorageNodeId snId, ResourceId resId, JEHAInfo jeHAInfo,
                StorageNodeId oldSNId, StorageDirectory storageDirectory) {
                return new OkayRemedy(resId, jeHAInfo);
            }
        };
        private final ResourceId resourceId;
        private final JEHAInfo jeHAInfo;
        OkayRemedy(ResourceId resourceId, JEHAInfo jeHAInfo) {
            checkNull("resourceId", resourceId);
            this.resourceId = resourceId;
            this.jeHAInfo = jeHAInfo;
        }
        @Override
        ResourceId getResourceId() { return resourceId; }
        @Override
        String problemDescription() { return "No problem with " + resourceId; }
        @Override
        public boolean isOkay() { return true; }
        @Override
        boolean apply(AbstractPlan plan) { return true; }
        @Override
        void toStringInternal(StringBuilder builder) {
            builder.append(", resourceId=").append(resourceId)
                .append(", jeHAInfo=").append(jeHAInfo);
        }
        public JEHAInfo getJEHAInfo() { return jeHAInfo; }
    }

    /** Manual work needed. */
    static class NoFixRemedy extends Remedy {
        static final RNRemedyFactory FACTORY = new RNRemedyFactory() {
            @Override
            NoFixRemedy createRemedy(
                TopologyCheck topoCheck, RNLocationInput rnLocationInput,
                StorageNodeId snId, ResourceId resId, JEHAInfo jeHAInfo,
                StorageNodeId oldSNId, StorageDirectory storageDirectory) {
                return new NoFixRemedy(rnLocationInput, snId, resId, jeHAInfo,
                                       oldSNId);
            }
        };
        private final RNLocationInput rnLocationInput;
        private final StorageNodeId snId;
        private final ResourceId resourceId;
        private final JEHAInfo jeHAInfo;
        private final StorageNodeId oldSNId;
        NoFixRemedy(ResourceId resourceId) {
            this(null, null, resourceId, null, null);
        }
        NoFixRemedy(RNLocationInput rnLocationInput, StorageNodeId snId,
                    ResourceId resourceId, JEHAInfo jeHAInfo,
                    StorageNodeId oldSNId) {
            checkNull("resourceId", resourceId);
            this.rnLocationInput = rnLocationInput;
            this.snId = snId;
            this.resourceId = resourceId;
            this.jeHAInfo = jeHAInfo;
            this.oldSNId = oldSNId;
        }
        @Override
        ResourceId getResourceId() { return resourceId; }
        @Override
        String problemDescription() {
            return "No automatic fix available for problem with " + resourceId;
        }
        @Override
        boolean apply(AbstractPlan plan) {
            throw new UnsupportedOperationException();
        }
        @Override
        void toStringInternal(StringBuilder builder) {
            if (rnLocationInput != null) {
                builder.append(", rnLocationInput=").append(rnLocationInput);
            }
            if (snId != null) {
                builder.append(", snId=").append(snId);
            }
            builder.append(", resourceId=").append(resourceId);
            if (jeHAInfo != null) {
                builder.append(", jeHAInfo=").append(jeHAInfo);
            }
            if (oldSNId != null) {
                builder.append(", oldSNId=").append(oldSNId);
            }
        }
    }

    /** User must run plan repair-topology. */
    static class RunRepairRemedy extends Remedy {
        private final ResourceId resourceId;
        RunRepairRemedy(ResourceId resourceId) {
            checkNull("resourceId", resourceId);
            this.resourceId = resourceId;
        }
        @Override
        ResourceId getResourceId() { return resourceId; }
        @Override
        String problemDescription() {
            return "Please run plan repair-topology to fix inconsistent" +
                " location metadata";
        }
        @Override
        boolean apply(AbstractPlan plan) {
            throw new UnsupportedOperationException();
        }
        @Override
        void toStringInternal(StringBuilder builder) {
            builder.append(", resourceId=").append(resourceId);
        }
    }

    /** Fix an RN problem. */
    abstract static class RNRemedy extends Remedy {
        final TopologyCheck topoCheck;
        final RNLocationInput rNLocationInput;
        final StorageNodeId snId;
        final ResourceId resId;
        final JEHAInfo jeHAInfo;
        final int repGroupId;

        RNRemedy(TopologyCheck topoCheck, RNLocationInput rNLocationInput,
                 StorageNodeId snId, ResourceId resId, JEHAInfo jeHAInfo) {
            this.topoCheck = topoCheck;
            this.rNLocationInput = rNLocationInput;
            this.snId = snId;
            checkNull("resId", resId);
            this.resId = resId;
            this.jeHAInfo = jeHAInfo;
            repGroupId = Utils.getRepGroupId(resId).getGroupId();
        }
        @Override
        ResourceId getResourceId() { return resId; }
        @Override
        final boolean apply(AbstractPlan plan) {
            final boolean result = applyInternal(plan);

            /*
             * Update admin DB and params if the fix was successful and things
             * weren't already OK
             */
            if (result && !isOkay()) {
                topoCheck.ensureAdminDBAndRNParams(
                    plan, new RepGroupId(repGroupId));
            }
            return result;
        }
        abstract boolean applyInternal(AbstractPlan plan);
        RNLocationInput getRNLocationInput() { return rNLocationInput; }
        StorageNodeId getSNId() { return snId; }

        RepNodeId getRNId() {
            return resId.getType().isRepNode() ? (RepNodeId)resId : null;
        }

        ArbNodeId getANId() {
            return resId.getType().isArbNode() ? (ArbNodeId)resId : null;
        }

        JEHAInfo getJEHAInfo() { return jeHAInfo; }
        @Override
        void toStringInternal(StringBuilder builder) {
            if (jeHAInfo != null) {
                builder.append(", jeHAInfo=").append(jeHAInfo);
            }
            if (snId != null) {
                builder.append(", snId=").append(snId);
            }
            builder.append(", resId=").append(resId);
            if (rNLocationInput != null) {
                builder.append(", rNLocationInput=").append(rNLocationInput);
            }
        }
    }

    /** Remove RN from topo/params for this SN. */
    static class ClearAdminConfigRemedy extends RNRemedy {
        final static RNRemedyFactory FACTORY = new RNRemedyFactory() {
            @Override
            ClearAdminConfigRemedy createRemedy(
                TopologyCheck topoCheck, RNLocationInput rnLocationInput,
                StorageNodeId snId, ResourceId resId, JEHAInfo jeHAInfo,
                StorageNodeId oldSNId, StorageDirectory storageDirectory) {
                return new ClearAdminConfigRemedy(
                    topoCheck, rnLocationInput, snId, resId, jeHAInfo);
            }
        };
        ClearAdminConfigRemedy(TopologyCheck topoCheck,
                               RNLocationInput rnLocationInput,
                               StorageNodeId snId, ResourceId resId,
                               JEHAInfo jeHAInfo) {
            super(topoCheck, rnLocationInput, snId, resId, jeHAInfo);
        }
        @Override
        String problemDescription() {
            return resId + " is present in Admin metadata and on " + snId +
                " configuration but has not been created. Must be removed" +
                " from metadata";
        }
        @Override
        boolean canFix() { return true; }
        @Override
        boolean applyInternal(AbstractPlan plan) {
            if (resId.getType().isRepNode()) {
                return topoCheck.repairWithClearRN(this, plan);
            }
            return topoCheck.repairWithClearAN(this, plan);
        }
    }

    /** Tell an SN to create or start an RN. */
    static class CreateRNRemedy extends RNRemedy {
        final static RNRemedyFactory FACTORY = new RNRemedyFactory() {
            @Override
            CreateRNRemedy createRemedy(
                TopologyCheck topoCheck, RNLocationInput rnLocationInput,
                StorageNodeId snId, ResourceId resId, JEHAInfo jeHAInfo,
                StorageNodeId oldSNId, StorageDirectory storageDirectory) {
                return new CreateRNRemedy(
                    topoCheck, rnLocationInput, snId, resId, jeHAInfo);
            }
        };
        CreateRNRemedy(TopologyCheck topoCheck,
                       RNLocationInput rnLocationInput, StorageNodeId snId,
                       ResourceId resId, JEHAInfo jeHAInfo) {
            super(topoCheck, rnLocationInput, snId, resId, jeHAInfo);
        }
        @Override
        String problemDescription() {
            return "Must create or start " + resId + " on this SN.";
        }
        @Override
        boolean canFix() { return true; }
        @Override
        boolean applyInternal(AbstractPlan plan) {
            if (resId.getType().isRepNode()) {
                return topoCheck.repairStartRN(this, plan);
            }
            return topoCheck.repairStartAN(this, plan);
        }
    }

    /** User should run plan stop-service. */
    static class DisableRNRemedy extends RNRemedy {
        static final RNRemedyFactory FACTORY = new RNRemedyFactory() {
            @Override
            DisableRNRemedy createRemedy(
                TopologyCheck topoCheck, RNLocationInput rnLocationInput,
                StorageNodeId snId, ResourceId resId, JEHAInfo jeHAInfo,
                StorageNodeId oldSNId, StorageDirectory storageDirectory) {
                return new DisableRNRemedy(
                    topoCheck, rnLocationInput, snId, resId, jeHAInfo);
            }
        };
        DisableRNRemedy(TopologyCheck topoCheck,
                        RNLocationInput rnLocationInput, StorageNodeId snId,
                        ResourceId resId, JEHAInfo jeHAInfo) {
            super(topoCheck, rnLocationInput, snId, resId, jeHAInfo);
        }
        @Override
        String problemDescription() {
            return resId + " should be stopped and disabled on " + snId;
        }
        @Override
        boolean applyInternal(AbstractPlan plan) {
            throw new UnsupportedOperationException();
        }
    }

    /** Remove RN from this SN. */
    static class RemoveRNRemedy extends RNRemedy {
        static final RNRemedyFactory FACTORY = new RNRemedyFactory() {
            @Override
            RemoveRNRemedy createRemedy(
                TopologyCheck topoCheck, RNLocationInput rnLocationInput,
                StorageNodeId snId, ResourceId resId, JEHAInfo jeHAInfo,
                StorageNodeId oldSNId, StorageDirectory storageDirectory) {
                return new RemoveRNRemedy(
                    topoCheck, rnLocationInput, snId, resId, jeHAInfo);
            }
        };
        RemoveRNRemedy(TopologyCheck topoCheck,
                       RNLocationInput rnLocationInput, StorageNodeId snId,
                       ResourceId resId, JEHAInfo jeHAInfo) {
            super(topoCheck, rnLocationInput, snId, resId, jeHAInfo);
        }
        @Override
        String problemDescription() {
            return resId + " must be removed from " + snId;
        }
        @Override
        boolean canFix() { return true; }
        @Override
        boolean applyInternal(AbstractPlan plan) {
            return topoCheck.repairRemove(this, plan);
        }
    }

    /**
     * Revert RN from new SN back to old SN, or clean up for it to stay at the
     * new SN.
     */
    /* TODO: Maybe have a separate remedy for leaving the RN in the new SN? */
    static class RevertRNRemedy extends RNRemedy {
        static final RNRemedyFactory FACTORY = new RNRemedyFactory() {
            @Override
            RevertRNRemedy createRemedy(
                TopologyCheck topoCheck, RNLocationInput rnLocationInput,
                StorageNodeId snId, ResourceId resId, JEHAInfo jeHAInfo,
                StorageNodeId oldSNId, StorageDirectory newStorageDirectory) {
                return new RevertRNRemedy(
                    topoCheck, rnLocationInput, snId, resId, jeHAInfo,
                    oldSNId, newStorageDirectory);
            }
        };
        final StorageNodeId oldSNId;
        final StorageDirectory newStorageDirectory;
        RevertRNRemedy(TopologyCheck topoCheck,
                       RNLocationInput rnLocationInput, StorageNodeId snId,
                       ResourceId resId, JEHAInfo jeHAInfo,
                       StorageNodeId oldSNId,
                       StorageDirectory newStorageDirectory) {
            super(topoCheck, rnLocationInput, snId, resId, jeHAInfo);
            this.oldSNId = oldSNId;
            this.newStorageDirectory = newStorageDirectory;
        }
        @Override
        String problemDescription() {
            return resId + " must be moved back to its original hosting SN";
        }
        @Override
        boolean canFix() { return true; }
        @Override
        boolean applyInternal(AbstractPlan plan) {
            return topoCheck.repairRevert(this, plan);
        }
        @Override
        void toStringInternal(StringBuilder builder) {
            super.toStringInternal(builder);
            if (oldSNId != null) {
                builder.append(", oldSNId=").append(oldSNId);
            }
            if (newStorageDirectory != null) {
                builder.append(", newStorageDirPath=").
                        append(newStorageDirectory.getPath());
                builder.append(", newStorageDirSize=").
                        append(newStorageDirectory.getSize());
            }
        }
        StorageNodeId getOldSNId() {
            return oldSNId;
        }

        StorageDirectory getNewStorageDir() {
            return newStorageDirectory;
        }
    }

    /**
     * Update the RN parameters to fix differences or if the node type needs to
     * be changed to match the zone.
     */
    public static class UpdateRNParamsRemedy extends RNRemedy {
        public UpdateRNParamsRemedy(TopologyCheck topoCheck, RepNodeId rnId) {
            super(topoCheck, null, null, rnId, null);
        }
        @Override
        String problemDescription() {
            return "Change " + resId + " parameters to match saved values or" +
                " its zone type";
        }
        @Override
        boolean canFix() { return true; }
        @Override
        boolean applyInternal(AbstractPlan plan) {
            return topoCheck.repairRNParams(this, plan);
        }
    }

    /**
     * Update the AN parameters to fix differences.
     */
    public static class UpdateANParamsRemedy extends RNRemedy {
        public UpdateANParamsRemedy(TopologyCheck topoCheck, ArbNodeId anId) {
            super(topoCheck, null, null, anId, null);
        }
        @Override
        String problemDescription() {
            return "Change " + resId + " parameters to match saved values.";
        }
        @Override
        boolean canFix() { return true; }
        @Override
        boolean applyInternal(AbstractPlan plan) {
            return topoCheck.repairANParams(plan, (ArbNodeId)resId);
        }
    }

    /** Fix an admin problem. */
    abstract static class AdminRemedy extends Remedy {
        final TopologyCheck topoCheck;
        final AdminLocationInput adminLocationInput;
        final AdminId adminId;
        final JEHAInfo jeHAInfo;
        AdminRemedy(TopologyCheck topoCheck,
                    AdminLocationInput adminLocationInput, AdminId adminId,
                    JEHAInfo jeHAInfo) {
            this.topoCheck = topoCheck;
            this.adminLocationInput = adminLocationInput;
            checkNull("adminId", adminId);
            this.adminId = adminId;
            this.jeHAInfo = jeHAInfo;
        }
        @Override
        ResourceId getResourceId() { return adminId; }
        @Override
        void toStringInternal(StringBuilder builder) {
            if (jeHAInfo != null) {
                builder.append(", jeHAInfo=").append(jeHAInfo);
            }
            if (adminLocationInput != null) {
                builder.append(", adminLocationInput=")
                       .append(adminLocationInput);
            }
            builder.append(", adminId=").append(adminId);
        }
        AdminId getAdminId() { return adminId; }
    }

    /** Make the Admin's JE HA location consistent. */
    static class FixAdminRemedy extends AdminRemedy {
        FixAdminRemedy(TopologyCheck topoCheck,
                       AdminLocationInput adminLocationInput,
                       AdminId adminId, JEHAInfo jeHAInfo) {
            super(topoCheck, adminLocationInput, adminId, jeHAInfo);
        }
        @Override
        String problemDescription() {
            return "Ensure that the Admin's location metadata is consistent";
        }
        @Override
        boolean canFix() { return true; }
        @Override
        boolean apply(AbstractPlan plan) {
            return topoCheck.repairAdmin(this, plan);
        }
    }

    /**
     * Update the admin parameters to fix differences or if the admin type
     * needs to be changed to match the datacenter.
     */
    public static class UpdateAdminParamsRemedy extends AdminRemedy {
        public UpdateAdminParamsRemedy(TopologyCheck topoCheck,
                                       AdminId adminId) {
            super(topoCheck, null, adminId, null);
        }
        @Override
        String problemDescription() {
            return "Change " + adminId + " parameters to match saved values" +
                " or its zone type";
        }
        @Override
        boolean canFix() { return true; }
        @Override
        boolean apply(AbstractPlan plan) {
            return topoCheck.repairAdminParams(this, plan);
        }
    }

   /**
    * Repairs the helper hosts to match the RNs and ANs specified in the
    * topology.
    */
    public static class TopoHelperRemedy extends Remedy {
        final TopologyCheck topoCheck;
        private final ResourceId resourceId;
        public TopoHelperRemedy(TopologyCheck topoCheck,
                                ResourceId resourceId ) {
            checkNull("resourceId", resourceId);
            this.resourceId = resourceId;
            this.topoCheck = topoCheck;
        }

        @Override
        ResourceId getResourceId() { return resourceId; }
        @Override
        String problemDescription() {
            return "Helper parameters do not "+
                   "match Topology for " + resourceId; }
        @Override
        boolean apply(AbstractPlan plan) {
            return topoCheck.repairHelpers(plan, resourceId);
        }
        @Override
        boolean canFix() { return true; }
        @Override
        void toStringInternal(StringBuilder builder) {
            builder.append(", resourceId=").append(resourceId);
        }
    }

    /**
     * Process an SN's config.xml - to generate a list of the services it
     * thinks it hosts.
     */
    private SNServices processRemoteInfo(StorageNodeId snId,
                                         LoadParameters remoteParams) {

        /* Find all the RNs that are present in the SN's config file */
        List rnMaps =
            remoteParams.getAllMaps(ParameterState.REPNODE_TYPE);

        Set allRNs = new HashSet<>();
        for (ParameterMap map : rnMaps) {
            RepNodeId rnId = RepNodeId.parse(map.getName());
            allRNs.add(rnId);
        }

        /* Find all the Admins that are present in the SN's config file.*/
        ParameterMap adminMap =
            remoteParams.getMapByType(ParameterState.ADMIN_TYPE);
        AdminId aid = null;
        if (adminMap != null) {
            aid = new AdminId(adminMap.getOrZeroInt(ParameterState.AP_ID));
        }

        List arbMaps =
            remoteParams.getAllMaps(ParameterState.ARBNODE_TYPE);

        Set allARBs = new HashSet<>();
        for (ParameterMap map : arbMaps) {
            ArbNodeId arbId = ArbNodeId.parse(map.getName());
            allARBs.add(arbId);
        }

        return new SNServices(snId, allRNs, allARBs, aid, remoteParams);
    }

    /**
     * Info derived from the JEHA group db, about a node's hostname/port
     * and its peers.
     */
    public static class JEHAInfo {
        private final StorageNodeId translatedSNId;
        private final ReplicationNode jeReplicationNode;
        private final String groupWideHelperHosts;

        JEHAInfo(StorageNodeId translatedSNId,
                 ReplicationNode jeReplicationNode,
                 String groupWideHelperHosts) {
            this.translatedSNId = translatedSNId;
            this.jeReplicationNode = jeReplicationNode;
            this.groupWideHelperHosts = groupWideHelperHosts;
        }

        public StorageNodeId getSNId() {
            return translatedSNId;
        }

        String getHostPort() {
            return HostPortPair.getString(jeReplicationNode.getHostName(),
                                          jeReplicationNode.getPort());
        }

        String getHelpers() {
            return groupWideHelperHosts;
        }

        @Override
        public String toString() {
            return "JE derivedSN = " + translatedSNId +
                " RepNode=" + jeReplicationNode +
                " helpers=" + groupWideHelperHosts;
        }
    }

    class ChangedParams {
        private final Set anParams;
        private final Set rnParams;
        ChangedParams(Set anp, Set rnp) {
            anParams = anp;
            rnParams = rnp;
        }
        Set getANP() {
            return anParams;
        }
        Set getRNP() {
            return rnParams;
        }

    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy