org.voltcore.agreement.MeshArbiter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of voltdb Show documentation
VoltDB server libraries
There is a newer version: 10.1.1
/* This file is part of VoltDB.
 * Copyright (C) 2008-2020 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see .
 */

package org.voltcore.agreement;

import static com.google_voltpatches.common.base.Predicates.equalTo;
import static com.google_voltpatches.common.base.Predicates.not;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.voltcore.logging.Level;
import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.FaultMessage;
import org.voltcore.messaging.Mailbox;
import org.voltcore.messaging.SiteFailureForwardMessage;
import org.voltcore.messaging.SiteFailureMessage;
import org.voltcore.messaging.Subject;
import org.voltcore.messaging.VoltMessage;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.Pair;
import org.voltcore.utils.RateLimitedLogger;
import org.voltdb.VoltDB;

import com.google_voltpatches.common.collect.ImmutableMap;
import com.google_voltpatches.common.collect.Lists;
import com.google_voltpatches.common.collect.Maps;
import com.google_voltpatches.common.collect.Sets;
import com.google_voltpatches.common.primitives.Longs;

public class MeshArbiter {

    protected static final int FORWARD_STALL_COUNT = 20 * 5; // 5 seconds

    protected static final VoltLogger REJOIN_LOGGER = new VoltLogger("REJOIN");
    private static final long LOGGING_START = 10000;

    protected static final Subject [] justFailures = new Subject [] { Subject.FAILURE };
    protected static final Subject [] receiveSubjects = new Subject [] {
        Subject.FAILURE,
        Subject.SITE_FAILURE_UPDATE,
        Subject.SITE_FAILURE_FORWARD
    };

    /**
     * During arbitration this map keys contain failed sites we are seeking
     * resolution for, and the values indicate whether or not the fault was
     * witnessed directly or relayed by others
     */
    protected final Map m_inTrouble = Maps.newTreeMap();
    /**
     * The invoking agreement site hsid
     */
    protected final long m_hsId;
    protected final Mailbox m_mailbox;
    /**
     * Companion interface that aides in pinging, and getting safe site zookeeper
     * transaction ids
     */
    protected final MeshAide m_meshAide;
    /**
     * A map whree the keys describe graph links between alive sites and
     * sites listed in the {@link #m_inTrouble} map, and the values are
     * the safe zookeeper transaction ids reported by alive sites
     */
    protected final HashMap, Long> m_failedSitesLedger =
            Maps.newHashMap();

    protected final Map m_decidedSurvivors = Maps.newHashMap();
    protected final List m_localHistoricDecisions = Lists.newLinkedList();

    /**
     * Historic list of failed sites
     */
    protected final Set m_failedSites = Sets.newTreeSet();

    protected final Map m_forwardCandidates = Maps.newHashMap();
    /**
     * it builds mesh graphs, and determines the the kill set to resolve
     * an arbitration
     */
    protected final AgreementSeeker m_seeker;

    /**
     * useful when probing the state of this mesh arbiter
     */
    protected volatile int m_inTroubleCount = 0;
    /**
     * useful when probing the state of this mesh arbiter. Each
     * resolved arbitration increments this counter
     */
    protected volatile int m_failedSitesCount = 0;

    public MeshArbiter(final long hsId, final Mailbox mailbox,
            final MeshAide meshAide) {

        m_hsId  = hsId;
        m_mailbox = mailbox;
        m_meshAide = meshAide;
        m_seeker = new AgreementSeeker(ArbitrationStrategy.MATCHING_CARDINALITY, m_hsId);
    }

    public boolean isInArbitration() {
        return m_inTroubleCount > 0;
    }

    public int getFailedSitesCount() {
        return m_failedSitesCount;
    }

    enum Discard {
        Suicide {
            @Override
            void log(FaultMessage fm) {
                REJOIN_LOGGER.info("Agreement, Discarding " + name()
                        + " reporter: "
                        + CoreUtils.hsIdToString(fm.reportingSite));
            }
        },
        AlreadyFailed {
            @Override
            void log(FaultMessage fm) {
                REJOIN_LOGGER.info("Agreement, Discarding " + name() + " "
                        + CoreUtils.hsIdToString(fm.failedSite));
            }
        },
        ReporterFailed {
            @Override
            void log(FaultMessage fm) {
                REJOIN_LOGGER.info("Agreement, Discarding " + name() + " "
                        + CoreUtils.hsIdToString(fm.reportingSite));
            }
        },
        Unknown {
            @Override
            void log(FaultMessage fm) {
                REJOIN_LOGGER.info("Agreement, Discarding " + name() + " "
                        + CoreUtils.hsIdToString(fm.failedSite));
            }
        },
        ReporterUnknown {
            @Override
            void log(FaultMessage fm) {
                REJOIN_LOGGER.info("Agreement, Discarding " + name() + " "
                        + CoreUtils.hsIdToString(fm.reportingSite));
            }
        },
        ReporterWitnessed {
            @Override
            void log(FaultMessage fm) {
                REJOIN_LOGGER.info("Agreement, Discarding " + name() + " "
                        + CoreUtils.hsIdToString(fm.reportingSite));
            }
        },
        SelfUnwitnessed {
            @Override
            void log(FaultMessage fm) {
                REJOIN_LOGGER.info("Agreement, Discarding " + name() + " "
                        + CoreUtils.hsIdToString(fm.failedSite));
            }
        },
        AlreadyKnow {
            @Override
            void log(FaultMessage fm) {
                REJOIN_LOGGER.info("Agreement, Discarding " + name() + " "
                        + CoreUtils.hsIdToString(fm.failedSite)
                        + " reporter: "
                        + CoreUtils.hsIdToString(fm.reportingSite)
                        + (fm.decided ? " decided: true" : ""));
            }
        },
        OtherUnwitnessed {
            @Override
            void log(FaultMessage fm) {
                REJOIN_LOGGER.info("Agreement, Discarding " + name()
                        + " other: "
                        + CoreUtils.hsIdToString(fm.failedSite)
                        + ", repoter: "
                        + CoreUtils.hsIdToString(fm.reportingSite)
                        + ", survivors: ["
                        + CoreUtils.hsIdCollectionToString(fm.survivors)
                        + "]");
            }
        },
        SoleSurvivor {
            @Override
            void log(FaultMessage fm) {
                REJOIN_LOGGER.info("Agreement, Discarding " + name()
                        + " repoter: "
                        + CoreUtils.hsIdToString(fm.reportingSite));
            }
        },
        DoNot {
            @Override
            void log(FaultMessage fm) {
            }
        };

        abstract void log(FaultMessage fm);
    }

    protected Discard mayIgnore(Set hsIds, FaultMessage fm) {
        Boolean alreadyWitnessed = m_inTrouble.get(fm.failedSite);

        if (fm.failedSite == m_hsId) {
            return Discard.Suicide;
        } else if (m_failedSites.contains(fm.failedSite)) {
            return Discard.AlreadyFailed;
        } else if (!hsIds.contains(fm.failedSite)) {
            return Discard.Unknown;
        } else if (m_failedSites.contains(fm.reportingSite)) {
            return Discard.ReporterFailed;
        } else if (!hsIds.contains(fm.reportingSite)) {
            return Discard.ReporterUnknown;
        } else if (fm.isSoleSurvivor()) {
            return Discard.SoleSurvivor;
        } else if (Boolean.TRUE.equals(m_inTrouble.get(fm.reportingSite))) {
            return Discard.ReporterWitnessed;
        } else if (!fm.witnessed && fm.reportingSite == m_hsId) {
            return Discard.SelfUnwitnessed;
        } else if (   alreadyWitnessed != null
                   && (   alreadyWitnessed
                       || alreadyWitnessed == (fm.witnessed || fm.decided))) {
            return Discard.AlreadyKnow;
        } else if (fm.survivors.contains(fm.failedSite)) {
            /*
             * by the time we get here we fm.failedSite is
             * within our survivors: not among failed, and among
             * hsids (not(not(among hsids)))
             */
            return Discard.OtherUnwitnessed;
        } else {
            return Discard.DoNot;
        }
    }

    /**
     * Convenience wrapper for tests that don't care about unknown sites
     */
    Map reconfigureOnFault(Set hsIds, FaultMessage fm) {
        return reconfigureOnFault(hsIds, fm, new HashSet());
    }

    /**
     * Process the fault message, and if necessary start arbitration.
     * @param hsIds pre-failure mesh ids
     * @param fm a {@link FaultMessage}
     * @param unknownFaultedSites Sites that we don't know about, but are informed
     * have failed; tracked here so that we can remove the associated hosts
     * @return a map where the keys are the sites we need to disconnect from, and
     *   the values the last know safe zookeeper transaction ids for the sites
     *   we need to disconnect from. A map with entries indicate that an
     *   arbitration resolutions has been reached, while a map without entries
     *   indicate either a stale message, or that an agreement has not been
     *   reached
     */
    public Map reconfigureOnFault(Set hsIds, FaultMessage fm, Set unknownFaultedSites) {
        boolean proceed = false;
        long blockedOnReceiveStart = System.currentTimeMillis();
        long lastReportTime = 0;
        do {
            Discard ignoreIt = mayIgnore(hsIds, fm);
            if (Discard.DoNot == ignoreIt) {
                m_inTrouble.put(fm.failedSite, fm.witnessed || fm.decided);
                REJOIN_LOGGER.info("Agreement, Processing " + fm);
                proceed = true;
            } else {
                ignoreIt.log(fm);
            }

            if (Discard.Unknown == ignoreIt) {
                unknownFaultedSites.add(fm.failedSite);
            }
            fm = (FaultMessage) m_mailbox.recv(justFailures);

            // If fault resolution takes longer then 10 seconds start logging for every second
            final long now = System.currentTimeMillis();
            final long blockedTime = now - blockedOnReceiveStart;
            if (blockedTime > LOGGING_START) {
                if (now - lastReportTime > 1000) {
                    REJOIN_LOGGER.warn("Agreement, Failure resolution reporting stalled for " + TimeUnit.MILLISECONDS.toSeconds(blockedTime) + " seconds");
                    lastReportTime = now;
                }
            }
        } while (fm != null);

        if (!proceed) {
            return ImmutableMap.of();
        }

        m_inTroubleCount = m_inTrouble.size();

        // we are here if failed site was not previously recorded
        // or it was previously recorded but it became witnessed from unwitnessed
        m_seeker.startSeekingFor(Sets.difference(hsIds, m_failedSites), m_inTrouble);
        if (REJOIN_LOGGER.isDebugEnabled()) {
            REJOIN_LOGGER.debug(String.format("\n %s\n %s\n %s\n %s\n %s",
                                m_seeker.dumpAlive(), m_seeker.dumpDead(),
                                m_seeker.dumpReported(), m_seeker.dumpSurvivors(),
                                dumpInTrouble()));
        }

        discoverGlobalFaultData_send(hsIds);

        while (discoverGlobalFaultData_rcv(hsIds)) {
            final long now = System.currentTimeMillis();
            final long blockedTime = now - blockedOnReceiveStart;
            if (blockedTime > LOGGING_START) {
                if (now - lastReportTime > 1000) {
                    REJOIN_LOGGER.warn("Agreement, Failure global resolution stalled for " + TimeUnit.MILLISECONDS.toSeconds(blockedTime) + " seconds");
                    lastReportTime = now;
                }
            }
            Map lastTxnIdByFailedSite = extractGlobalFaultData(hsIds);
            if (lastTxnIdByFailedSite.isEmpty()) {
                return ImmutableMap.of();
            }

            Set witnessed = Maps.filterValues(m_inTrouble, equalTo(Boolean.TRUE)).keySet();
            Set notClosed = Sets.difference(witnessed, lastTxnIdByFailedSite.keySet());
            if ( !notClosed.isEmpty()) {
                REJOIN_LOGGER.warn("Agreement, witnessed but not decided: ["
                        + CoreUtils.hsIdCollectionToString(notClosed)
                        + "] seeker: " + m_seeker);
            }

            if (!notifyOnKill(hsIds, lastTxnIdByFailedSite)) {
                continue;
            }

            m_failedSites.addAll( lastTxnIdByFailedSite.keySet());
            m_failedSitesCount = m_failedSites.size();

            REJOIN_LOGGER.info(
                    "Agreement, Adding "
                  + CoreUtils.hsIdCollectionToString(lastTxnIdByFailedSite.keySet())
                  + " to failed sites history");

            clearInTrouble(lastTxnIdByFailedSite.keySet());
            m_seeker.clear();

            return lastTxnIdByFailedSite;
        }

        return ImmutableMap.of();
    }

    /**
     * Notify all survivors when you are closing links to nodes
     * @param decision map where the keys contain the kill sites
     *   and its values are their last known safe transaction ids
     * @return true if successfully confirmed that all survivors
     * agree on the decision, false otherwise.
     */
    protected boolean notifyOnKill(Set hsIds, Map decision) {

        SiteFailureMessage.Builder sfmb = SiteFailureMessage.
                builder()
                .decisions(decision.keySet())
                .failures(decision.keySet());

        Set dests = Sets.filter(m_seeker.getSurvivors(), not(equalTo(m_hsId)));
        if (dests.isEmpty()) return true;

        sfmb.survivors(Sets.difference(m_seeker.getSurvivors(), decision.keySet()));
        sfmb.safeTxnIds(getSafeTxnIdsForSites(hsIds));

        SiteFailureMessage sfm = sfmb.build();
        m_mailbox.send(Longs.toArray(dests), sfm);

        REJOIN_LOGGER.info("Agreement, Sending ["
                + CoreUtils.hsIdCollectionToString(dests) + "]  " + sfm);

        // Check to see we've made the same decision before, if so, it's likely
        // that we've entered a loop, exit here.
        if (m_localHistoricDecisions.size() >= 100) {
            // Too many decisions have been made without converging
            RateLimitedLogger.tryLogForMessage(System.currentTimeMillis(),
                                               10, TimeUnit.SECONDS,
                                               REJOIN_LOGGER,
                                               Level.WARN,
                                               "Agreement, %d local decisions have been made without converging",
                                               m_localHistoricDecisions.size());
        }
        for (SiteFailureMessage lhd : m_localHistoricDecisions) {
            if (lhd.m_survivors.equals(sfm.m_survivors)) {
                REJOIN_LOGGER.info("Agreement, detected decision loop. Exiting");
                return true;
            }
        }
        m_localHistoricDecisions.add(sfm);

        // Wait for all survivors in the local decision to send their decisions over.
        // If one of the host's decision conflicts with ours, remove that host's link
        // and repeat the decision process.
        final Set expectedSurvivors = Sets.filter(sfm.m_survivors, not(equalTo(m_hsId)));
        REJOIN_LOGGER.info("Agreement, Waiting for agreement on decision from survivors " +
                           CoreUtils.hsIdCollectionToString(expectedSurvivors));

        final Iterator iter = m_decidedSurvivors.values().iterator();
        while (iter.hasNext()) {
            final SiteFailureMessage remoteDecision = iter.next();
            if (expectedSurvivors.contains(remoteDecision.m_sourceHSId)) {
                if (remoteDecision.m_decision.contains(m_hsId)) {
                    iter.remove();
                    REJOIN_LOGGER.info("Agreement, Received inconsistent decision from " +
                                       CoreUtils.hsIdToString(remoteDecision.m_sourceHSId) + ", " + remoteDecision);
                    final FaultMessage localFault = new FaultMessage(m_hsId, remoteDecision.m_sourceHSId);
                    localFault.m_sourceHSId = m_hsId;
                    m_mailbox.deliverFront(localFault);
                    return false;
                }
            }
        }

        long start = System.currentTimeMillis();
        boolean allDecisionsMatch = true;
        long lastReportTime = 0;
        do {
            final VoltMessage msg = m_mailbox.recvBlocking(receiveSubjects, 5);
            if (msg == null) {
                // Send a heartbeat to keep the dead host timeout active.
                m_meshAide.sendHeartbeats(m_seeker.getSurvivors());
                // If fault resolution takes longer then 10 seconds start logging for every second
                final long now = System.currentTimeMillis();
                final long blockedTime = now - start;
                if (blockedTime > LOGGING_START) {
                    if (now - lastReportTime > 1000) {
                        REJOIN_LOGGER.warn("Agreement, Still waiting for decisions from " +
                                CoreUtils.hsIdCollectionToString(Sets.difference(expectedSurvivors, m_decidedSurvivors.keySet())) +
                                " after " + TimeUnit.MILLISECONDS.toSeconds(blockedTime) + " seconds");
                        lastReportTime = now;
                    }
                }
                continue;
            }

            if (m_hsId != msg.m_sourceHSId && !expectedSurvivors.contains(msg.m_sourceHSId)) {
                // Ignore messages from failed sites
                continue;
            }

            if (msg.getSubject() == Subject.SITE_FAILURE_UPDATE.getId()) {
                final SiteFailureMessage fm = (SiteFailureMessage) msg;
                if (!fm.m_decision.isEmpty()) {
                    if (expectedSurvivors.contains(fm.m_sourceHSId)) {
                        if (fm.m_decision.contains(m_hsId)) {
                            m_decidedSurvivors.remove(fm.m_sourceHSId);
                            // The remote host has decided that we are gone, remove the remote host
                            final FaultMessage localFault = new FaultMessage(m_hsId, fm.m_sourceHSId);
                            localFault.m_sourceHSId = m_hsId;
                            m_mailbox.deliverFront(localFault);
                            return false;
                        } else {
                            m_decidedSurvivors.put(fm.m_sourceHSId, fm);
                        }
                    }
                } else {
                    m_mailbox.deliverFront(fm);
                    return false;
                }
            } else if (msg.getSubject() == Subject.FAILURE.getId()) {
                final FaultMessage fm = (FaultMessage) msg;
                if (!fm.decided) {
                    // In case of concurrent fault, handle it
                    m_mailbox.deliverFront(msg);
                    return false;
                }

                /* So this is a final update message.
                 * For every final update message this node expects to receive, two messages will be
                 * actually send to ZookeeperServer thread. One is a fault message which is used to
                 * trigger a new round of message broadcasting, another is the original final update
                 * message which is used by mesh arbiter to reach the agreement.
                 */

                // Send the message to ZookeeperServer thread only if the final update message can give
                // us new information (new alive/dead host)
                if (!m_seeker.alreadyKnow(fm) && mayIgnore(hsIds, fm) == Discard.DoNot) {
                    m_mailbox.deliverFront(msg);
                    return false;
                }

            }

            for (SiteFailureMessage remoteDecision : m_decidedSurvivors.values()) {
                if (!sfm.m_survivors.equals(remoteDecision.m_survivors)) {
                    allDecisionsMatch = false;
                }
            }
        } while (!m_decidedSurvivors.keySet().containsAll(expectedSurvivors) && allDecisionsMatch);

        return true;
    }

    protected void clearInTrouble(Set decision) {
        m_forwardCandidates.clear();
        m_failedSitesLedger.clear();
        m_decidedSurvivors.clear();
        m_localHistoricDecisions.clear();
        m_inTrouble.clear();
        m_inTroubleCount = 0;
    }

    protected Map getSafeTxnIdsForSites(Set hsIds) {
        ImmutableMap.Builder safeb = ImmutableMap.builder();
        for (long h: Sets.filter(hsIds, not(equalTo(m_hsId)))) {
            safeb.put(h, m_meshAide.getNewestSafeTransactionForInitiator(h));
        }
        return safeb.build();
    }

    /**
     * Send one message to each surviving execution site providing this site's
     * multi-partition commit point and this site's safe txnid
     * (the receiver will filter the later for its
     * own partition). Do this once for each failed initiator that we know about.
     * Sends all data all the time to avoid a need for request/response.
     */
    private void discoverGlobalFaultData_send(Set hsIds) {
        Set dests = Sets.filter(m_seeker.getSurvivors(), not(equalTo(m_hsId)));

        SiteFailureMessage.Builder msgBuilder = SiteFailureMessage.
                builder()
                .survivors(m_seeker.getSurvivors())
                .failures(m_inTrouble.keySet())
                .safeTxnIds(getSafeTxnIdsForSites(hsIds));

        SiteFailureMessage sfm = msgBuilder.build();
        sfm.m_sourceHSId = m_hsId;

        updateFailedSitesLedger(hsIds, sfm);
        m_seeker.add(sfm);

        m_mailbox.send(Longs.toArray(dests), sfm);

        REJOIN_LOGGER.info("Agreement, Sending survivors " + sfm);
        if (REJOIN_LOGGER.isDebugEnabled()) {
            REJOIN_LOGGER.debug(String.format("\n %s\n %s\n %s\n %s\n %s",
                               m_seeker.dumpAlive(), m_seeker.dumpDead(),
                               m_seeker.dumpReported(), m_seeker.dumpSurvivors(),
                               dumpInTrouble()));
        }
    }

    protected void updateFailedSitesLedger(Set hsIds, SiteFailureMessage sfm) {
        for (Map.Entry e: sfm.m_safeTxnIds.entrySet()) {

            if(  !hsIds.contains(e.getKey())
               || m_hsId == e.getKey()
               || e.getKey() == sfm.m_sourceHSId) continue;

            m_failedSitesLedger.put(
                    Pair.of(sfm.m_sourceHSId, e.getKey()),
                    e.getValue());
        }
    }

    protected void addForwardCandidate(SiteFailureForwardMessage sffm) {
        SiteFailureForwardMessage prev = m_forwardCandidates.get(sffm.m_reportingHSId);

        if (prev != null && prev.m_survivors.size() < sffm.m_survivors.size()) return;

        m_forwardCandidates.put(sffm.m_reportingHSId, sffm);
    }

    /**
     * Collect the failure site update messages from all sites This site sent
     * its own mailbox the above broadcast the maximum is local to this site.
     * This also ensures at least one response.
     *
     * Concurrent failures can be detected by additional reports from the FaultDistributor
     * or a mismatch in the set of failed hosts reported in a message from another site
     */
    private boolean discoverGlobalFaultData_rcv(Set hsIds) {

        long blockedOnReceiveStart = System.currentTimeMillis();
        long lastReportTime = 0;
        boolean haveEnough = false;
        int [] forwardStallCount = new int[] {FORWARD_STALL_COUNT};

        do {
            VoltMessage m = m_mailbox.recvBlocking(receiveSubjects, 5);

            // If fault resolution takes longer then 10 seconds start logging for every second
            final long now = System.currentTimeMillis();
            final long blockedTime = now - blockedOnReceiveStart;
            if (blockedTime > LOGGING_START) {
                if (now - lastReportTime > 1000) {
                    haveNecessaryFaultInfo(m_seeker.getSurvivors(), true);
                    lastReportTime = now;
                }
            }

            if (m == null) {
                // Send a heartbeat to keep the dead host timeout active.  Needed because IV2 doesn't
                // generate its own heartbeats to keep this running.
                m_meshAide.sendHeartbeats(m_seeker.getSurvivors());

            } else if (m.getSubject() == Subject.SITE_FAILURE_UPDATE.getId()) {

                SiteFailureMessage sfm = (SiteFailureMessage) m;

                if (  !m_seeker.getSurvivors().contains(m.m_sourceHSId)
                    || m_failedSites.contains(m.m_sourceHSId)
                    || m_failedSites.containsAll(sfm.getFailedSites())) continue;

                if (!sfm.m_decision.isEmpty()) {
                    m_decidedSurvivors.put(sfm.m_sourceHSId, sfm);
                }

                updateFailedSitesLedger(hsIds, sfm);

                m_seeker.add(sfm);
                addForwardCandidate(new SiteFailureForwardMessage(sfm));

                REJOIN_LOGGER.info("Agreement, Received " + sfm);
                if (REJOIN_LOGGER.isDebugEnabled()) {
                    REJOIN_LOGGER.debug(String.format("\n %s\n %s\n %s\n %s\n %s",
                                       m_seeker.dumpAlive(), m_seeker.dumpDead(),
                                       m_seeker.dumpReported(), m_seeker.dumpSurvivors(),
                                       dumpInTrouble()));
                }

            } else if (m.getSubject() == Subject.SITE_FAILURE_FORWARD.getId()) {

                SiteFailureForwardMessage fsfm = (SiteFailureForwardMessage) m;

                addForwardCandidate(fsfm);

                if (   !hsIds.contains(fsfm.m_sourceHSId)
                    || m_seeker.getSurvivors().contains(fsfm.m_reportingHSId)
                    || m_failedSites.contains(fsfm.m_reportingHSId)
                    || m_failedSites.containsAll(fsfm.getFailedSites())) continue;

                m_seeker.add(fsfm);

                REJOIN_LOGGER.info("Agreement, Received forward " + fsfm);
                if (REJOIN_LOGGER.isDebugEnabled()) {
                    REJOIN_LOGGER.debug(String.format("\n %s\n %s\n %s\n %s\n %s",
                                        m_seeker.dumpAlive(), m_seeker.dumpDead(),
                                        m_seeker.dumpReported(), m_seeker.dumpSurvivors(),
                                        dumpInTrouble()));
                }

                forwardStallCount[0] = FORWARD_STALL_COUNT;

            } else if (m.getSubject() == Subject.FAILURE.getId()) {
                /*
                 * If the fault distributor reports a new fault, ignore it if it is known , otherwise
                 * re-deliver the message to ourself and then abort so that the process can restart.
                 */
                FaultMessage fm = (FaultMessage) m;

                Discard ignoreIt = mayIgnore(hsIds, fm);
                if (Discard.DoNot == ignoreIt) {
                    m_mailbox.deliverFront(m);
                    REJOIN_LOGGER.info("Agreement, Detected a concurrent failure from FaultDistributor, new failed site "
                            + CoreUtils.hsIdToString(fm.failedSite));
                    return false;
                } else {
                    if (REJOIN_LOGGER.isDebugEnabled()) {
                        ignoreIt.log(fm);
                    }
                }
            }

            haveEnough = haveEnough || haveNecessaryFaultInfo(m_seeker.getSurvivors(), false);
            if (haveEnough) {

                Iterator> itr =
                        m_forwardCandidates.entrySet().iterator();

                while (itr.hasNext()) {
                    Map.Entry e = itr.next();
                    Set unseenBy = m_seeker.forWhomSiteIsDead(e.getKey());
                    if (unseenBy.size() > 0) {
                        m_mailbox.send(Longs.toArray(unseenBy), e.getValue());
                        REJOIN_LOGGER.info("Agreement, fowarding to "
                                + CoreUtils.hsIdCollectionToString(unseenBy)
                                + " " + e.getValue());
                    }
                    itr.remove();
                }
            }

        } while (!haveEnough || m_seeker.needForward(forwardStallCount));

        return true;
    }

    private boolean haveNecessaryFaultInfo( Set survivors, boolean log) {
        List> missingMessages = new ArrayList>();
        for (long survivingSite : survivors) {
            for (Long failingSite : m_inTrouble.keySet()) {
                Pair key = Pair.of( survivingSite, failingSite);
                if (   survivingSite != failingSite
                    && !m_failedSitesLedger.containsKey(key)) {
                    missingMessages.add(key);
                }
            }
        }
        if (log) {
            StringBuilder sb = new StringBuilder();
            sb.append('[');
            boolean first = true;
            for (Pair p : missingMessages) {
                if (!first) sb.append(", ");
                first = false;
                sb.append(CoreUtils.hsIdToString(p.getFirst()));
                sb.append("+>");
                sb.append(CoreUtils.hsIdToString(p.getSecond()));
            }
            sb.append(']');
            if (missingMessages.isEmpty() && m_seeker.needForward()) {
                sb.append(" ");
                sb.append(m_seeker);
            }

            REJOIN_LOGGER.warn("Agreement, Failure resolution stalled waiting for (Reporter +> Failed) " +
                                "information: " + sb.toString());
        }
        return missingMessages.isEmpty();
    }

    private Map extractGlobalFaultData(Set hsIds) {

        if (!haveNecessaryFaultInfo(m_seeker.getSurvivors(), false)) {
            VoltDB.crashLocalVoltDB("Error extracting fault data", true, null);
        }

        Set toBeKilled = m_seeker.nextKill();
        if (toBeKilled.isEmpty()) {
            REJOIN_LOGGER.warn("Agreement, seeker failed to yield a kill set: "+m_seeker);
        }

        Map initiatorSafeInitPoint = new HashMap();

        Iterator, Long>> iter =
            m_failedSitesLedger.entrySet().iterator();

        while (iter.hasNext()) {

            final Map.Entry, Long> entry = iter.next();
            final Pair key = entry.getKey();
            final Long safeTxnId = entry.getValue();

            if (   !hsIds.contains(key.getFirst())
                || !toBeKilled.contains(key.getSecond())) {
                continue;
            }

            Long initiatorId = key.getSecond();
            if (!initiatorSafeInitPoint.containsKey(initiatorId)) {
                initiatorSafeInitPoint.put( initiatorId, Long.MIN_VALUE);
            }

            initiatorSafeInitPoint.put( initiatorId,
                    Math.max(initiatorSafeInitPoint.get(initiatorId), safeTxnId));
        }

        assert(!initiatorSafeInitPoint.containsValue(Long.MIN_VALUE));

        return ImmutableMap.copyOf(initiatorSafeInitPoint);
    }

    public String dumpInTrouble() {
        StringBuilder sb = new StringBuilder();
        sb.append("InTrouble: ");
        sb.append("{ ");
        int count = 0;
        for (Map.Entry e : m_inTrouble.entrySet()) {
            if (count++ > 0) sb.append(", ");
            sb.append(CoreUtils.hsIdToString(e.getKey())).append(":").append(e.getValue());
        };
        sb.append(" }");
        return sb.toString();
    }
}