org.voltdb.iv2.MpInitiator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of voltdb Show documentation
VoltDB server libraries
There is a newer version: 10.1.1
/* This file is part of VoltDB.
 * Copyright (C) 2008-2018 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see .
 */

package org.voltdb.iv2;

import java.util.List;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ExecutionException;

import org.apache.zookeeper_voltpatches.KeeperException;
import org.apache.zookeeper_voltpatches.ZooKeeper;
import org.voltcore.messaging.HostMessenger;
import org.voltcore.messaging.TransactionInfoBaseMessage;
import org.voltcore.utils.CoreUtils;
import org.voltcore.zk.LeaderElector;
import org.voltdb.BackendTarget;
import org.voltdb.CatalogContext;
import org.voltdb.CommandLog;
import org.voltdb.MemoryStats;
import org.voltdb.ProducerDRGateway;
import org.voltdb.Promotable;
import org.voltdb.StartAction;
import org.voltdb.StatsAgent;
import org.voltdb.TTLManager;
import org.voltdb.VoltDB;
import org.voltdb.VoltZK;
import org.voltdb.iv2.RepairAlgo.RepairResult;
import org.voltdb.messaging.DumpMessage;
import org.voltdb.messaging.Iv2InitiateTaskMessage;

/**
 * Subclass of Initiator to manage multi-partition operations.
 * This class is primarily used for object construction and configuration plumbing;
 * Try to avoid filling it with lots of other functionality.
 */
public class MpInitiator extends BaseInitiator implements Promotable
{
    public static final int MP_INIT_PID = TxnEgo.PARTITIONID_MAX_VALUE;

    public MpInitiator(HostMessenger messenger, List buddyHSIds, StatsAgent agent, int leaderNodeId)
    {
        super(VoltZK.iv2mpi,
                messenger,
                MP_INIT_PID,
                new MpScheduler(
                    MP_INIT_PID,
                    buddyHSIds,
                    new SiteTaskerQueue(MP_INIT_PID),
                    leaderNodeId),
                "MP",
                agent,
                StartAction.CREATE /* never for rejoin */);
    }

    @Override
    public void configure(BackendTarget backend,
                          CatalogContext catalogContext,
                          String serializedCatalog,
                          int numberOfPartitions,
                          StartAction startAction,
                          StatsAgent agent,
                          MemoryStats memStats,
                          CommandLog cl,
                          String coreBindIds,
                          boolean isLowestSiteId)
        throws KeeperException, InterruptedException, ExecutionException
    {
        // note the mp initiator always uses a non-ipc site, even though it's never used for anything
        if (backend.isValgrindTarget) {
            backend = BackendTarget.NATIVE_EE_JNI;
        }

        super.configureCommon(backend, catalogContext, serializedCatalog,
                numberOfPartitions, startAction, null, null, cl, coreBindIds, false);
        // Hacky
        MpScheduler sched = (MpScheduler)m_scheduler;
        MpRoSitePool sitePool = new MpRoSitePool(m_initiatorMailbox.getHSId(),
                backend,
                catalogContext,
                m_partitionId,
                m_initiatorMailbox);
        sched.setMpRoSitePool(sitePool);

        // add ourselves to the ephemeral node list which BabySitters will watch for this
        // partition
        LeaderElector.createParticipantNode(m_messenger.getZK(),
                LeaderElector.electionDirForPartition(VoltZK.leaders_initiators, m_partitionId),
                Long.toString(getInitiatorHSId()), null);
    }

    @Override
    public void initDRGateway(StartAction startAction, ProducerDRGateway nodeDRGateway, boolean createMpDRGateway)
    {
        // No-op on MPI
    }

    @Override
    public void acceptPromotion()
    {
        try {
            long startTime = System.currentTimeMillis();
            Boolean success = false;
            // Look for the previous MPI (if any)
            int deadMPIHost = Integer.MAX_VALUE;
            Cartographer cartographer = VoltDB.instance().getCartographer();
            if (cartographer != null) {
                Long deadMPIHSId = cartographer.getHSIdForMultiPartitionInitiator();
                if (deadMPIHSId != null) {
                    deadMPIHost = CoreUtils.getHostIdFromHSId(deadMPIHSId);
                }
            }
            m_term = createTerm(m_messenger.getZK(),
                    m_partitionId, getInitiatorHSId(), m_initiatorMailbox,
                    m_whoami);
            m_term.start();
            while (!success) {
                final RepairAlgo repair = m_initiatorMailbox.constructRepairAlgo(m_term.getInterestingHSIds(),
                                deadMPIHost, m_whoami, false);

                // term syslogs the start of leader promotion.
                long txnid = Long.MIN_VALUE;
                try {
                    RepairResult res = repair.start().get();
                    txnid = res.m_txnId;
                    success = true;
                } catch (CancellationException e) {
                    success = false;
                }
                if (success) {
                    m_initiatorMailbox.setLeaderState(txnid);
                    List restartTxns = ((MpPromoteAlgo)repair).getInterruptedTxns();
                    if (!restartTxns.isEmpty()) {
                        // Should only be one restarting MP txn
                        if (restartTxns.size() > 1) {
                            tmLog.fatal("Detected a fatal condition while repairing multipartition transactions " +
                                    "following a cluster topology change.");
                            tmLog.fatal("The MPI found multiple transactions requiring restart: ");
                            for (Iv2InitiateTaskMessage txn : restartTxns) {
                                tmLog.fatal("Restart candidate: " + txn);
                            }
                            tmLog.fatal("This node will fail.  Please contact VoltDB support with your cluster's " +
                                    "log files.");
                            m_initiatorMailbox.send(
                                    com.google_voltpatches.common.primitives.Longs.toArray(m_term.getInterestingHSIds().get()),
                                    new DumpMessage());
                            throw new RuntimeException("Failing promoted MPI node with unresolvable repair condition.");
                        }
                        tmLog.debug(m_whoami + " restarting MP transaction: " + restartTxns.get(0));
                        Iv2InitiateTaskMessage firstMsg = restartTxns.get(0);
                        assert(firstMsg.getTruncationHandle() == TransactionInfoBaseMessage.UNUSED_TRUNC_HANDLE);
                        m_initiatorMailbox.repairReplicasWith(null, firstMsg);
                    }
                    tmLog.info(m_whoami
                             + "finished leader promotion. Took "
                             + (System.currentTimeMillis() - startTime) + " ms.");

                    // THIS IS where map cache should be updated, not
                    // in the promotion algorithm.
                    LeaderCacheWriter iv2masters = new LeaderCache(m_messenger.getZK(),
                            m_zkMailboxNode);
                    iv2masters.put(m_partitionId, m_initiatorMailbox.getHSId());
                    TTLManager.instance().scheduleTTLTasks();
                }
                else {
                    // The only known reason to fail is a failed replica during
                    // recovery; that's a bounded event (by k-safety).
                    // CrashVoltDB here means one node failure causing another.
                    // Don't create a cascading failure - just try again.
                    tmLog.info(m_whoami
                             + "interrupted during leader promotion after "
                             + (System.currentTimeMillis() - startTime) + " ms. of "
                             + "trying. Retrying.");
                }
            }
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("Terminally failed leader promotion.", true, e);
        }
    }

    /**
     * The MPInitiator does not have user data to rejoin.
     */
    @Override
    public boolean isRejoinable()
    {
        return false;
    }

    @Override
    public Term createTerm(ZooKeeper zk, int partitionId, long initiatorHSId, InitiatorMailbox mailbox,
            String whoami)
    {
        return new MpTerm(zk, initiatorHSId, mailbox, whoami);
    }

    /**
     * Update the MPI's Site's catalog.  Unlike the SPI, this is not going to
     * run from the same Site's thread; this is actually going to run from some
     * other local SPI's Site thread.  Since the MPI's site thread is going to
     * be blocked running the EveryPartitionTask for the catalog update, this
     * is currently safe with no locking.  And yes, I'm a horrible person.
     */
    public void updateCatalog(String diffCmds, CatalogContext context, boolean isReplay,
            boolean requireCatalogDiffCmdsApplyToEE, boolean requiresNewExportGeneration)
    {
        // note this will never require snapshot isolation because the MPI has no snapshot funtionality
        m_executionSite.updateCatalog(diffCmds, context, false, true, Long.MIN_VALUE, Long.MIN_VALUE, Long.MIN_VALUE,
                isReplay, requireCatalogDiffCmdsApplyToEE, requiresNewExportGeneration);
        MpScheduler sched = (MpScheduler)m_scheduler;
        sched.updateCatalog(diffCmds, context);
    }

    public void updateSettings(CatalogContext context)
    {
        m_executionSite.updateSettings(context);
        MpScheduler sched = (MpScheduler)m_scheduler;
        sched.updateSettings(context);
    }

    @Override
    public void enableWritingIv2FaultLog() {
        m_initiatorMailbox.enableWritingIv2FaultLog();
    }
}