All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.splout.db.qnode.Deployer Maven / Gradle / Ivy

Go to download

Splout SQL is a read only, horizontally scalable and partitioned SQL database that plays well with Hadoop.

The newest version!
package com.splout.db.qnode;

/*
 * #%L
 * Splout SQL Server
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * #L%
 */

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import com.splout.db.common.JSONSerDe;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.thrift.transport.TTransportException;

import com.hazelcast.core.ICountDownLatch;
import com.hazelcast.core.IMap;
import com.splout.db.common.PartitionEntry;
import com.splout.db.common.ReplicationEntry;
import com.splout.db.common.Tablespace;
import com.splout.db.hazelcast.CoordinationStructures;
import com.splout.db.hazelcast.TablespaceVersion;
import com.splout.db.qnode.beans.DeployInfo;
import com.splout.db.qnode.beans.DeployRequest;
import com.splout.db.qnode.beans.DeployStatus;
import com.splout.db.qnode.beans.QueryStatus;
import com.splout.db.qnode.beans.SwitchVersionRequest;
import com.splout.db.thrift.DNodeService;
import com.splout.db.thrift.DeployAction;
import com.splout.db.thrift.PartitionMetadata;

/**
 * The Deployer is a specialized module ({@link com.splout.db.qnode.QNodeHandlerModule}) of the
 * {@link com.splout.db.qnode.QNode} that performs the business logic associated with a distributed deployment. It is
 * used by the {@link com.splout.db.qnode.QNodeHandler}.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public class Deployer extends QNodeHandlerModule {

	private final static Log log = LogFactory.getLog(Deployer.class);
	private ExecutorService deployThread;

	@SuppressWarnings("serial")
	public static class UnexistingVersion extends Exception {

		public UnexistingVersion() {
			super();
		}

		public UnexistingVersion(String message) {
			super(message);
		}
	}

	/**
	 * Runnable that deals with the asynchronous part of the deployment. Particularly, it waits until DNodes finish their
	 * work, and then performs the version switch.
	 */
	public class ManageDeploy implements Runnable {

		// Number of seconds to wait until another
		// check to see if timeout was reached or
		// if a DNode failed.
		private long secondsToCheckFailureOrTimeout = 60l;

		private long version;
		private List dnodes;
		private long timeoutSeconds;
		private List deployRequests;
    private long dnodesSpreadMetadataTimeout;
		private boolean isReplicaBalancingEnabled;

		public ManageDeploy(List dnodes, List deployRequests, long version,
		    long timeoutSeconds, long secondsToCheckFailureOrTimeout, long dnodesSpreadMetadataTimeout, boolean isReplicaBalancingEnabled) {
			this.dnodes = dnodes;
			this.deployRequests = deployRequests;
			this.version = version;
			this.timeoutSeconds = timeoutSeconds;
			this.secondsToCheckFailureOrTimeout = secondsToCheckFailureOrTimeout;
      this.dnodesSpreadMetadataTimeout = Math.max(dnodesSpreadMetadataTimeout, 1);
			this.isReplicaBalancingEnabled = isReplicaBalancingEnabled;
		}

		@Override
		public void run() {
			log.info(context.getConfig().getProperty(QNodeProperties.PORT) + " Executing deploy for version ["
			    + version + "]");
			CoordinationStructures.DEPLOY_IN_PROGRESS.incrementAndGet();

			try {
				long waitSeconds = 0;
				ICountDownLatch countDownLatchForDeploy = context.getCoordinationStructures()
				    .getCountDownLatchForDeploy(version);
				boolean finished;
				do {
					finished = countDownLatchForDeploy.await(secondsToCheckFailureOrTimeout, TimeUnit.SECONDS);
					waitSeconds += secondsToCheckFailureOrTimeout;
					if(!finished) {
						// If any of the DNodes failed, then we cancel the deployment.
						if(checkForFailure()) {
							explainErrors();
							abortDeploy(dnodes, "One or more DNodes failed", version);
							return;
						}
						// Let's see if we reached the timeout.
						// Negative timeoutSeconds => waits forever
						if(waitSeconds > timeoutSeconds && timeoutSeconds >= 0) {
							log.warn("Deploy of version [" + version + "] timed out. Reached [" + waitSeconds
							    + "] seconds.");
							abortDeploy(dnodes, "Timeout reached", version);
							return;
						}
					}
				} while(!finished);

				// It's still possible that the deploy failed so let's check it again
				if(checkForFailure()) {
					explainErrors();
					abortDeploy(dnodes, "One or more DNodes failed.", version);
					return;
				}

				// Check after the wait than the complete tablespaces are available to that QNode. If that is the
				// case for this QNode it will be probably the case for the rest of QNodes.
        long millisToWait = 50;
        double acumulatedMillis = 0.;
				List versionsToCheck = switchActions();
				do {
					Thread.sleep(millisToWait);
          acumulatedMillis += millisToWait;

          // Let's see if we reached the timeout.
          // Negative timeoutSeconds => waits forever
          if((acumulatedMillis/1000) > dnodesSpreadMetadataTimeout) {
            log.warn("Deploy of version [" + version + "] timed out when waiting DNodes to spread the metadata. Reached [" + (acumulatedMillis/1000)
                + "] seconds.");
            abortDeploy(dnodes, "Timeout reached", version);
            return;
          }

					Iterator it = versionsToCheck.iterator();
					while(it.hasNext()) {
						SwitchVersionRequest req = it.next();
						Tablespace t = context.getTablespaceVersionsMap().get(
						    new TablespaceVersion(req.getTablespace(), req.getVersion()));
						// Check that this TablespaceVersion has been reported by some node through Hazelcast
						if(t != null && t.getReplicationMap() != null && t.getPartitionMap() != null
						    && t.getPartitionMap().getPartitionEntries() != null
						    && t.getReplicationMap().getReplicationEntries() != null
						    && t.getReplicationMap().getReplicationEntries().size() > 0) {
							if(t.getPartitionMap().getPartitionEntries().size() == t.getReplicationMap()
							    .getReplicationEntries().size()) {
								log.info("Ok, TablespaceVersion [" + req.getTablespace() + ", " + req.getVersion()
								    + "] being handled by enough DNodes as reported by Hazelcast. ("
								    + t.getReplicationMap().getReplicationEntries() + ")");
								it.remove();
							}
						}
					}
				} while(versionsToCheck.size() > 0);

				log.info("All DNodes performed the deploy of version [" + version
				    + "]. Publishing tablespaces...");

				// We finish by publishing the versions table with the new versions.
				try {
					switchVersions(switchActions());
				} catch(UnexistingVersion e) {
					throw new RuntimeException(
					    "Unexisting version after deploying this version. Sounds like a bug.", e);
				}

				// If some replicas are under-replicated, start a balancing process
				context.maybeBalance();

				log.info("Deploy of version [" + version + "] Finished PROPERLY. :-)");
				context.getCoordinationStructures().logDeployMessage(version,
				    "Deploy of version [" + version + "] finished properly.");
				context.getCoordinationStructures().getDeploymentsStatusPanel()
				    .put(version, DeployStatus.FINISHED);
			} catch(InterruptedException e) {
				log.error("Error while deploying version [" + version + "]", e);
				abortDeploy(dnodes, e.getMessage(), version);
			} catch(Throwable t) {
				t.printStackTrace();
				throw new RuntimeException(t);
			} finally {
				CoordinationStructures.DEPLOY_IN_PROGRESS.decrementAndGet();
			}
		}

		/**
		 * Compose the list of switch actions to switch
		 * 
		 * @return
		 */
		private List switchActions() {
			ArrayList actions = new ArrayList();
			for(DeployRequest req : deployRequests) {
				actions.add(new SwitchVersionRequest(req.getTablespace(), version));
			}
			return actions;
		}

		/**
		 * Log DNodes errors in deployment. We log both to the QNode logger and to Hazelcast so the info is persisted in the
		 * session.
		 */
		private void explainErrors() {
			IMap deployErrorPanel = context.getCoordinationStructures().getDeployErrorPanel(
			    version);
			String msg = "Deployment of version [" + version + "] failed in DNode[";
			for(Entry entry : deployErrorPanel.entrySet()) {
				String fMsg = msg + entry.getKey() + "] - it failed with the error [" + entry.getValue() + "]";
				log.error(fMsg);
				context.getCoordinationStructures().logDeployMessage(version, fMsg);
			}
		}

		/**
		 * Return true if one or more of the DNodes reported an error.
		 */
		private boolean checkForFailure() {
			IMap deployErrorPanel = context.getCoordinationStructures().getDeployErrorPanel(
			    version);
			if(!isReplicaBalancingEnabled) {
				return !deployErrorPanel.isEmpty();
			}
			// If replica balancing is enabled we check whether we could survive after the failed DNodes
			Set failedDNodes = new HashSet(deployErrorPanel.keySet());
			// Check if deploy needs to be canceled or if the system could auto-rebalance itself afterwards
			for(DeployRequest deployRequest : deployRequests) {
				for(ReplicationEntry repEntry : deployRequest.getReplicationMap()) {
					if(failedDNodes.containsAll(repEntry.getNodes())) {
						// There is AT LEAST one partition that depends on the failed DNodes so the deploy must fail!
						return true;
					}
				}
			}
			return false;
		}
	} /* End ManageDeploy */

	/**
	 * The Deployer deals with deploy and switch version requests.
	 */
	public Deployer(QNodeHandlerContext context) {
		super(context);
		deployThread = Executors.newFixedThreadPool(1);
	}

	/**
	 * Call this method for starting an asynchronous deployment given a proper deploy request - proxy method for
	 * {@link QNodeHandler}. Returns a {@link QueryStatus} with the status of the request.
	 * 
	 * @throws InterruptedException
	 */
	public DeployInfo deploy(List deployRequests) throws InterruptedException {
		DeployInfo deployInfo = new DeployInfo();

		// A new unique version number is generated.
		long version = context.getCoordinationStructures().uniqueVersionId();
		deployInfo.setVersion(version);

		List tablespaces = new ArrayList();
		List dataURIs = new ArrayList();

		for(DeployRequest request : deployRequests) {
			tablespaces.add(request.getTablespace());
			dataURIs.add(request.getData_uri());
		}

		deployInfo.setTablespacesDeployed(tablespaces);
		deployInfo.setDataURIs(dataURIs);

		Date startTime = new Date();
		deployInfo.setStartedAt(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(startTime));

		context.getCoordinationStructures().logDeployMessage(version,
		    "Deploy [" + version + "] for tablespaces " + tablespaces + " started.");
		context.getCoordinationStructures().getDeploymentsStatusPanel().put(version, DeployStatus.ONGOING);

		// Generate the list of actions per DNode
		Map> actionsPerDNode = generateDeployActionsPerDNode(deployRequests,
		    version);

		// Starting the countdown latch.
		ICountDownLatch countDownLatchForDeploy = context.getCoordinationStructures()
		    .getCountDownLatchForDeploy(version);
		Set dnodesInvolved = actionsPerDNode.keySet();
		countDownLatchForDeploy.trySetCount(dnodesInvolved.size());

		// Sending deploy signals to each DNode
		for(Map.Entry> actionPerDNode : actionsPerDNode.entrySet()) {
			DNodeService.Client client = null;
			boolean renew = false;
			try {
				try {
					client = context.getDNodeClientFromPool(actionPerDNode.getKey());
				} catch(TTransportException e) {
					renew = true;
					throw e;
				}
				client.deploy(actionPerDNode.getValue(), version);
			} catch(Exception e) {
				String errorMsg = "Error sending deploy actions to DNode [" + actionPerDNode.getKey() + "]";
				log.error(errorMsg, e);
				abortDeploy(new ArrayList(actionsPerDNode.keySet()), errorMsg, version);
				deployInfo.setError("Error connecting to DNode " + actionPerDNode.getKey());
				context.getCoordinationStructures().getDeployInfoPanel().put(version, deployInfo);
				return deployInfo;
			} finally {
				context.returnDNodeClientToPool(actionPerDNode.getKey(), client, renew);
			}
		}

		// Initiating an asynchronous process to manage the deployment
		deployThread.execute(new ManageDeploy(new ArrayList(actionsPerDNode.keySet()), deployRequests,
		    version, context.getConfig().getLong(QNodeProperties.DEPLOY_TIMEOUT, -1), context.getConfig()
		        .getLong(QNodeProperties.DEPLOY_SECONDS_TO_CHECK_ERROR),
        context.getConfig().getLong(QNodeProperties.DEPLOY_DNODES_SPREAD_METADATA_TIMEOUT, 180),
        context.getConfig().getBoolean(QNodeProperties.REPLICA_BALANCE_ENABLE)));

		context.getCoordinationStructures().getDeployInfoPanel().put(version, deployInfo);
		return deployInfo;
	}

	/**
	 * DNodes are informed to stop the deployment, as something failed.
	 * 
	 * @throws InterruptedException
	 */
	public void abortDeploy(List dnodes, String deployerErrorMessage, long version) {
		for(String dnode : dnodes) {
			DNodeService.Client client = null;
			boolean renew = false;
			try {
				try {
					client = context.getDNodeClientFromPool(dnode);
				} catch(TTransportException e) {
					renew = true;
					throw e;
				}
				client.abortDeploy(version);
			} catch(Exception e) {
				log.error("Error sending abort deploy flag to DNode [" + dnode + "]", e);
			} finally {
				if(client != null) {
					context.returnDNodeClientToPool(dnode, client, renew);
				}
			}
		}
		context.getCoordinationStructures().logDeployMessage(version,
		    "Deploy failed due to: " + deployerErrorMessage);
		context.getCoordinationStructures().getDeploymentsStatusPanel().put(version, DeployStatus.FAILED);
		CoordinationStructures.DEPLOY_IN_PROGRESS.decrementAndGet();
	}

	/**
	 * Switches current versions being served for some tablespaces, in an atomic way.
	 */
	public void switchVersions(List switchRequest) throws UnexistingVersion {
		// We compute the new versions table, and then try to update it
		// We use optimistic locking: we read the original
		// map and try to update it. If the original has changed during
		// this process, we retry: reload the original map, ...
		Map versionsTable;
		Map newVersionsTable;
		do {
			versionsTable = context.getCoordinationStructures().getCopyVersionsBeingServed();
			newVersionsTable = new HashMap();
			if(versionsTable != null) {
				newVersionsTable.putAll(versionsTable);
			}

			for(SwitchVersionRequest req : switchRequest) {
				TablespaceVersion tsv = new TablespaceVersion(req.getTablespace(), req.getVersion());
				newVersionsTable.put(tsv.getTablespace(), tsv.getVersion());
			}
		} while(!context.getCoordinationStructures().updateVersionsBeingServed(versionsTable,
		    newVersionsTable));
	}

	/**
	 * Generates the list of individual deploy actions that has to be sent to each DNode.
	 */
	private static Map> generateDeployActionsPerDNode(
	    List deployRequests, long version) {
		HashMap> actions = new HashMap>();

		long deployDate = System.currentTimeMillis(); // Here is where we decide the data of the deployment for all deployed
		                                              // tablespaces

		for(DeployRequest req : deployRequests) {
			for(Object obj : req.getReplicationMap()) {
				ReplicationEntry rEntry = (ReplicationEntry) obj;
				PartitionEntry pEntry = null;
				for(PartitionEntry partEntry : req.getPartitionMap()) {
					if(partEntry.getShard().equals(rEntry.getShard())) {
						pEntry = partEntry;
					}
				}
				if(pEntry == null) {
          String msg = "No Partition metadata for shard: " + rEntry.getShard()
              + " this is very likely to be a software bug.";
          log.error(msg);
          try {
            log.error("Partition map: " + JSONSerDe.ser(req.getPartitionMap()));
            log.error("Replication map: " + JSONSerDe.ser(req.getReplicationMap()));
          } catch (JSONSerDe.JSONSerDeException e) {
            log.error("JSON error", e);
          }
          throw new RuntimeException(msg);
				}
				// Normalize DNode ids -> The convention is that DNodes are identified by host:port . So we need to strip the
				// protocol, if any
				for(int i = 0; i < rEntry.getNodes().size(); i++) {
					String dnodeId = rEntry.getNodes().get(i);
					if(dnodeId.startsWith("tcp://")) {
						dnodeId = dnodeId.substring("tcp://".length(), dnodeId.length());
					}
					rEntry.getNodes().set(i, dnodeId);
				}
				for(String dNode : rEntry.getNodes()) {
					List actionsSoFar = (List) MapUtils.getObject(actions, dNode,
					    new ArrayList());
					actions.put(dNode, actionsSoFar);
					DeployAction deployAction = new DeployAction();
					deployAction.setDataURI(req.getData_uri() + "/" + rEntry.getShard() + ".db");
					deployAction.setTablespace(req.getTablespace());
					deployAction.setVersion(version);
					deployAction.setPartition(rEntry.getShard());

					// Add partition metadata to the deploy action for DNodes to save it
					PartitionMetadata metadata = new PartitionMetadata();
					metadata.setMinKey(pEntry.getMin());
					metadata.setMaxKey(pEntry.getMax());
					metadata.setNReplicas(rEntry.getNodes().size());
					metadata.setDeploymentDate(deployDate);
					metadata.setInitStatements(req.getInitStatements());
					metadata.setEngineId(req.getEngine());

					deployAction.setMetadata(metadata);
					actionsSoFar.add(deployAction);
				}
			}
		}
		return actions;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy