package com.splout.db.dnode;

/*
 * #%L
 * Splout SQL Server
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * #L%
 */

import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import net.sf.ehcache.Cache;
import net.sf.ehcache.Element;

import org.apache.commons.io.FileSystemUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.hazelcast.core.EntryEvent;
import com.hazelcast.core.EntryListener;
import com.hazelcast.core.Hazelcast;
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.core.ICountDownLatch;
import com.splout.db.benchmark.PerformanceTool;
import com.splout.db.common.JSONSerDe;
import com.splout.db.common.JSONSerDe.JSONSerDeException;
import com.splout.db.common.SploutConfiguration;
import com.splout.db.common.ThriftReader;
import com.splout.db.common.ThriftWriter;
import com.splout.db.dnode.beans.BalanceFileReceivingProgress;
import com.splout.db.dnode.beans.DNodeStatusResponse;
import com.splout.db.dnode.beans.DNodeSystemStatus;
import com.splout.db.engine.EngineManager;
import com.splout.db.engine.ManagerFactory;
import com.splout.db.hazelcast.CoordinationStructures;
import com.splout.db.hazelcast.DNodeInfo;
import com.splout.db.hazelcast.DistributedRegistry;
import com.splout.db.hazelcast.HazelcastConfigBuilder;
import com.splout.db.hazelcast.HazelcastConfigBuilder.HazelcastConfigBuilderException;
import com.splout.db.hazelcast.HazelcastProperties;
import com.splout.db.qnode.ReplicaBalancer;
import com.splout.db.qnode.ReplicaBalancer.BalanceAction;
import com.splout.db.thrift.DNodeException;
import com.splout.db.thrift.DeployAction;
import com.splout.db.thrift.PartitionMetadata;
import com.splout.db.thrift.RollbackAction;

/**
 * The business logic for the DNode: responding to queries, downloading new deployments, handling Hazelcast
 * coordination events and so forth.
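 * <p>
 * A minimal lifecycle sketch (illustrative only; it assumes a {@link SploutConfiguration} has already been
 * loaded, e.g. from the standard property files):
 * 
 * <pre>
 * {@code
 * DNodeHandler handler = new DNodeHandler();
 * handler.init(config);     // connects to Hazelcast, starts the HTTP file exchanger, etc.
 * handler.giveGreenLigth(); // registers this DNode in the cluster so it can start serving
 * // ... serve queries, receive deployments ...
 * handler.stop();           // disposes caches, thread pools and cluster connections
 * }
 * </pre>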
 */
public class DNodeHandler implements IDNodeHandler {

	private final static Log log = LogFactory.getLog(DNodeHandler.class);

	protected SploutConfiguration config;
	private HazelcastInstance hz;
	private DistributedRegistry dnodesRegistry;
	private CoordinationStructures coord;
	private HttpFileExchanger httpExchanger;

	// EHCache instance holding one {@link EngineManager} (e.g. a SQL connection pool) per open tablespace/version/partition
	Cache dbCache;

	protected ExecutorService deployThread;
	protected Object deployLock = new Object();

	// Number of deploys currently in progress. This counter is also needed for unit testing.
	protected AtomicInteger deployInProgress = new AtomicInteger(0);
	// Indicates that the last deploy failed because of timeout. This info can then be answered via a status() request.
	AtomicBoolean lastDeployTimedout = new AtomicBoolean(false);

	// Thrift exception code used in DNodeException
	public final static int EXCEPTION_ORDINARY = 0;
	public final static int EXCEPTION_UNEXPECTED = 1;

	// A hard limit on the number of results that this DNode can return per SQL query
	private int maxResultsPerQuery;

	// The following variables are used for monitoring and providing statistics:
	private PerformanceTool performanceTool = new PerformanceTool();
	private String lastException = null;
	private long lastExceptionTime;
	private AtomicInteger failedQueries = new AtomicInteger(0);
	private long upSince;

	// The {@link Fetcher} is responsible for downloading new deployment data.
	private Fetcher fetcher;

	// Queries that take longer than this limit are logged as slow queries
	private long absoluteSlowQueryLimit;
	private long slowQueries = 0;

  // Deploy parallelism
  private int deployParallelism;

	// This map will hold all the current balance file transactions being done
	private ConcurrentHashMap<String, BalanceFileReceivingProgress> balanceActionsStateMap = new ConcurrentHashMap<String, BalanceFileReceivingProgress>();

	// The factory we will use to instantiate managers for each partition associated with an {@link SploutEngine}
	private ManagerFactory factory;

	public DNodeHandler(Fetcher fetcher) {
		this.fetcher = fetcher;
	}

	public DNodeHandler() {
	}

	/**
	 * Returns the address (host:port) of this DNode.
	 */
	public String whoAmI() {
		return config.getString(DNodeProperties.HOST) + ":" + config.getInt(DNodeProperties.PORT);
	}

	public String httpExchangerAddress() {
		return "http://" + config.getString(DNodeProperties.HOST) + ":"
		    + config.getInt(HttpFileExchangerProperties.HTTP_PORT);
	}

	/**
	 * This inner class listens for additions to the balance actions map, so that if a balance action has to be taken
	 * and this DNode is the one that has to send the file, it will start sending it.
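	 * The receiving side is handled by {@link FileReceiverCallback}, which moves the received files into place once
	 * both the .db and the .meta file have arrived.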
	 */
	private class BalanceActionItemListener implements
	    EntryListener<BalanceAction, String> {

		@Override
		public void entryAdded(EntryEvent<BalanceAction, String> event) {
			BalanceAction action = event.getKey();
			if(action.getOriginNode().equals(whoAmI())) {
				// I must do a balance action!
				File toSend = new File(getLocalStorageFolder(action.getTablespace(), action.getPartition(),
				    action.getVersion()), action.getPartition() + ".db");
				File metadataFile = getLocalMetadataFile(action.getTablespace(), action.getPartition(),
				    action.getVersion());
				// send both the .db and the .meta file -> when the other end has both files it will move them atomically...
				httpExchanger.send(action.getTablespace(), action.getPartition(), action.getVersion(), toSend,
				    action.getFinalNode(), false);
				httpExchanger.send(action.getTablespace(), action.getPartition(), action.getVersion(),
				    metadataFile, action.getFinalNode(), false);
			}
		}

		@Override
		public void entryRemoved(EntryEvent<BalanceAction, String> event) {
			// usually we won't care - but the final DNode might have proactively removed this action
		}

		@Override
		public void entryUpdated(EntryEvent<BalanceAction, String> event) {
		}

		@Override
		public void entryEvicted(EntryEvent<BalanceAction, String> event) {
		}
	}

	/**
	 * This inner class will perform the business logic associated with receiving files: what to do on failures, bad CRC,
	 * file received OK...
	 */
	private class FileReceiverCallback implements HttpFileExchanger.ReceiveFileCallback {

		@Override
		public void onProgress(String tablespace, Integer partition, Long version, File file,
		    long totalSize, long sizeDownloaded) {

			if(file.getName().endsWith(".db")) {
				getProgressFromLocalPanel(tablespace, partition, version).progressBinaryFile(totalSize,
				    sizeDownloaded);
			}
		}

		@Override
		public void onFileReceived(String tablespace, Integer partition, Long version, File file) {
			BalanceFileReceivingProgress progress = getProgressFromLocalPanel(tablespace, partition, version);
			if(file.getName().endsWith(".meta")) {
				progress.metaFileReceived(file);
			} else if(file.getName().endsWith(".db")) {
				progress.binaryFileReceived(file);
			}

			// this can be reached simultaneously by 2 different threads so we must synchronize it
			// (the thread that downloaded the .meta file and the thread that downloaded the .db file)
			synchronized(FileReceiverCallback.this) {
				if(progress.isReceivedMetaFile() && progress.isReceivedBinaryFile()) {
					// This ensures that the move will only be done once
					if(new File(progress.getMetaFile()).exists() && new File(progress.getBinaryFile()).exists()) {
						// check if we already have the binary & meta -> then move partition
						// and then remove this action from the panel so that it's completed.
						try {
							// we need to remove existing files if they exist;
							// they might be stale from old deployments
							File meta = getLocalMetadataFile(tablespace, partition, version);
							if(meta.exists()) {
								meta.delete();
							}
							FileUtils.moveFile(new File(progress.getMetaFile()), meta);
							File binaryToMove = new File(progress.getBinaryFile());
							File binary = new File(getLocalStorageFolder(tablespace, partition, version),
							    binaryToMove.getName());
							if(binary.exists()) {
								binary.delete();
							}
							FileUtils.moveToDirectory(binaryToMove,
							    getLocalStorageFolder(tablespace, partition, version), true);
							log.info("Balance action successfully completed, received both .db and .meta files ("
							    + tablespace + ", " + partition + ", " + version + ")");
							// Publish new changes to HZ
							dnodesRegistry.changeInfo(new DNodeInfo(config));
						} catch(IOException e) {
							log.error(e);
						} finally {
							removeBalanceActionFromHZPanel(tablespace, partition, version);
						}
					}
				}
			}
		}

		@Override
		public void onBadCRC(String tablespace, Integer partition, Long version, File file) {
			removeBalanceActionFromHZPanel(tablespace, partition, version);
		}

		@Override
		public void onError(Throwable t, String tablespace, Integer partition, Long version, File file) {
			removeBalanceActionFromHZPanel(tablespace, partition, version);
		}

		// --- Helper methods --- //

		/**
		 * Removes the BalanceAction associated with this file transfer from the Hazelcast data structure.
		 */
		private synchronized void removeBalanceActionFromHZPanel(String tablespace, Integer partition,
		    Long version) {
			// first remove the local tracking of this action
			String lookupKey = tablespace + "_" + partition + "_" + version;
			if(balanceActionsStateMap.containsKey(lookupKey)) {
				balanceActionsStateMap.remove(lookupKey);
				// then remove from HZ
				BalanceAction actionToRemove = null;
				for(Map.Entry<BalanceAction, String> actionEntry : coord.getDNodeReplicaBalanceActionsSet()
				    .entrySet()) {
					BalanceAction action = actionEntry.getKey();
					if(action.getTablespace().equals(tablespace) && action.getPartition() == partition
					    && action.getVersion() == version && action.getFinalNode().equals(httpExchanger.address())) {
						actionToRemove = action;
					}
				}
				if(actionToRemove == null) {
					// no need to worry - another thread may have already removed it almost simultaneously
				} else {
					coord.getDNodeReplicaBalanceActionsSet().remove(actionToRemove);
					log.info("Removed balance action [" + actionToRemove + "] from HZ panel.");
				}
			}
		}

		/**
		 * Obtains the progress bean for this transfer from the local map, creating and registering it first if absent.
		 */
		private synchronized BalanceFileReceivingProgress getProgressFromLocalPanel(String tablespace,
		    Integer partition, Long version) {
			String lookupKey = tablespace + "_" + partition + "_" + version;
			BalanceFileReceivingProgress progress = balanceActionsStateMap.get(lookupKey);
			if(progress == null) {
				progress = new BalanceFileReceivingProgress(tablespace, partition, version);
				balanceActionsStateMap.put(lookupKey, progress);
			}
			return progress;
		}
	}

	/**
	 * Initialization logic: create caches and thread pools, connect to the Hazelcast cluster, start the HTTP file
	 * exchanger, etc.
	 * 
	 * @see com.splout.db.dnode.IDNodeHandler#init(com.splout.db.common.SploutConfiguration)
	 */
	public void init(SploutConfiguration config) throws Exception {
		this.config = config;
		long evictionSeconds = config.getLong(DNodeProperties.EH_CACHE_SECONDS);
		maxResultsPerQuery = config.getInt(DNodeProperties.MAX_RESULTS_PER_QUERY);
		int maxCachePools = config.getInt(DNodeProperties.EH_CACHE_N_ELEMENTS);
		absoluteSlowQueryLimit = config.getLong(DNodeProperties.SLOW_QUERY_ABSOLUTE_LIMIT);
    deployParallelism = config.getInt(DNodeProperties.DEPLOY_PARALLELISM);
		factory = new ManagerFactory();
		factory.init(config);
		// We create a Cache for holding SQL connection pools to different tablespace versions
		// http://stackoverflow.com/questions/2583429/how-to-differentiate-between-time-to-live-and-time-to-idle-in-ehcache
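		// Cache(name, maxElementsInMemory, overflowToDisk, eternal, timeToLiveSeconds, timeToIdleSeconds):
		// the TTL below is effectively infinite, so entries expire only after 'evictionSeconds' of idle time.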
		dbCache = new Cache("dbCache", maxCachePools, false, false, Integer.MAX_VALUE, evictionSeconds);
		dbCache.initialise();
		if(fetcher == null) {
			// The Fetcher in charge of downloading new deployments
			this.fetcher = new Fetcher(config);
		}
		// When a tablespace version is expired, the connection pool is closed by an expiration handler
		dbCache.getCacheEventNotificationService().registerListener(new CacheListener());
		// The thread that will execute deployments asynchronously
		deployThread = Executors.newFixedThreadPool(1);
		// A thread that will listen to file exchanges through HTTP
		httpExchanger = new HttpFileExchanger(config, new FileReceiverCallback());
		httpExchanger.init();
		httpExchanger.start();
		// Connect to the cluster.
		hz = Hazelcast.newHazelcastInstance(HazelcastConfigBuilder.build(config));
		coord = new CoordinationStructures(hz);
		coord.getDNodeReplicaBalanceActionsSet().addEntryListener(new BalanceActionItemListener(), false);
		// Add shutdown hook
		Runtime.getRuntime().addShutdownHook(new Thread() {
			@Override
			public void run() {
				try {
					log.info("Shutdown hook called - trying to gently stop DNodeHandler " + whoAmI() + " ...");
					DNodeHandler.this.stop();
				} catch(Throwable e) {
					log.error("Error in ShutdownHook", e);
				}
			}
		});
		upSince = System.currentTimeMillis();
	}

	/**
	 * Registers the DNode in the cluster, giving the green light for it to be used.
	 */
	@Override
	public void giveGreenLigth() {
		int minutesToCheckRegister = config.getInt(HazelcastProperties.MAX_TIME_TO_CHECK_REGISTRATION, 5);
		int oldestMembersLeading = config.getInt(HazelcastProperties.OLDEST_MEMBERS_LEADING_COUNT, 3);

		dnodesRegistry = new DistributedRegistry(CoordinationStructures.DNODES, new DNodeInfo(config), hz,
		    minutesToCheckRegister, oldestMembersLeading);
		dnodesRegistry.register();
	}

	/**
	 * Deletes the files and folders kept by the DNode for a particular tablespace and version.
	 */
	private void deleteLocalVersion(com.splout.db.thrift.TablespaceVersion version) throws IOException {
		File dataFolder = new File(config.getString(DNodeProperties.DATA_FOLDER));
		File tablespaceFolder = new File(dataFolder, version.getTablespace());
		File versionFolder = new File(tablespaceFolder, version.getVersion() + "");
		if(versionFolder.exists()) {
			File[] partitions = versionFolder.listFiles();
			if(partitions != null) {
				for(File partition : partitions) {
					if(partition.isDirectory()) {
						// remove references to the engine in EHCache
						// so that space in disk is immediately available
						String dbKey = version.getTablespace() + "_" + version.getVersion() + "_"
						    + partition.getName();
						synchronized(dbCache) {
							if(dbCache.get(dbKey) != null) {
								dbCache.remove(dbKey);
								log.info("-- Removing references from ECache: " + dbKey);
							}
						}
					}
				}
			}
			FileUtils.deleteDirectory(versionFolder);
			log.info("-- Successfully removed " + versionFolder);
		} else {
			// Could happen; nothing to worry about
		}
	}

	/**
	 * This method will be called either before publishing a new tablespace after a deploy or when a query is issued
	 * to a tablespace/version which is not "warmed" (e.g. after Splout restart, or after long inactivity).
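	 * <p>
	 * Cache entries are keyed by {@code tablespace + "_" + version + "_" + partition}, the same key that
	 * {@link #sqlQuery(String, long, int, String)} uses to look the manager up again.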
	 */
	private Element loadManagerInEHCache(String tablespace, long version, int partition, File dbFolder, PartitionMetadata partitionMetadata) throws DNodeException {
		try {
			// Create a new EHCache element whose value is an {@link EngineManager}
			EngineManager manager = factory.getManagerIn(dbFolder, partitionMetadata);
			String dbKey = tablespace + "_" + version + "_" + partition;
			Element dbPoolInCache = new Element(dbKey, manager);
			dbCache.put(dbPoolInCache);
			return dbPoolInCache;
		} catch(Exception e) {
			log.error(e);
			e.printStackTrace();
			throw new DNodeException(EXCEPTION_ORDINARY,
			    "Error (" + e.getMessage() + ") instantiating a manager for a data partition");
		}	
	}
	
	/**
	 * Thrift RPC method -> Given a tablespace and a version, execute the SQL query
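	 * <p>
	 * A usage sketch (illustrative values only; results are capped at {@code maxResultsPerQuery}):
	 * 
	 * <pre>
	 * {@code
	 * String result = handler.sqlQuery("pagecounts", 1355, 0, "SELECT * FROM pagecounts LIMIT 10;");
	 * }
	 * </pre>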
	 */
	@Override
	public String sqlQuery(String tablespace, long version, int partition, String query)
	    throws DNodeException {

		try {
			try {
				performanceTool.startQuery();
				// Look up the EngineManager for this tablespace/version/partition in the EHCache
				String dbKey = tablespace + "_" + version + "_" + partition;
				Element dbPoolInCache = null;
				synchronized(dbCache) {
					dbPoolInCache = dbCache.get(dbKey);
					if(dbPoolInCache == null) {
						File dbFolder = getLocalStorageFolder(tablespace, partition, version);
						if(!dbFolder.exists()) {
							log.warn("Asked for " + dbFolder + " but it doesn't exist!");
							throw new DNodeException(EXCEPTION_ORDINARY, "Requested tablespace (" + tablespace
							    + ") + version (" + version + ") is not available.");
						}
						File metadata = getLocalMetadataFile(tablespace, partition, version);
						ThriftReader reader = new ThriftReader(metadata);
						PartitionMetadata partitionMetadata = (PartitionMetadata) reader
						    .read(new PartitionMetadata());
						reader.close();
						dbPoolInCache = loadManagerInEHCache(tablespace, version, partition, dbFolder, partitionMetadata);
					}
				}
				// Query the {@link EngineManager} and return
				String result = ((EngineManager) dbPoolInCache.getObjectValue())
				    .query(query, maxResultsPerQuery);
				long time = performanceTool.endQuery();
				log.info("serving query [" + tablespace + "]" + " [" + version + "] [" + partition + "] ["
				    + query + "] time [" + time + "] OK.");
				// double prob = performanceTool.getHistogram().getLeftAccumulatedProbability(time);
				// if(prob > 0.95) {
				// // slow query!
				// log.warn("[SLOW QUERY] Query time over 95 percentil: [" + query + "] time [" + time + "]");
				// slowQueries++;
				// }
				if(time > absoluteSlowQueryLimit) {
					// slow query!
					log.warn("[SLOW QUERY] Query time over absolute slow query time (" + absoluteSlowQueryLimit
					    + ") : [" + query + "] time [" + time + "]");
					slowQueries++;
				}
				return result;
			} catch(Throwable e) {
				unexpectedException(e);
				throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
			}
		} catch(DNodeException e) {
			log.info("serving query [" + tablespace + "]" + " [" + version + "] [" + partition + "] [" + query
			    + "] FAILED [" + e.getMsg() + "]");
			failedQueries.incrementAndGet();
			throw e;
		}
	}

	private void abortDeploy(long version, String errorMessage) {
		ConcurrentMap<String, String> panel = coord.getDeployErrorPanel(version);
		panel.put(whoAmI(), errorMessage);
		deployInProgress.decrementAndGet();
	}

	/**
	 * Thrift RPC method -> Given a list of {@link DeployAction}s and a version identifying the deployment, perform an
	 * asynchronous deploy.
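	 * <p>
	 * Coordination with the deployer happens through Hazelcast: failures are reported by writing into the deploy
	 * error panel for this version (see {@code abortDeploy(long, String)}) and completion is signaled by counting
	 * down the deploy's {@link ICountDownLatch}.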
	 */
	@Override
	public String deploy(final List<DeployAction> deployActions, final long version) throws DNodeException {
		try {
			synchronized(deployLock) {
				/*
				 * Here we instantiate a Thread for waiting for the deploy so that we are able to implement a deploy timeout. If
				 * the deploy takes too long, we cancel it. We achieve this by using Java asynchronous Future objects.
				 */
				Thread deployWait = new Thread() {
					public void run() {
						Future<?> future = deployThread.submit(new Runnable() {
							// This code is executed by the single deploy thread, not the one that waits
							@Override
							public void run() {
								try {
									deployInProgress.incrementAndGet();
									lastDeployTimedout.set(false);
									log.info("Starting deploy actions [" + deployActions + "]");
									long start = System.currentTimeMillis();
									long totalSize = 0;

									// Ask for the total size of the deployment first.
									for(DeployAction action : deployActions) {
										long plusSize = fetcher.sizeOf(action.getDataURI());
										if(plusSize == Fetcher.SIZE_UNKNOWN) {
											totalSize = Fetcher.SIZE_UNKNOWN;
											break;
										}
										totalSize += plusSize;
									}

									final double totalKnownSize = totalSize / (1024d * 1024d);
									final long startTime = System.currentTimeMillis();
									final AtomicLong bytesSoFar = new AtomicLong(0L);

									final Fetcher.Reporter reporter = new Fetcher.Reporter() {
										@Override
										public void progress(long consumed) {
											long now = System.currentTimeMillis();
											double totalSoFar = bytesSoFar.addAndGet(consumed) / (1024d * 1024d);
											double secondsSoFar = (now - startTime) / 1000d;
											double mBytesPerSec = totalSoFar / secondsSoFar;

											String msg = "[" + whoAmI() + " progress/speed report]: Fetched [";
											if(totalSoFar > 999) {
												msg += String.format("%.3f", (totalSoFar / 1024d)) + "] GBs so far ";
											} else {
												msg += String.format("%.3f", totalSoFar) + "] MBs so far ";
											}

											if(totalKnownSize != Fetcher.SIZE_UNKNOWN) {
												msg += "(out of [";
												if(totalKnownSize > 999) {
													msg += String.format("%.3f", (totalKnownSize / 1024d)) + "] GBs) ";
												} else {
													msg += String.format("%.3f", totalKnownSize) + "] MBs) ";
												}
											}
											msg += "- Current deployment speed is [" + String.format("%.3f", mBytesPerSec)
											    + "] MB/s.";
											// Add a report of the estimated remaining time if we can
											if(totalKnownSize != Fetcher.SIZE_UNKNOWN) {
												double missingSize = (totalKnownSize - totalSoFar);
												long remainingSecs = (long) (missingSize / mBytesPerSec);
												String timeRemaining = "";
												if(remainingSecs > 3600) { // hours, minutes and secs
													int hours = (int) (remainingSecs / 3600);
													int restOfSeconds = (int) (remainingSecs % 3600);
													timeRemaining = hours + " hours and " + (int) (restOfSeconds / 60)
													    + " minutes and " + (restOfSeconds % 60) + " seconds";
												} else if(remainingSecs > 60) { // minutes and secs
													timeRemaining = (int) (remainingSecs / 60) + " minutes and "
													    + (remainingSecs % 60) + " seconds";
												} else { // secs
													timeRemaining = remainingSecs + " seconds";
												}
												msg += " Estimated remaining time is [" + timeRemaining + "].";
											}
											log.info(msg);
											coord.logDeploySpeed(version, whoAmI(), msg);
										}
									};

                  // Parallel execution of deploy actions
                  ExecutorService executor = Executors.newFixedThreadPool(deployParallelism);
                  ExecutorCompletionService<Boolean> ecs = new ExecutorCompletionService<Boolean>(executor);
                  ArrayList<Future<Boolean>> deployFutures = new ArrayList<Future<Boolean>>();

									for(final DeployAction action : deployActions) {
                    deployFutures.add(ecs.submit(new Callable<Boolean>() {
                      @Override
                      public Boolean call() throws Exception {
                        // Downloads data and updates some structs
                        runDeployAction(reporter, action, version);
                        return true;
                      }
                    }));
                  }

                  // Wait for all tasks to finish.
                  for (int i = 0; i < deployActions.size(); i++) {
                    // Get will throw an exception if the callable returned it.
                    try {
                      ecs.take().get();
                    } catch (ExecutionException e) {
                      // One job failed. Stop the rest.
                      for (Future<Boolean> task : deployFutures) {
                        task.cancel(true);
                      }
                      executor.shutdown();
                      throw e.getCause();
                    } catch (InterruptedException e) {
                      // Somebody interrupted the deployment thread. Stop the
                      // rest of the tasks.
                      for (Future<Boolean> task : deployFutures) {
                        task.cancel(true);
                      }
                      executor.shutdown();
                      throw e;
                    }
                  }

                  executor.shutdown();

                  // Publish new DNodeInfo in distributed registry.
									// This makes QNodes notice that a new version is available...
									// PartitionMap and ReplicationMap will be built incrementally as DNodes finish.
									dnodesRegistry.changeInfo(new DNodeInfo(config));

									long end = System.currentTimeMillis();
									log.info("Local deploy actions [" + deployActions + "] successfully finished in "
									    + (end - start) + " ms.");
									deployInProgress.decrementAndGet();
								} catch(Throwable t) {
									// In order to avoid stale deployments, we flag this deploy to be aborted
									log.warn("Error deploying [" + deployActions + "] barrier + [" + version + "]", t);
									abortDeploy(version, ExceptionUtils.getStackTrace(t));
								} finally {
									// Decrement the countdown latch. On 0, deployer knows that the deploy
									// finished.
									ICountDownLatch countdown = coord.getCountDownLatchForDeploy(version);
									countdown.countDown();
								}
							}
						});
						try {
							// This line makes the wait thread wait for the deploy for as long as the configuration specifies.
							// If the timeout passes, a TimeoutException is thrown.
							future.get(config.getInt(DNodeProperties.DEPLOY_TIMEOUT_SECONDS), TimeUnit.SECONDS);
						} catch(InterruptedException e) {
							log.warn("Interrupted exception waiting for local deploy to finish - killing deployment",
							    e);
							abortDeploy(version, ExceptionUtils.getStackTrace(e));
							future.cancel(true);
						} catch(ExecutionException e) {
							log.warn("Execution exception waiting for local deploy to finish - killing deployment.", e);
							abortDeploy(version, ExceptionUtils.getStackTrace(e));
						} catch(TimeoutException e) {
							log.warn("Timeout waiting for local deploy to finish - killing deployment.", e);
							abortDeploy(version,
							    "Timeout reached - " + config.getInt(DNodeProperties.DEPLOY_TIMEOUT_SECONDS)
							        + " seconds");
							future.cancel(true);
							lastDeployTimedout.set(true);
						}
					}
				};
				deployWait.start();
			}
			// Everything is asynchronous so this is quickly reached - it just means the process has started
			return JSONSerDe.ser(new DNodeStatusResponse("Ok. Deploy initiated"));
		} catch(Throwable t) {
			unexpectedException(t);
			throw new DNodeException(EXCEPTION_UNEXPECTED, t.getMessage());
		}
	}

  /**
   * Runs a deploy action: downloads the data files and warms up the partition.
   */
  private void runDeployAction(Fetcher.Reporter reporter, DeployAction action, long version) throws IOException, URISyntaxException, DNodeException {
    // 1- Store metadata
    File metadataFile = getLocalMetadataFile(action.getTablespace(),
        action.getPartition(), version);
    if(!metadataFile.getParentFile().exists()) {
      metadataFile.getParentFile().mkdirs();
    }
    ThriftWriter writer = new ThriftWriter(metadataFile);
    writer.write(action.getMetadata());
    writer.close();
    // 2- Call the fetcher for fetching
    File fetchedContent = fetcher.fetch(action.getDataURI(), reporter);
    // If we reach this point then the fetch has been OK
    File dbFolder = getLocalStorageFolder(action.getTablespace(), action.getPartition(),
        version);
    // 3- Remove any stale folder: if the target folder already exists it is left
    // over from a previous failed deploy, so it is safe to delete it
    if(dbFolder.exists()) {
      FileUtils.deleteDirectory(dbFolder);
    }
    // 4- Perform a "mv" for finally making the data available
    FileUtils.moveDirectory(fetchedContent, dbFolder);
    // 5- Preemptively load the Manager in case initialization is slow
    // Managers might warm up for a while (e.g. loading data into memory)
    loadManagerInEHCache(action.getTablespace(), action.getVersion(), action.getPartition(), dbFolder, action.getMetadata());
  }

  /**
	 * Thrift RPC method -> Given a list of {@link RollbackAction}s, perform a synchronous rollback
	 */
	@Override
	public String rollback(List<RollbackAction> rollbackActions, String ignoreMe) throws DNodeException {
		// The DNode doesn't need to do anything special for rolling back a version.
		// It can serve any version that is stored locally.
		try {
			return JSONSerDe.ser(new DNodeStatusResponse("Ok. Rollback order received."));
		} catch(JSONSerDeException e) {
			unexpectedException(e);
			throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
		}
	}

	/*
	 * Any unexpected exception must be redirected to this method. In this way we can monitor it using state variables.
	 * The state variables will then be passed onto the appropriate bean in the status() RPC call.
	 */
	private void unexpectedException(Throwable t) {
		t.printStackTrace();
		log.error("Unexpected Exception", t);
		lastException = t.getMessage();
		lastExceptionTime = System.currentTimeMillis();
	}

	/**
	 * Returns an {@link com.splout.db.dnode.beans.DNodeSystemStatus} filled with the appropriate data.
	 */
	@Override
	public String status() throws DNodeException {
		try {
			DNodeSystemStatus status = new DNodeSystemStatus();
			if(lastException == null) {
				status.setSystemStatus("UP");
				status.setLastExceptionTime(-1);
			} else {
				status.setSystemStatus("Last exception: " + lastException);
				status.setLastExceptionTime(lastExceptionTime);
			}
			status.setUpSince(upSince);
			status.setFailedQueries(failedQueries.get());
			status.setnQueries(performanceTool.getNQueries());
			status.setAverage(performanceTool.getAverage());
			status.setSlowQueries(slowQueries);
			status.setDeployInProgress(deployInProgress.get() > 0);
			status.setHttpExchangerAddress(httpExchangerAddress());
			status.setBalanceActionsStateMap(balanceActionsStateMap);
			File folder = new File(config.getString(DNodeProperties.DATA_FOLDER));
			if(folder.exists()) {
				status.setFreeSpaceInDisk(FileSystemUtils.freeSpaceKb(folder.toString()));
				status.setOccupiedSpaceInDisk(FileUtils.sizeOfDirectory(folder));
				Collection<File> files = FileUtils.listFilesAndDirs(folder, TrueFileFilter.INSTANCE,
				    TrueFileFilter.INSTANCE);
				status.setFiles(new ArrayList<String>(Lists.transform(Lists.newArrayList(files),
				    new Function<File, String>() {
					    @Override
					    public String apply(File file) {
						    return file.getAbsolutePath() + " (" + FileUtils.sizeOf(file) + " bytes)";
					    }
				    })));
				Collections.sort(status.getFiles());
			} else {
				status.setOccupiedSpaceInDisk(0);
				status.setFreeSpaceInDisk(FileSystemUtils.freeSpaceKb("."));
				status.setFiles(new ArrayList<String>());
			}
			return JSONSerDe.ser(status);
		} catch(Throwable t) {
			unexpectedException(t);
			throw new DNodeException(EXCEPTION_UNEXPECTED, t.getMessage());
		}
	}

	protected File getLocalStorageFolder(String tablespace, int partition, long version) {
		return getLocalStorageFolder(config, tablespace, partition, version);
	}

	/**
	 * Returns the folder where the DNode that uses the provided Configuration will store the binary data for this
	 * tablespace, version and partition.
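	 * <p>
	 * For example (illustrative values), with data folder {@code /opt/splout/data}, tablespace {@code t1},
	 * version {@code 5} and partition {@code 0}, the returned folder is {@code /opt/splout/data/t1/5/0}.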
	 */
	public static File getLocalStorageFolder(SploutConfiguration config, String tablespace, int partition,
	    long version) {
		String dataFolder = config.getString(DNodeProperties.DATA_FOLDER);
		return new File(dataFolder + "/"
		    + getLocalStoragePartitionRelativePath(tablespace, partition, version));
	}

	public static String getLocalStoragePartitionRelativePath(String tablespace, int partition,
	    long version) {
		return tablespace + "/" + version + "/" + partition;
	}

	protected File getLocalMetadataFile(String tablespace, int partition, long version) {
		return getLocalMetadataFile(config, tablespace, partition, version);
	}

	/**
	 * Returns the file where the DNode that uses the provided Configuration will store the metadata for this tablespace,
	 * version and partition.
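	 * <p>
	 * Following the same illustrative example, the metadata file for tablespace {@code t1}, version {@code 5} and
	 * partition {@code 0} would be {@code /opt/splout/data/t1/5/0.meta}.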
	 */
	public static File getLocalMetadataFile(SploutConfiguration config, String tablespace, int partition,
	    long version) {
		String dataFolder = config.getString(DNodeProperties.DATA_FOLDER);
		return new File(dataFolder + "/" + getLocalMetadataFileRelativePath(tablespace, partition, version));
	}

	public static String getLocalMetadataFileRelativePath(String tablespace, int partition, long version) {
		return tablespace + "/" + version + "/" + partition + ".meta";
	}

	public boolean isDeployInProgress() {
		return deployInProgress.get() > 0;
	}

	/**
	 * Properly dispose this DNode.
	 */
	public void stop() throws Exception {
		dbCache.dispose();
		deployThread.shutdownNow();
		factory.close();
		httpExchanger.close();
		hz.getLifecycleService().shutdown();
	}

	@Override
	public String abortDeploy(long version) throws DNodeException {
		// For simplicity, current implementation cancels all queued deploys.
		// There can only be one deploy being handled at a time, but multiple may have been queued.
		try {
			if(isDeployInProgress()) {
				synchronized(deployLock) { // No new deploys to be handled until we cancel the current one
					// Note that it is not always guaranteed that threads will be properly shut down...
					deployThread.shutdownNow();
					while(!deployThread.isTerminated()) {
						try {
							Thread.sleep(100);
						} catch(InterruptedException e) {
							log.error("Deploy Thread interrupted - continuing anyway.", e);
						}
					}
					deployThread = Executors.newFixedThreadPool(1);
				}
				return JSONSerDe.ser(new DNodeStatusResponse("Ok. Deploy cancelled."));
			} else {
				return JSONSerDe.ser(new DNodeStatusResponse("No deploy in progress."));
			}
		} catch(JSONSerDeException e) {
			unexpectedException(e);
			throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
		}
	}

	@Override
	public String deleteOldVersions(List<com.splout.db.thrift.TablespaceVersion> versions)
	    throws DNodeException {
		for(com.splout.db.thrift.TablespaceVersion version : versions) {
			log.info("Going to remove " + version + " as I have been told to do so.");
			try {
				deleteLocalVersion(version);
			} catch(Throwable e) {
				unexpectedException(e);
				throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
			}
		}
		try {
			// Publish new DNodeInfo in distributed registry.
			// This makes QNodes notice that a new version is available...
			// PartitionMap and ReplicationMap will be built incrementally as DNodes finish.
			dnodesRegistry.changeInfo(new DNodeInfo(config));
			return JSONSerDe.ser(new DNodeStatusResponse("Ok. Delete old versions executed."));
		} catch(JSONSerDeException e) {
			unexpectedException(e);
			throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
		}
	}

	// ----------------- TEST API ----------------- //

	private AtomicBoolean shutDownByTestAPI = new AtomicBoolean(false);

	/*
	 * This method is called by unit / integration tests in order to simulate failures and recoveries in DNodes and such.
	 */
	@Override
	public String testCommand(String commandStr) throws DNodeException {
		if(!config.getBoolean(DNodeProperties.HANDLE_TEST_COMMANDS)) {
			throw new DNodeException(EXCEPTION_ORDINARY, "Can't handle test commands as "
			    + DNodeProperties.HANDLE_TEST_COMMANDS + " is not set to true.");
		}
		TestCommands command;
		try {
			command = TestCommands.valueOf(commandStr);
		} catch(IllegalArgumentException e) {
			// valueOf() throws IllegalArgumentException (it never returns null) for unknown commands
			throw new DNodeException(EXCEPTION_ORDINARY, "Unknown test command: " + commandStr);
		}
		if(command.equals(TestCommands.SHUTDOWN)) {
			// on-demand shutdown
			// This is a "soft-shutdown" so we can recover from it.
			// It is designed for unit and integration testing.
			shutDownByTestAPI.set(true);
			dnodesRegistry.unregister();
			log.info("Received a shutdown by test API.");
			hz.getLifecycleService().shutdown();
		} else if(command.equals(TestCommands.RESTART)) {
			// on-demand restart
			// This is a "soft-restart" after a "soft-shutdown".
			// It is designed for unit and integration testing.
			shutDownByTestAPI.set(false);
			try {
				hz = Hazelcast.newHazelcastInstance(HazelcastConfigBuilder.build(config));
				coord = new CoordinationStructures(hz);
				log.info("Received a restart by test API.");
				giveGreenLigth();
			} catch(HazelcastConfigBuilderException e) {
				log.error("Error while trying to connect to Hazelcast", e);
				throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
			}
		}
		try {
			return JSONSerDe.ser(new DNodeStatusResponse("Ok. Test command " + commandStr
			    + " received properly."));
		} catch(JSONSerDeException e) {
			unexpectedException(e);
			throw new DNodeException(EXCEPTION_UNEXPECTED, e.getMessage());
		}
	}

	// --- Getters mainly for testing --- //

	public CoordinationStructures getCoord() {
		return coord;
	}

	public DistributedRegistry getDnodesRegistry() {
		return dnodesRegistry;
	}
}