com.splout.db.qnode.QNodeHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of splout-server Show documentation
Show all versions of splout-server Show documentation
Splout SQL is a read only, horizontally scalable and
partitioned SQL database that plays well with Hadoop.
The newest version!
package com.splout.db.qnode;
/*
* #%L
* Splout SQL Server
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
import com.google.common.base.Joiner;
import com.hazelcast.core.*;
import com.splout.db.common.JSONSerDe;
import com.splout.db.common.JSONSerDe.JSONSerDeException;
import com.splout.db.common.SploutConfiguration;
import com.splout.db.common.Tablespace;
import com.splout.db.dnode.beans.DNodeSystemStatus;
import com.splout.db.hazelcast.*;
import com.splout.db.qnode.Deployer.UnexistingVersion;
import com.splout.db.qnode.QNodeHandlerContext.DNodeEvent;
import com.splout.db.qnode.QNodeHandlerContext.TablespaceVersionInfoException;
import com.splout.db.qnode.Querier.QuerierException;
import com.splout.db.qnode.beans.*;
import com.splout.db.thrift.DNodeService;
import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.Counter;
import com.yammer.metrics.core.Histogram;
import com.yammer.metrics.core.Meter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.thrift.transport.TTransportException;
import org.codehaus.jackson.type.TypeReference;
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.TimeUnit;
/**
* Implements the business logic for the {@link QNode}.
*
* The QNode is the most complex and delicate part of Splout. Among its responsibilities are:
*
* - Handling deploys asynchronously: One QNode will lead a deployment. It will put a flag in ZooKeeper and trigger an
* asynchronous deploy to all involved DNodes. Then, it has to finalize the deploy properly when all DNodes are ready.
* This is handled by the {@link Deployer} module.
* - Performing queries and multiqueries. This is handled by the {@link Querier} module.
* - Handling rollbacks: Rollbacks are easy since we just need to change the version in ZooKeeper (DNodes already have
* the data for past version in disk). The number of versions that are saved in the system per tablespace can be
* configured (see {@link QNodeProperties}).
*
* For convenience, there is some in-memory state grabbed from ZooKeeper in {@link QNodeHandlerContext}. This state is
* passed through all modules ({@link Deployer} and such). Care has to be taken to have consistent in-memory state, for
* that it is important to handle ZooKeeper events properly and be notified always on the paths that we are interested
* in.
*
* One of the important business logic parts of this class is to synchronize the versions in ZooKeeper. Because we only
* want to keep a certain amount of versions, the QNodes have to check for this and remove stalled versions if needed.
* Then, DNodes will receive a notification and they will be able to delete the old data from disk.
*
* The QNode returns JSON strings for all of its methods. The beans that are serialized are indicated in the
* documentation.
*/
public class QNodeHandler implements IQNodeHandler {
/**
* The JSON type reference for deserializing Multi-query results
*/
public final static TypeReference> MULTIQUERY_TYPE_REF = new TypeReference>() {
};
private final static Log log = LogFactory.getLog(QNodeHandler.class);
private QNodeHandlerContext context;
private Deployer deployer;
private Querier querier;
private SploutConfiguration config;
private CoordinationStructures coord;
private Thread warmingThread;
private final Counter meterQueriesServed = Metrics.newCounter(QNodeHandler.class, "queries-served");
private final Meter meterRequestsPerSecond = Metrics.newMeter(QNodeHandler.class, "queries-second",
"queries-second", TimeUnit.SECONDS);
private final Histogram meterResultSize = Metrics.newHistogram(QNodeHandler.class, "response-size");
/**
 * Keeps track of DNodes joining, leaving and updating their information. It is registered on the DNodes map in
 * {@code initDNodesTracking()} and mirrors every event into the {@link QNodeHandlerContext}.
 */
// NOTE(review): the <String, DNodeInfo> type arguments are inferred from how the events are used below
// (getValue().getAddress()) - confirm against CoordinationStructures.getDNodes().
public class DNodesListener implements EntryListener<String, DNodeInfo> {
  /**
   * A DNode joined: create its Thrift client cache, register its tablespace versions and maybe rebalance.
   */
  @Override
  public void entryAdded(EntryEvent<String, DNodeInfo> event) {
    log.info("DNode [" + event.getValue() + "] joins the cluster as ready to server requests.");
    // Update TablespaceVersions
    try {
      String dnode = event.getValue().getAddress();
      log.info(Thread.currentThread().getName() + " : populating client queue for [" + dnode
          + "] as it connected.");
      context.initializeThriftClientCacheFor(dnode);
      context.updateTablespaceVersions(event.getValue(), QNodeHandlerContext.DNodeEvent.ENTRY);
      log.info(Thread.currentThread() + ": Maybe balance (entryAdded)");
      context.maybeBalance();
    } catch (TablespaceVersionInfoException e) {
      throw new RuntimeException(e);
    } catch (TTransportException e) {
      throw new RuntimeException(e);
    } catch (InterruptedException e) {
      // Keep the interrupt status visible to the calling thread before wrapping the exception.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }
  }

  /**
   * A DNode left: discard its Thrift client cache, unregister its tablespace versions and maybe rebalance.
   */
  @Override
  public void entryRemoved(EntryEvent<String, DNodeInfo> event) {
    log.info("DNode [" + event.getValue() + "] left.");
    // Update TablespaceVersions
    try {
      context.discardThriftClientCacheFor(event.getValue().getAddress());
      context.updateTablespaceVersions(event.getValue(), QNodeHandlerContext.DNodeEvent.LEAVE);
      log.info(Thread.currentThread() + ": Maybe balance (entryRemoved)");
      context.maybeBalance();
    } catch (TablespaceVersionInfoException e) {
      throw new RuntimeException(e);
    } catch (InterruptedException e) {
      // Keep the interrupt status visible to the calling thread before wrapping the exception.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }
  }

  /**
   * A DNode changed its information: only the tablespace versions are refreshed (no rebalance is attempted).
   */
  @Override
  public void entryUpdated(EntryEvent<String, DNodeInfo> event) {
    // Update TablespaceVersions
    try {
      context.updateTablespaceVersions(event.getValue(), QNodeHandlerContext.DNodeEvent.UPDATE);
    } catch (TablespaceVersionInfoException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Not expected for this map; the event is only logged as an error.
   */
  @Override
  public void entryEvicted(EntryEvent<String, DNodeInfo> event) {
    // Never happens
    log.error("Event entryEvicted received for [" + event + "]. "
        + "Should have never happened... Something wrong in the code");
  }
}
public class VersionListener implements EntryListener> {
private void check(EntryEvent> event) {
if (!CoordinationStructures.KEY_FOR_VERSIONS_BEING_SERVED.equals(event.getKey())) {
throw new RuntimeException("Unexpected key " + event.getKey() + " for map "
+ CoordinationStructures.KEY_FOR_VERSIONS_BEING_SERVED);
}
}
private void processAddOrUpdate(EntryEvent> event) {
check(event);
try {
// We perform all changes together with the aim of atomicity
updateLocalTablespace(event.getValue());
// After a change in versions (deployment, rollback, delete) we must
// synchronize tablespace versions to see if we have to remove some.
context.synchronizeTablespaceVersions();
} catch (Exception e) {
log.error(
"Error changing serving tablespace [" + event.getKey() + " to version [" + event.getValue()
+ "]. Probably the system is now unstable.", e);
}
}
@Override
public void entryAdded(EntryEvent> event) {
// log.info("New versions table event received.");
processAddOrUpdate(event);
}
@Override
public void entryUpdated(EntryEvent> event) {
// log.info("Updated versions table event received.");
processAddOrUpdate(event);
}
@Override
public synchronized void entryRemoved(EntryEvent> event) {
check(event);
// TODO: make this operation atomical. ConcurrentHashMap.clear() is not.
log.info("Versions table removed!. Clearing up all tablespace versions.");
context.getCurrentVersionsMap().clear();
return;
}
@Override
public void entryEvicted(EntryEvent> event) {
throw new RuntimeException("Should never happen. Something is really wrong :O");
}
}
/**
 * Initializes this handler: joins the Hazelcast cluster, builds the coordination structures and the
 * {@link QNodeHandlerContext}, starts tracking DNodes and served versions, instantiates the {@link Deployer} and
 * {@link Querier} modules and finally starts a background thread that ends the warming period.
 *
 * @param config the configuration to initialize from
 * @throws Exception if any of the initialization steps fails
 */
public void init(final SploutConfiguration config) throws Exception {
  this.config = config;
  log.info(this + " - Initializing QNode...");
  // Connect with the cluster.
  HazelcastInstance hz = Hazelcast.newHazelcastInstance(HazelcastConfigBuilder.build(config));
  int minutesToCheckRegister = config.getInt(HazelcastProperties.MAX_TIME_TO_CHECK_REGISTRATION, 5);
  int oldestMembersLeading = config.getInt(HazelcastProperties.OLDEST_MEMBERS_LEADING_COUNT, 3);
  // We must instantiate the DistributedRegistry even if we're not a DNode to be able to receive membership leaving
  // in race conditions such as all DNodes leaving.
  new DistributedRegistry(CoordinationStructures.DNODES, null, hz, minutesToCheckRegister,
      oldestMembersLeading);
  coord = new CoordinationStructures(hz);
  context = new QNodeHandlerContext(config, coord);
  // Initialize DNodes tracking
  initDNodesTracking();
  // Initialize versions to be served tracking
  initVersionTracking();
  // Now instantiate modules
  deployer = new Deployer(context);
  querier = new Querier(context);
  log.info(Thread.currentThread() + " - Initializing QNode [DONE].");
  warmingThread = new Thread() {
    @Override
    public void run() {
      try {
        // Read the setting once so that the logged value and the slept value always agree.
        int warmingSeconds = config.getInt(QNodeProperties.WARMING_TIME);
        log.info("Currently warming up for [" + warmingSeconds
            + "] - certain actions will only be taken afterwards.");
        // Multiply as long: an int multiplication would overflow for large warming times.
        Thread.sleep(warmingSeconds * 1000L);
        log.info("Warming time ended [OK] Now the QNode will operate fully normally.");
      } catch (InterruptedException e) {
        log.error("Warming time interrupted - ");
        // Preserve the interrupt status; close() interrupts this thread on purpose.
        Thread.currentThread().interrupt();
      }
      // Whether the sleep completed or was interrupted, the warming period is over.
      context.getIsWarming().set(false);
    }
  };
  warmingThread.start();
}
/**
 * Initializes the tracking of DNodes joining and leaving the cluster: registers a {@link DNodesListener} and then
 * processes every DNode already present as if it had just entered.
 */
private void initDNodesTracking() {
  // NOTE(review): type arguments inferred from the iteration below - confirm against getDNodes().
  IMap<String, DNodeInfo> dnodes = context.getCoordinationStructures().getDNodes();
  // CAUTION: We must register the listener BEFORE reading the list
  // of dnodes. Otherwise we could have a race condition.
  dnodes.addEntryListener(new DNodesListener(), true);
  Set<String> dNodes = new HashSet<String>();
  for (DNodeInfo dnodeInfo : dnodes.values()) {
    dNodes.add(dnodeInfo.getAddress());
    try {
      context.initializeThriftClientCacheFor(dnodeInfo.getAddress());
      context.updateTablespaceVersions(dnodeInfo, DNodeEvent.ENTRY);
    } catch (TablespaceVersionInfoException e) {
      throw new RuntimeException(e);
    } catch (TTransportException e) {
      throw new RuntimeException(e);
    } catch (InterruptedException e) {
      // Keep the interrupt status visible to the caller before wrapping the exception.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }
  }
  log.info("Alive DNodes at QNode startup [" + Joiner.on(", ").skipNulls().join(dNodes) + "]");
  log.info("TablespaceVersion map at QNode startup [" + context.getTablespaceVersionsMap() + "]");
}
/**
 * Loads the tablespaces information in memory to be ready to serve them, and starts to keep track of changes in
 * the tablespace versions to be served. To be called at initialization.
 *
 * The versions persisted on disk (if a persistence folder is configured) are merged with the ones currently in
 * Hazelcast, the latter taking precedence, and the merged result is published back and applied locally.
 *
 * @throws IOException if the local state cannot be updated
 */
private void initVersionTracking() throws IOException {
  // NOTE(review): type arguments inferred from the usages in this file - confirm against CoordinationStructures.
  IMap<String, Map<String, Long>> versions = context.getCoordinationStructures()
      .getVersionsBeingServed();
  // CAUTION: We register the listener before updating the in memory versions
  // because if we do the other way around, we could lose updates to tablespace
  // versions or new tablespaces.
  VersionListener listener = new VersionListener();
  versions.addEntryListener(listener, true);
  Map<String, Long> vBeingServed = new HashMap<String, Long>();
  String persistenceFolder = config.getString(HazelcastProperties.HZ_PERSISTENCE_FOLDER);
  if (persistenceFolder != null && !persistenceFolder.equals("")) {
    TablespaceVersionStore vStore = new TablespaceVersionStore(persistenceFolder);
    Map<String, Long> vBeingServedFromDisk = vStore
        .load(CoordinationStructures.KEY_FOR_VERSIONS_BEING_SERVED);
    if (vBeingServedFromDisk == null) {
      log.info("No state about versions to be served in disk.");
    } else {
      vBeingServed = vBeingServedFromDisk;
      log.info("Loaded tablespace versions to be served from disk: " + vBeingServedFromDisk);
    }
  }
  Map<String, Long> vBeingServedFromHz = null;
  // Retry until the merged map is accepted as the replacement of the copy we read.
  do {
    vBeingServedFromHz = context.getCoordinationStructures().getCopyVersionsBeingServed();
    if (vBeingServedFromHz != null) {
      // We assume info in memory (Hazelcast) is fresher than info in disk
      vBeingServed.putAll(vBeingServedFromHz);
    }
  } while (!context.getCoordinationStructures().updateVersionsBeingServed(vBeingServedFromHz,
      vBeingServed));
  updateLocalTablespace(vBeingServed);
  log.info("Tablespaces versions after merging loaded disk state with HZ: " + vBeingServed);
}
/**
 * Merges the given tablespace versions into the in-memory current versions map and, when a persistence folder is
 * configured, stores them through a {@link TablespaceVersionStore}. A null argument is logged and ignored.
 *
 * @param tablespacesAndVersions the versions to apply; may be null
 * @throws IOException propagated from the persistence step
 */
private void updateLocalTablespace(Map tablespacesAndVersions) throws IOException {
  log.info("Update local tablespace: " + tablespacesAndVersions);
  if (tablespacesAndVersions != null) {
    // CAREFUL TODO: That is not atomic. Something should
    // be done to make that update atomic.
    context.getCurrentVersionsMap().putAll(tablespacesAndVersions);
    final String folder = config.getString(HazelcastProperties.HZ_PERSISTENCE_FOLDER);
    final boolean persistenceEnabled = folder != null && folder.length() > 0;
    if (persistenceEnabled) {
      new TablespaceVersionStore(folder).store(CoordinationStructures.KEY_FOR_VERSIONS_BEING_SERVED,
          tablespacesAndVersions);
    }
  }
}
/**
 * Runs a SQL query against a tablespace, addressed either by key or by partition (exactly one of them must be
 * given). Invalid arguments are answered with an {@link ErrorQueryStatus} instead of an exception; valid requests
 * are counted in the metrics and delegated to the {@link Querier} module.
 *
 * Returns a {@link QueryStatus}.
 *
 * @throws JSONSerDeException
 * @throws QuerierException
 */
public QueryStatus query(String tablespace, String key, String sql, String partition)
    throws JSONSerDeException, QuerierException {
  // --- argument validation ---
  if (sql == null) {
    return new ErrorQueryStatus("Null sql provided, can't query.");
  }
  if (sql.isEmpty()) {
    return new ErrorQueryStatus("Empty sql provided, can't query.");
  }
  final boolean hasKey = key != null;
  final boolean hasPartition = partition != null;
  if (!hasKey && !hasPartition) {
    return new ErrorQueryStatus(
        "Null key / partition provided, can't query. Either partition or key must not be null.");
  }
  if (hasKey && hasPartition) {
    return new ErrorQueryStatus(
        "(partition, key) parameters are mutually exclusive. Please use one or other, not both at the same time.");
  }
  // --- metrics are updated before the query is executed ---
  meterQueriesServed.inc();
  meterRequestsPerSecond.mark();
  // --- the query itself is handled by the specialized Querier module ---
  final QueryStatus status = querier.query(tablespace, key, sql, partition);
  if (status.getResult() == null) {
    return status;
  }
  meterResultSize.update(status.getResult().size());
  return status;
}
/**
 * Multi-query: use {@link Querier} for as many shards as needed and return a list of {@link QueryStatus}.
 *
 * Each (keyMins[i], keyMaxs[i]) pair selects the partitions it covers; when no pair is given, all partitions of
 * the tablespace are queried. Invalid SQL or an unknown tablespace/version is answered with a single-element list
 * holding an {@link ErrorQueryStatus}.
 *
 * @param tablespaceName the tablespace to query
 * @param keyMins lower key bounds, one per range
 * @param keyMaxs upper key bounds, one per range; must have the same size as keyMins
 * @param sql the SQL to run on every impacted partition
 * @return one {@link QueryStatus} per impacted partition, or a single error status
 * @throws RuntimeException if keyMins and keyMaxs differ in size
 */
// NOTE(review): element types (String keys, Integer partitions) are inferred from the usages below - confirm
// against IQNodeHandler and PartitionMap.findPartitions().
public ArrayList<QueryStatus> multiQuery(String tablespaceName, List<String> keyMins,
    List<String> keyMaxs, String sql) throws JSONSerDeException {
  if (sql == null) {
    return multiQueryError("Null sql provided, can't query.");
  }
  if (sql.length() < 1) {
    return multiQueryError("Empty sql provided, can't query.");
  }
  if (keyMins.size() != keyMaxs.size()) {
    // This has to be handled before! We are not going to be polite here
    throw new RuntimeException(
        "This is very likely a software bug: Inconsistent parameters received in "
            + QNodeHandler.class + " for multiQuery() : " + tablespaceName + ", " + keyMins + ","
            + keyMaxs + ", " + sql);
  }
  Long version = context.getCurrentVersionsMap().get(tablespaceName);
  if (version == null) {
    return multiQueryError("No available version for tablespace " + tablespaceName);
  }
  // TODO Object creation (new TablespaceVersion), not very efficient for performance
  Tablespace tablespace = context.getTablespaceVersionsMap().get(
      new TablespaceVersion(tablespaceName, version));
  if (tablespace == null) { // This can happen if, at startup, we only received the version and not the DNodeInfo
    return multiQueryError("No available information for tablespace version " + tablespaceName + ","
        + version);
  }
  // A set, so that a partition covered by several ranges is queried only once.
  Set<Integer> impactedKeys = new HashSet<Integer>();
  if (keyMins.isEmpty()) {
    impactedKeys.addAll(tablespace.getPartitionMap().findPartitions(null, null)); // all partitions are hit
  } else {
    for (int i = 0; i < keyMins.size(); i++) {
      impactedKeys.addAll(tablespace.getPartitionMap().findPartitions(keyMins.get(i), keyMaxs.get(i)));
    }
  }
  ArrayList<QueryStatus> toReturn = new ArrayList<QueryStatus>(impactedKeys.size());
  for (Integer shardKey : impactedKeys) {
    toReturn.add(querier.query(tablespaceName, sql, shardKey));
  }
  meterQueriesServed.inc();
  meterRequestsPerSecond.mark();
  return toReturn;
}

/**
 * Builds the single-element result list that multiQuery() returns for a request it cannot serve.
 */
private static ArrayList<QueryStatus> multiQueryError(String message) {
  ArrayList<QueryStatus> errors = new ArrayList<QueryStatus>(1);
  errors.add(new ErrorQueryStatus(message));
  return errors;
}
/**
 * Performs an asynchronous deploy for the given list of {@link DeployRequest}. The whole operation is delegated
 * to the specialized {@link Deployer} module.
 *
 * Returns a {@link DeployInfo}.
 */
public DeployInfo deploy(List deployRequest) throws Exception {
  final DeployInfo deployInfo = deployer.deploy(deployRequest);
  return deployInfo;
}
/**
 * Rollback: asks the {@link Deployer} to switch some tablespaces to a particular version.
 *
 * Returns a {@link StatusMessage}: "Done" on success, or an explanatory message when the requested version does
 * not exist.
 */
public StatusMessage rollback(List rollbackRequest) throws JSONSerDeException {
  // TODO: Coordinate with context.synchronizeTablespaceVersions() because one could being deleting some tablespace
  // when other is trying a rollback.
  try {
    deployer.switchVersions(rollbackRequest);
  } catch (UnexistingVersion unexisting) {
    return new StatusMessage(unexisting.getMessage() + ". Not possible to rollback to unexisting version.");
  }
  // TODO: Change this status message to something more programmatic
  return new StatusMessage("Done");
}
/**
 * Returns the {@link QNodeStatus} filled with the cluster size, the system status of every registered DNode and,
 * for every tablespace, the {@link Tablespace} of the version currently being served.
 *
 * @throws Exception if the status of any DNode cannot be retrieved
 */
// NOTE(review): map type arguments are inferred from the usages below - confirm against QNodeStatus setters.
public QNodeStatus overview() throws Exception {
  QNodeStatus status = new QNodeStatus();
  status.setClusterSize(coord.getHz().getCluster().getMembers().size());
  Map<String, DNodeSystemStatus> aliveDNodes = new HashMap<String, DNodeSystemStatus>();
  for (DNodeInfo dnode : context.getCoordinationStructures().getDNodes().values()) {
    // dnodeStatus() already implements the borrow / query / return-to-pool cycle for one DNode.
    aliveDNodes.put(dnode.getAddress(), dnodeStatus(dnode.getAddress()));
  }
  status.setdNodes(aliveDNodes);
  Map<String, Tablespace> tablespaceMap = new HashMap<String, Tablespace>();
  for (Map.Entry<String, Long> currentVersion : context.getCurrentVersionsMap().entrySet()) {
    Tablespace tablespace = context.getTablespaceVersionsMap().get(
        new TablespaceVersion(currentVersion.getKey(), currentVersion.getValue()));
    if (tablespace != null) { // this might happen and it is not a bug
      tablespaceMap.put(currentVersion.getKey(), tablespace);
    }
  }
  status.setTablespaceMap(tablespaceMap);
  return status;
}
/**
 * Returns the tablespace names, i.e. the key set of the current versions map held by the context.
 */
public Set tablespaces() throws Exception {
  Set tablespaceNames = context.getCurrentVersionsMap().keySet();
  return tablespaceNames;
}
/**
 * Returns all the versions known for the given tablespace.
 *
 * @param tablespace the name of the tablespace to look up
 * @return a new map from version to its {@link Tablespace}; empty if no version of that tablespace is known
 */
// NOTE(review): type arguments are inferred from the usages below - confirm against IQNodeHandler.
@Override
public Map<Long, Tablespace> allTablespaceVersions(final String tablespace) throws Exception {
  Map<Long, Tablespace> ret = new HashMap<Long, Tablespace>();
  for (Entry<TablespaceVersion, Tablespace> entry : context.getTablespaceVersionsMap().entrySet()) {
    if (entry.getKey().getTablespace().equals(tablespace)) {
      ret.put(entry.getKey().getVersion(), entry.getValue());
    }
  }
  return ret;
}
/**
 * Asks the given DNode for its status and returns it deserialized as a {@link DNodeSystemStatus}.
 *
 * The Thrift client is borrowed from the context pool and always handed back; after a transport failure it is
 * handed back flagged for renewal and the failure is rethrown.
 */
@Override
public DNodeSystemStatus dnodeStatus(String dnode) throws Exception {
  boolean mustRenew = false;
  DNodeService.Client pooledClient = null;
  try {
    pooledClient = getContext().getDNodeClientFromPool(dnode);
    final DNodeSystemStatus systemStatus = JSONSerDe.deSer(pooledClient.status(), DNodeSystemStatus.class);
    return systemStatus;
  } catch (TTransportException transportFailure) {
    mustRenew = true;
    throw transportFailure;
  } finally {
    // Nothing to give back if borrowing the client itself failed.
    if (pooledClient != null) {
      context.returnDNodeClientToPool(dnode, pooledClient, mustRenew);
    }
  }
}
/**
 * Returns an overview of what happened with all deployments so far. There is a short status associated with each
 * deploy, and there is a set of detailed log messages as well.
 *
 * Deployments are grouped into failed, finished and ongoing; each group is sorted by date in descending order and
 * limited to the 10 first entries.
 */
// NOTE(review): type arguments (Long deploy ids, String log lines) are inferred from the usages below - confirm
// against CoordinationStructures and DeploymentStatus.
@Override
public DeploymentsStatus deploymentsStatus() throws Exception {
  DeploymentsStatus status = new DeploymentsStatus();
  status.setFailedDeployments(new ArrayList<DeploymentStatus>());
  status.setFinishedDeployments(new ArrayList<DeploymentStatus>());
  status.setOngoingDeployments(new ArrayList<DeploymentStatus>());
  for (Map.Entry<Long, DeployStatus> deployment : coord.getDeploymentsStatusPanel().entrySet()) {
    long deployVersion = deployment.getKey();
    DeploymentStatus dStatus = new DeploymentStatus();
    dStatus.setDeploymentId(deployVersion);
    DeployInfo dInfo = coord.getDeployInfoPanel().get(deployVersion);
    // Default date, used when there is no DeployInfo for this deployment.
    dStatus.setDate("");
    if (dInfo != null) {
      dStatus.setDate(dInfo.getStartedAt());
      dStatus.setDataURIs(dInfo.getDataURIs());
      dStatus.setTablespacesDeployed(dInfo.getTablespacesDeployed());
    } else {
      log.warn("Null DeployInfo for deploy: " + deployVersion
          + " - it should be persisted in any case.");
    }
    // Merge the deploy log and the deploy speed messages into one sorted list.
    List<String> logsPerDeploy = new ArrayList<String>();
    logsPerDeploy.addAll(coord.getDeployLogPanel(deployVersion));
    logsPerDeploy.addAll(coord.getDeploySpeedPanel(deployVersion).values());
    Collections.sort(logsPerDeploy);
    dStatus.setLogMessages(logsPerDeploy);
    if (deployment.getValue().equals(DeployStatus.FAILED)) {
      status.getFailedDeployments().add(dStatus);
    } else if (deployment.getValue().equals(DeployStatus.FINISHED)) {
      status.getFinishedDeployments().add(dStatus);
    } else if (deployment.getValue().equals(DeployStatus.ONGOING)) {
      status.getOngoingDeployments().add(dStatus);
    }
  }
  Comparator<DeploymentStatus> byDeploymentDateDesc = new Comparator<DeploymentStatus>() {
    @Override
    public int compare(DeploymentStatus dStatus1, DeploymentStatus dStatus2) {
      return dStatus2.getDate().compareTo(dStatus1.getDate());
    }
  };
  final int maxDeploymentsToShowPerCategory = 10;
  Collections.sort(status.getFailedDeployments(), byDeploymentDateDesc);
  Collections.sort(status.getFinishedDeployments(), byDeploymentDateDesc);
  Collections.sort(status.getOngoingDeployments(), byDeploymentDateDesc);
  status.setFailedDeployments(status.getFailedDeployments().subList(0,
      Math.min(status.getFailedDeployments().size(), maxDeploymentsToShowPerCategory)));
  status.setFinishedDeployments(status.getFinishedDeployments().subList(0,
      Math.min(status.getFinishedDeployments().size(), maxDeploymentsToShowPerCategory)));
  status.setOngoingDeployments(status.getOngoingDeployments().subList(0,
      Math.min(status.getOngoingDeployments().size(), maxDeploymentsToShowPerCategory)));
  return status;
}
/**
 * Returns the {@link Tablespace} for the version of the given tablespace that is currently being served, or null
 * when no current version is registered for it.
 */
@Override
public Tablespace tablespace(String tablespace) throws Exception {
  final Long servedVersion = context.getCurrentVersionsMap().get(tablespace);
  return servedVersion == null
      ? null
      : context.getTablespaceVersionsMap().get(new TablespaceVersion(tablespace, servedVersion));
}
/**
 * Returns the list of DNodes as provided by the {@link QNodeHandlerContext}.
 */
@Override
public List getDNodeList() throws Exception {
  List dnodeList = context.getDNodeList();
  return dnodeList;
}
/**
 * Properly dispose this QNodeHandler: closes the context and then stops the warming thread, waiting for it to
 * finish. Safe to call even if {@code init()} was never invoked.
 *
 * @throws Exception if closing the context fails or the wait for the warming thread is interrupted
 */
@Override
public void close() throws Exception {
  try {
    if (context != null) {
      context.close();
    }
  } finally {
    // Stop the warming thread even when closing the context failed, so it is never left behind.
    if (warmingThread != null) {
      warmingThread.interrupt();
      warmingThread.join();
    }
  }
}
/**
 * Exposes the {@link QNodeHandlerContext} held by this handler. Used for testing.
 */
public QNodeHandlerContext getContext() {
  return this.context;
}
/**
 * Exposes the {@link Deployer} module held by this handler. Used for testing.
 */
public Deployer getDeployer() {
  return this.deployer;
}
/**
 * Allows the user to manually tell the QNode to look for old tablespace versions to remove.
 * This happens automatically on every deploy. But if some disks are about to be filled,
 * this can be invoked manually after changing {@link QNodeProperties#VERSIONS_PER_TABLESPACE}
 * configuration property and restarting the services.
 *
 * @return a {@link StatusMessage} listing the removed versions as "tablespace:version" pairs separated by ", ",
 *         or a hint about the configuration property when there was nothing to remove
 */
@Override
public StatusMessage cleanOldVersions() throws Exception {
  // Check old versions to remove on demand
  List<com.splout.db.thrift.TablespaceVersion> removed = context.synchronizeTablespaceVersions();
  StringBuilder rmvdTxt = new StringBuilder();
  for (com.splout.db.thrift.TablespaceVersion tV : removed) {
    // Separator goes between entries only, so the message never ends with a dangling comma.
    if (rmvdTxt.length() > 0) {
      rmvdTxt.append(", ");
    }
    rmvdTxt.append(tV.getTablespace()).append(":").append(tV.getVersion());
  }
  if (rmvdTxt.length() > 0) {
    return new StatusMessage("Removing tablespace versions: " + rmvdTxt);
  } else {
    return new StatusMessage("No old tablespace versions to remove. Change "
        + QNodeProperties.VERSIONS_PER_TABLESPACE
        + " configuration property and restart the QNodes if you intend to free some space.");
  }
}
}