// com.splout.db.qnode.QNodeHandler — splout-server source artifact (Maven / Gradle / Ivy)
package com.splout.db.qnode;
/*
* #%L
* Splout SQL Server
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.thrift.transport.TTransportException;
import org.codehaus.jackson.type.TypeReference;
import com.google.common.base.Joiner;
import com.hazelcast.core.EntryEvent;
import com.hazelcast.core.EntryListener;
import com.hazelcast.core.Hazelcast;
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.core.IMap;
import com.splout.db.common.JSONSerDe;
import com.splout.db.common.JSONSerDe.JSONSerDeException;
import com.splout.db.common.SploutConfiguration;
import com.splout.db.common.Tablespace;
import com.splout.db.dnode.beans.DNodeSystemStatus;
import com.splout.db.hazelcast.CoordinationStructures;
import com.splout.db.hazelcast.DNodeInfo;
import com.splout.db.hazelcast.DistributedRegistry;
import com.splout.db.hazelcast.HazelcastConfigBuilder;
import com.splout.db.hazelcast.HazelcastProperties;
import com.splout.db.hazelcast.TablespaceVersion;
import com.splout.db.hazelcast.TablespaceVersionStore;
import com.splout.db.qnode.Deployer.UnexistingVersion;
import com.splout.db.qnode.QNodeHandlerContext.DNodeEvent;
import com.splout.db.qnode.QNodeHandlerContext.TablespaceVersionInfoException;
import com.splout.db.qnode.Querier.QuerierException;
import com.splout.db.qnode.beans.DeployInfo;
import com.splout.db.qnode.beans.DeployRequest;
import com.splout.db.qnode.beans.ErrorQueryStatus;
import com.splout.db.qnode.beans.QNodeStatus;
import com.splout.db.qnode.beans.QueryStatus;
import com.splout.db.qnode.beans.StatusMessage;
import com.splout.db.qnode.beans.SwitchVersionRequest;
import com.splout.db.thrift.DNodeService;
import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.Counter;
import com.yammer.metrics.core.Histogram;
import com.yammer.metrics.core.Meter;
/**
* Implements the business logic for the {@link QNode}.
*
* The QNode is the most complex and delicate part of Splout. Among its responsibilities are:
*
* - Handling deploys asynchronously: One QNode will lead a deployment. It will put a flag in ZooKeeper and trigger an
* asynchronous deploy to all involved DNodes. Then, it has to finalize the deploy properly when all DNodes are ready.
* This is handled by the {@link Deployer} module.
* - Performing queries and multiqueries. This is handled by the {@link Querier} module.
* - Handling rollbacks: Rollbacks are easy since we just need to change the version in ZooKeeper (DNodes already have
* the data for past version in disk). The number of versions that are saved in the system per tablespace can be
* configured (see {@link QNodeProperties}).
*
* For convenience, there is some in-memory state grabbed from ZooKeeper in {@link QNodeHandlerContext}. This state is
* passed through all modules ({@link Deployer} and such). Care has to be taken to have consistent in-memory state, for
* that it is important to handle ZooKeeper events properly and be notified always on the paths that we are interested
* in.
*
* One of the important business logic parts of this class is to synchronize the versions in ZooKeeper. Because we only
* want to keep a certain amount of versions, the QNodes have to check for this and remove stalled versions if needed.
* Then, DNodes will receive a notification and they will be able to delete the old data from disk.
*
* The QNode returns JSON strings for all of its methods. The beans that are serialized are indicated in the
* documentation.
*/
public class QNodeHandler implements IQNodeHandler {
/**
* The JSON type reference for deserializing Multi-query results
*/
public final static TypeReference> MULTIQUERY_TYPE_REF = new TypeReference>() {
};
private final static Log log = LogFactory.getLog(QNodeHandler.class);
private QNodeHandlerContext context;
private Deployer deployer;
private Querier querier;
private SploutConfiguration config;
private CoordinationStructures coord;
private Thread warmingThread;
private final Counter meterQueriesServed = Metrics.newCounter(QNodeHandler.class, "queries-served");
private final Meter meterRequestsPerSecond = Metrics.newMeter(QNodeHandler.class, "queries-second",
"queries-second", TimeUnit.SECONDS);
private final Histogram meterResultSize = Metrics.newHistogram(QNodeHandler.class, "response-size");
/**
 * Keep track of die/alive DNodes events. Hazelcast fires these callbacks when entries in the
 * distributed DNodes map change; we mirror each change into the in-memory
 * {@link QNodeHandlerContext} (Thrift client pools and TablespaceVersions).
 */
public class DNodesListener implements EntryListener<String, DNodeInfo> {

  @Override
  public void entryAdded(EntryEvent<String, DNodeInfo> event) {
    log.info("DNode [" + event.getValue() + "] joins the cluster as ready to serve requests.");
    // Update TablespaceVersions
    try {
      String dnode = event.getValue().getAddress();
      log.info(Thread.currentThread().getName() + " : populating client queue for [" + dnode
          + "] as it connected.");
      // Pre-populate the Thrift client pool for this DNode so queries don't pay that cost later.
      context.initializeThriftClientCacheFor(dnode);
      context.updateTablespaceVersions(event.getValue(), QNodeHandlerContext.DNodeEvent.ENTRY);
      log.info(Thread.currentThread() + ": Maybe balance (entryAdded)");
      context.maybeBalance();
    } catch(TablespaceVersionInfoException e) {
      throw new RuntimeException(e);
    } catch(TTransportException e) {
      throw new RuntimeException(e);
    } catch(InterruptedException e) {
      // Restore the interrupt status before propagating so upstream code can still observe it.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }
  }

  @Override
  public void entryRemoved(EntryEvent<String, DNodeInfo> event) {
    log.info("DNode [" + event.getValue() + "] left.");
    // Update TablespaceVersions
    try {
      // Drop pooled Thrift clients for the gone DNode and forget its tablespace info.
      context.discardThriftClientCacheFor(event.getValue().getAddress());
      context.updateTablespaceVersions(event.getValue(), QNodeHandlerContext.DNodeEvent.LEAVE);
      log.info(Thread.currentThread() + ": Maybe balance (entryRemoved)");
      context.maybeBalance();
    } catch(TablespaceVersionInfoException e) {
      throw new RuntimeException(e);
    } catch(InterruptedException e) {
      // Restore the interrupt status before propagating so upstream code can still observe it.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }
  }

  @Override
  public void entryUpdated(EntryEvent<String, DNodeInfo> event) {
    // Update TablespaceVersions
    try {
      context.updateTablespaceVersions(event.getValue(), QNodeHandlerContext.DNodeEvent.UPDATE);
    } catch(TablespaceVersionInfoException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public void entryEvicted(EntryEvent<String, DNodeInfo> event) {
    // Never happens
    log.error("Event entryEvicted received for [" + event + "]. "
        + "Should have never happened... Something wrong in the code");
  }
}
public class VersionListener implements EntryListener> {
private void check(EntryEvent> event) {
if(!CoordinationStructures.KEY_FOR_VERSIONS_BEING_SERVED.equals(event.getKey())) {
throw new RuntimeException("Unexpected key " + event.getKey() + " for map "
+ CoordinationStructures.KEY_FOR_VERSIONS_BEING_SERVED);
}
}
private void processAddOrUpdate(EntryEvent> event) {
check(event);
try {
// We perform all changes together with the aim of atomicity
updateLocalTablespace(event.getValue());
} catch(IOException e) {
log.error(
"Error changing serving tablespace [" + event.getKey() + " to version [" + event.getValue()
+ "]. Probably the system is now unstable.", e);
}
}
@Override
public void entryAdded(EntryEvent> event) {
// log.info("New versions table event received.");
processAddOrUpdate(event);
}
@Override
public void entryUpdated(EntryEvent> event) {
// log.info("Updated versions table event received.");
processAddOrUpdate(event);
}
@Override
public synchronized void entryRemoved(EntryEvent> event) {
check(event);
// TODO: make this operation atomical. ConcurrentHashMap.clear() is not.
log.info("Versions table removed!. Clearing up all tablespace versions.");
context.getCurrentVersionsMap().clear();
return;
}
@Override
public void entryEvicted(EntryEvent> event) {
throw new RuntimeException("Should never happen. Something is really wrong :O");
}
}
/**
 * Initializes this QNode: connects to the Hazelcast cluster, registers the DNode and version
 * listeners, instantiates the {@link Deployer} and {@link Querier} modules and starts the
 * warming-up countdown thread.
 *
 * @param config the Splout configuration used for the whole QNode lifetime
 */
public void init(final SploutConfiguration config) throws Exception {
this.config = config;
log.info(this + " - Initializing QNode...");
// Connect with the cluster.
HazelcastInstance hz = Hazelcast.newHazelcastInstance(HazelcastConfigBuilder.build(config));
int minutesToCheckRegister = config.getInt(HazelcastProperties.MAX_TIME_TO_CHECK_REGISTRATION, 5);
int oldestMembersLeading = config.getInt(HazelcastProperties.OLDEST_MEMBERS_LEADING_COUNT, 3);
// we must instantiate the DistributedRegistry even if we're not a DNode to be able to receive membership leaving
// in race conditions such as all DNodes leaving.
new DistributedRegistry(CoordinationStructures.DNODES, null, hz, minutesToCheckRegister,
oldestMembersLeading);
coord = new CoordinationStructures(hz);
context = new QNodeHandlerContext(config, coord);
// Initialize DNodes tracking.
// CAUTION: the helpers register listeners BEFORE reading current cluster state (race avoidance).
initDNodesTracking();
// Initialize versions to be served tracking
initVersionTracking();
// Now instantiate modules
deployer = new Deployer(context);
querier = new Querier(context);
// Get updated tablespace + version information
context.synchronizeTablespaceVersions();
log.info(Thread.currentThread() + " - Initializing QNode [DONE].");
// Warming-up thread: some actions are deferred until the configured warming period elapses.
warmingThread = new Thread() {
@Override
public void run() {
try {
log.info("Currently warming up for [" + config.getInt(QNodeProperties.WARMING_TIME)
+ "] - certain actions will only be taken afterwards.");
// WARMING_TIME is expressed in seconds; Thread.sleep() takes milliseconds.
Thread.sleep(config.getInt(QNodeProperties.WARMING_TIME) * 1000);
log.info("Warming time ended [OK] Now the QNode will operate fully normally.");
} catch(InterruptedException e) {
log.error("Warming time interrupted - ");
}
// Whether completed or interrupted, the QNode stops being "warming" from here on.
context.getIsWarming().set(false);
}
};
warmingThread.start();
}
/**
 * Initializes the tracking of DNodes joining and leaving the cluster. Registers a
 * {@link DNodesListener} and then seeds the in-memory state from the currently alive DNodes.
 */
private void initDNodesTracking() {
  IMap<String, DNodeInfo> dnodes = context.getCoordinationStructures().getDNodes();
  // CAUTION: We must register the listener BEFORE reading the list
  // of dnodes. Otherwise we could have a race condition.
  dnodes.addEntryListener(new DNodesListener(), true);
  Set<String> dNodes = new HashSet<String>();
  for(DNodeInfo dnodeInfo : dnodes.values()) {
    dNodes.add(dnodeInfo.getAddress());
    try {
      context.initializeThriftClientCacheFor(dnodeInfo.getAddress());
      context.updateTablespaceVersions(dnodeInfo, DNodeEvent.ENTRY);
    } catch(TablespaceVersionInfoException e) {
      throw new RuntimeException(e);
    } catch(TTransportException e) {
      throw new RuntimeException(e);
    } catch(InterruptedException e) {
      // Restore the interrupt status before propagating so upstream code can still observe it.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }
  }
  log.info("Alive DNodes at QNode startup [" + Joiner.on(", ").skipNulls().join(dNodes) + "]");
  log.info("TablespaceVersion map at QNode startup [" + context.getTablespaceVersionsMap() + "]");
}
/**
 * Loads the tablespaces information in memory to being ready to serve them, and starts to keep track of changes in
 * tablespace's version to be served. To be called at initialization.
 * <p>
 * If a persistence folder is configured, versions saved on disk are merged with the (fresher)
 * versions found in Hazelcast and written back via a compare-and-swap loop.
 *
 * @throws IOException if the on-disk version store cannot be read
 */
private void initVersionTracking() throws IOException {
  IMap<String, Map<String, Long>> versions = context.getCoordinationStructures()
      .getVersionsBeingServed();
  // CAUTION: We register the listener before updating the in memory versions
  // because if we do the other way around, we could lose updates to tablespace
  // versions or new tablespaces.
  VersionListener listener = new VersionListener();
  versions.addEntryListener(listener, true);
  String persistenceFolder = config.getString(HazelcastProperties.HZ_PERSISTENCE_FOLDER);
  if(persistenceFolder != null && !persistenceFolder.equals("")) {
    TablespaceVersionStore vStore = new TablespaceVersionStore(persistenceFolder);
    Map<String, Long> vBeingServedFromDisk = vStore
        .load(CoordinationStructures.KEY_FOR_VERSIONS_BEING_SERVED);
    if(vBeingServedFromDisk != null) {
      Map<String, Long> vBeingServed = null;
      // Compare-and-swap loop: retry the merge until it is installed atomically in Hazelcast.
      do {
        vBeingServed = context.getCoordinationStructures().getCopyVersionsBeingServed();
        if(vBeingServed != null) {
          // We assume info in memory (Hazelcast) is fresher than info in disk
          for(Map.Entry<String, Long> entry : vBeingServed.entrySet()) {
            vBeingServedFromDisk.put(entry.getKey(), entry.getValue());
          }
        }
      } while(!context.getCoordinationStructures().updateVersionsBeingServed(vBeingServed,
          vBeingServedFromDisk));
      log.info("Loading tablespace versions to be served: " + vBeingServedFromDisk);
      updateLocalTablespace(vBeingServedFromDisk);
    }
  }
}
/**
 * Updates the local current-versions map with the given tablespace -> version mapping and, if a
 * persistence folder is configured, saves it to disk so it survives restarts.
 *
 * @param tablespacesAndVersions tablespace name -> version to serve; a null map is a no-op
 * @throws IOException if persisting the mapping to disk fails
 */
private void updateLocalTablespace(Map<String, Long> tablespacesAndVersions) throws IOException {
  log.info("Update local tablespace: " + tablespacesAndVersions);
  if(tablespacesAndVersions == null) {
    return;
  }
  // CAREFUL TODO: That is not atomic. Something should
  // be done to make that update atomic.
  context.getCurrentVersionsMap().putAll(tablespacesAndVersions);
  String persistenceFolder = config.getString(HazelcastProperties.HZ_PERSISTENCE_FOLDER);
  if(persistenceFolder != null && !persistenceFolder.equals("")) {
    TablespaceVersionStore vStore = new TablespaceVersionStore(persistenceFolder);
    vStore.store(CoordinationStructures.KEY_FOR_VERSIONS_BEING_SERVED, tablespacesAndVersions);
  }
}
/**
 * Given a key, a tablespace and a SQL, query it to the appropriated DNode and return the result.
 *
 * Returns a {@link QueryStatus} (an {@link ErrorQueryStatus} for invalid input).
 *
 * @param tablespace the tablespace to query
 * @param key routing key whose partition will be queried; mutually exclusive with partition
 * @param sql the SQL statement to execute
 * @param partition explicit partition to query; mutually exclusive with key
 * @throws QuerierException
 */
public QueryStatus query(String tablespace, String key, String sql, String partition)
    throws JSONSerDeException, QuerierException {
  // Validate input: a non-empty SQL and exactly one of (key, partition) are required.
  if(sql == null) {
    return new ErrorQueryStatus("Null sql provided, can't query.");
  }
  if(sql.isEmpty()) {
    return new ErrorQueryStatus("Empty sql provided, can't query.");
  }
  if(key == null && partition == null) {
    return new ErrorQueryStatus(
        "Null key / partition provided, can't query. Either partition or key must not be null.");
  }
  if(key != null && partition != null) {
    return new ErrorQueryStatus(
        "(partition, key) parameters are mutually exclusive. Please use one or other, not both at the same time.");
  }
  meterQueriesServed.inc();
  meterRequestsPerSecond.mark();
  /*
   * The queries are handled by the specialized module {@link Querier}
   */
  QueryStatus result = querier.query(tablespace, key, sql, partition);
  if(result.getResult() != null) {
    meterResultSize.update(result.getResult().size());
  }
  return result;
}
/**
 * Multi-query: use {@link Querier} for as many shards as needed and return a list of {@link QueryStatus}
 *
 * Returns a list of {@link QueryStatus}, one per impacted partition. If keyMins/keyMaxs are empty,
 * all partitions of the tablespace are queried.
 *
 * @param tablespaceName the tablespace to query
 * @param keyMins lower bounds of the key ranges to hit (parallel to keyMaxs)
 * @param keyMaxs upper bounds of the key ranges to hit (parallel to keyMins)
 * @param sql the SQL statement to execute on every impacted partition
 */
public ArrayList<QueryStatus> multiQuery(String tablespaceName, List<String> keyMins,
    List<String> keyMaxs, String sql) throws JSONSerDeException {
  if(sql == null) {
    return new ArrayList<QueryStatus>(Arrays.asList(new QueryStatus[] { new ErrorQueryStatus(
        "Null sql provided, can't query.") }));
  }
  if(sql.length() < 1) {
    return new ArrayList<QueryStatus>(Arrays.asList(new QueryStatus[] { new ErrorQueryStatus(
        "Empty sql provided, can't query.") }));
  }
  if(keyMins.size() != keyMaxs.size()) {
    // This has to be handled before! We are not going to be polite here
    throw new RuntimeException(
        "This is very likely a software bug: Inconsistent parameters received in "
            + QNodeHandler.class + " for multiQuery() : " + tablespaceName + ", " + keyMins + ","
            + keyMaxs + ", " + sql);
  }
  Set<Integer> impactedKeys = new HashSet<Integer>();
  Long version = context.getCurrentVersionsMap().get(tablespaceName);
  if(version == null) {
    return new ArrayList<QueryStatus>(Arrays.asList(new QueryStatus[] { new ErrorQueryStatus(
        "No available version for tablespace " + tablespaceName) }));
  }
  // TODO Object creation (new TablespaceVersion), not very efficient for performance
  Tablespace tablespace = context.getTablespaceVersionsMap().get(
      new TablespaceVersion(tablespaceName, version));
  if(tablespace == null) { // This can happen if, at startup, we only received the version and not the DNodeInfo
    return new ArrayList<QueryStatus>(Arrays.asList(new QueryStatus[] { new ErrorQueryStatus(
        "No available information for tablespace version " + tablespaceName + "," + version) }));
  }
  if(keyMins.size() == 0) {
    impactedKeys.addAll(tablespace.getPartitionMap().findPartitions(null, null)); // all partitions are hit
  }
  for(int i = 0; i < keyMins.size(); i++) {
    impactedKeys.addAll(tablespace.getPartitionMap().findPartitions(keyMins.get(i), keyMaxs.get(i)));
  }
  ArrayList<QueryStatus> toReturn = new ArrayList<QueryStatus>();
  for(Integer shardKey : impactedKeys) {
    toReturn.add(querier.query(tablespaceName, sql, shardKey));
  }
  meterQueriesServed.inc();
  meterRequestsPerSecond.mark();
  return toReturn;
}
/**
 * Given a list of {@link DeployRequest}, perform an asynchronous deploy. This is currently the most important part of
 * Splout and the most complex one. Here we are involving several DNodes asynchronously and later we will check that
 * everything finished.
 *
 * Returns a {@link DeployInfo}.
 *
 * @param deployRequest one request per tablespace to deploy
 */
public DeployInfo deploy(List<DeployRequest> deployRequest) throws Exception {
  /*
   * The deployment is handled by the specialized module {@link Deployer}
   */
  return deployer.deploy(deployRequest);
}
/**
 * Rollback: Set the version of some tablespaces to a particular one.
 *
 * Returns a {@link StatusMessage}.
 *
 * @param rollbackRequest one version-switch request per tablespace to roll back
 */
public StatusMessage rollback(List<SwitchVersionRequest> rollbackRequest) throws JSONSerDeException {
  try {
    // TODO: Coordinate with context.synchronizeTablespaceVersions() because one could being deleting some tablespace
    // when other is trying a rollback.
    deployer.switchVersions(rollbackRequest);
    // TODO: Change this status message to something more programmatic
    return new StatusMessage("Done");
  } catch(UnexistingVersion e) {
    return new StatusMessage(e.getMessage() + ". Not possible to rollback to unexisting version.");
  }
}
/**
 * Returns the {@link QNodeStatus} filled correctly: cluster size, per-DNode system status
 * (fetched over Thrift) and the tablespaces currently being served.
 */
public QNodeStatus overview() throws Exception {
  QNodeStatus status = new QNodeStatus();
  status.setClusterSize(coord.getHz().getCluster().getMembers().size());
  Map<String, DNodeSystemStatus> aliveDNodes = new HashMap<String, DNodeSystemStatus>();
  for(DNodeInfo dnode : context.getCoordinationStructures().getDNodes().values()) {
    DNodeService.Client client = null;
    boolean renew = false;
    try {
      client = getContext().getDNodeClientFromPool(dnode.getAddress());
      aliveDNodes.put(dnode.getAddress(), JSONSerDe.deSer(client.status(), DNodeSystemStatus.class));
    } catch(TTransportException e) {
      // Transport failure: ask the pool to renew this client instead of reusing it.
      renew = true;
      throw e;
    } finally {
      if(client != null) {
        context.returnDNodeClientToPool(dnode.getAddress(), client, renew);
      }
    }
  }
  status.setdNodes(aliveDNodes);
  Map<String, Tablespace> tablespaceMap = new HashMap<String, Tablespace>();
  for(Map.Entry<String, Long> currentVersion : context.getCurrentVersionsMap().entrySet()) {
    Tablespace tablespace = context.getTablespaceVersionsMap().get(
        new TablespaceVersion(currentVersion.getKey(), currentVersion.getValue()));
    if(tablespace != null) { // this might happen and it is not a bug
      tablespaceMap.put(currentVersion.getKey(), tablespace);
    }
  }
  status.setTablespaceMap(tablespaceMap);
  return status;
}
/**
 * Returns the list of tablespaces currently being served (a live key-set view over the
 * current-versions map).
 */
public Set<String> tablespaces() throws Exception {
  return context.getCurrentVersionsMap().keySet();
}
/**
 * Return all available versions for each tablespace
 *
 * @param tablespace the tablespace whose versions are requested
 * @return version number -> {@link Tablespace} metadata for every known version
 */
@Override
public Map<Long, Tablespace> allTablespaceVersions(final String tablespace) throws Exception {
  HashMap<Long, Tablespace> ret = new HashMap<Long, Tablespace>();
  Set<Entry<TablespaceVersion, Tablespace>> versions = context.getTablespaceVersionsMap().entrySet();
  for(Entry<TablespaceVersion, Tablespace> entry : versions) {
    if(entry.getKey().getTablespace().equals(tablespace)) {
      ret.put(entry.getKey().getVersion(), entry.getValue());
    }
  }
  return ret;
}
/**
 * Return a properly filled {@link DNodeSystemStatus} for the given DNode, fetched over a pooled
 * Thrift client. The client is always returned to the pool; on transport failure it is flagged
 * for renewal.
 */
@Override
public DNodeSystemStatus dnodeStatus(String dnode) throws Exception {
  boolean mustRenew = false;
  DNodeService.Client thriftClient = null;
  try {
    thriftClient = getContext().getDNodeClientFromPool(dnode);
    return JSONSerDe.deSer(thriftClient.status(), DNodeSystemStatus.class);
  } catch(TTransportException e) {
    // A broken transport means this pooled client should not be reused as-is.
    mustRenew = true;
    throw e;
  } finally {
    if(thriftClient != null) {
      context.returnDNodeClientToPool(dnode, thriftClient, mustRenew);
    }
  }
}
/**
 * Looks up the {@link Tablespace} metadata for the version of the given tablespace that is
 * currently being served, or null if no version is being served.
 */
@Override
public Tablespace tablespace(String tablespace) throws Exception {
  Long servedVersion = context.getCurrentVersionsMap().get(tablespace);
  return (servedVersion == null) ? null : context.getTablespaceVersionsMap().get(
      new TablespaceVersion(tablespace, servedVersion));
}
/**
 * Get the list of DNodes
 *
 * @return the DNode addresses known to the context
 */
@Override
public List<String> getDNodeList() throws Exception {
  return context.getDNodeList();
}
/**
 * Properly dispose this QNodeHandler: releases the context's resources and stops the warming
 * thread, waiting for it to finish.
 */
@Override
public void close() throws Exception {
  context.close();
  if(warmingThread == null) {
    return;
  }
  warmingThread.interrupt();
  warmingThread.join();
}
/**
 * Used for testing. Exposes the shared in-memory {@link QNodeHandlerContext} state.
 */
public QNodeHandlerContext getContext() {
return context;
}
/**
 * Used for testing. Exposes the {@link Deployer} module that handles asynchronous deployments.
 */
public Deployer getDeployer() {
return deployer;
}
}