org.voltdb.client.Distributer
VoltDB client interface libraries
/* This file is part of VoltDB.
* Copyright (C) 2008-2017 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb.client;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.nio.channels.SocketChannel;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.LockSupport;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLEngine;
import javax.security.auth.Subject;
import org.cliffc_voltpatches.high_scale_lib.NonBlockingHashMap;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.voltcore.network.CipherExecutor;
import org.voltcore.network.Connection;
import org.voltcore.network.QueueMonitor;
import org.voltcore.network.VoltNetworkPool;
import org.voltcore.network.VoltNetworkPool.IOStatsIntf;
import org.voltcore.network.VoltProtocolHandler;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.Pair;
import org.voltcore.utils.ssl.SSLConfiguration;
import org.voltdb.ClientResponseImpl;
import org.voltdb.VoltTable;
import org.voltdb.client.ClientStatusListenerExt.DisconnectCause;
import org.voltdb.client.HashinatorLite.HashinatorLiteType;
import org.voltdb.common.Constants;
import com.google_voltpatches.common.base.Throwables;
import com.google_voltpatches.common.collect.ImmutableList;
import com.google_voltpatches.common.collect.ImmutableSet;
import com.google_voltpatches.common.collect.Maps;
import com.google_voltpatches.common.collect.Sets;
import jsr166y.ThreadLocalRandom;
/**
* De/multiplexes transactions across a cluster
*
 * It is safe to synchronize on an individual connection and then the distributer, but it is always unsafe
 * to synchronize on the distributer and then an individual connection.
*/
class Distributer {
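// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source) of the lock-ordering
// rule documented in the class javadoc above; "cxn" is a hypothetical
// NodeConnection reference.
//
//   synchronized (cxn) {                    // OK: connection first...
//       synchronized (Distributer.this) {   // ...then the distributer
//       }
//   }
//
//   synchronized (Distributer.this) {       // UNSAFE: distributer first,
//       synchronized (cxn) { }              // then a connection -> can
//   }                                       // deadlock against the order above
// ---------------------------------------------------------------------------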
static int RESUBSCRIPTION_DELAY_MS = Integer.getInteger("RESUBSCRIPTION_DELAY_MS", 10000);
static final long PING_HANDLE = Long.MAX_VALUE;
public static final Long ASYNC_TOPO_HANDLE = PING_HANDLE - 1;
static final long USE_DEFAULT_CLIENT_TIMEOUT = 0;
static long PARTITION_KEYS_INFO_REFRESH_FREQUENCY = Long.getLong("PARTITION_KEYS_INFO_REFRESH_FREQUENCY", 1000);
// handles used internally are negative and decrement for each call
public final AtomicLong m_sysHandle = new AtomicLong(-1);
// collection of connections to the cluster
private final CopyOnWriteArrayList<NodeConnection> m_connections =
new CopyOnWriteArrayList<>();
private final ArrayList<ClientStatusListenerExt> m_listeners = new ArrayList<>();
//Selector and connection handling, does all work in blocking selection thread
private final VoltNetworkPool m_network;
private final SSLContext m_sslContext;
// Temporary until a distribution/affinity algorithm is written
private int m_nextConnection = 0;
private final boolean m_useMultipleThreads;
private final boolean m_useClientAffinity;
private final boolean m_sendReadsToReplicasBytDefaultIfCAEnabled;
private static final class Procedure {
final static int PARAMETER_NONE = -1;
private final boolean multiPart;
private final boolean readOnly;
private final int partitionParameter;
private final int partitionParameterType;
private Procedure(boolean multiPart,
boolean readOnly,
int partitionParameter,
int partitionParameterType) {
this.multiPart = multiPart;
this.readOnly = readOnly;
this.partitionParameter = multiPart? PARAMETER_NONE : partitionParameter;
this.partitionParameterType = multiPart ? PARAMETER_NONE : partitionParameterType;
}
}
private final Map<Integer, NodeConnection> m_partitionMasters = new HashMap<>();
private final Map<Integer, NodeConnection[]> m_partitionReplicas = new HashMap<>();
private final Map<Integer, NodeConnection> m_hostIdToConnection = new HashMap<>();
private final Map<String, Procedure> m_procedureInfo = new HashMap<>();
private final AtomicReference<ImmutableSet<Integer>> m_partitionKeys = new AtomicReference<>();
private final AtomicLong m_lastPartitionKeyFetched = new AtomicLong(0);
private final AtomicReference<ClientResponse> m_partitionUpdateStatus = new AtomicReference<>();
//This is the instance of the Hashinator we picked from TOPO used only for client affinity.
private HashinatorLite m_hashinator = null;
//This is a global timeout that will be used if a per-procedure timeout is not provided with the procedure call.
private final long m_procedureCallTimeoutNanos;
private static final long MINIMUM_LONG_RUNNING_SYSTEM_CALL_TIMEOUT_MS = 30 * 60 * 1000; // 30 minutes
private final long m_connectionResponseTimeoutNanos;
private final Map<Integer, ClientAffinityStats> m_clientAffinityStats =
new HashMap<>();
public final RateLimiter m_rateLimiter = new RateLimiter();
private final AtomicReference<ImmutableSet<Integer>> m_unconnectedHosts = new AtomicReference<>();
private AtomicBoolean m_createConnectionUponTopoChangeInProgress = new AtomicBoolean(false);
private boolean m_topologyChangeAware;
//private final Timer m_timer;
private final ScheduledExecutorService m_ex =
Executors.newSingleThreadScheduledExecutor(
CoreUtils.getThreadFactory("VoltDB Client Reaper Thread"));
ScheduledFuture<?> m_timeoutReaperHandle;
/**
 * Server's instance id. Unique for the cluster
*/
private Object m_clusterInstanceId[];
private String m_buildString;
/*
* The connection we have issued our subscriptions to. If the connection is lost
* we will need to request subscription from a different node
*/
private NodeConnection m_subscribedConnection = null;
//Track if a request is pending so we don't accidentally handle a failed node twice
private boolean m_subscriptionRequestPending = false;
//Until catalog subscription is implemented, only fetch it once
private boolean m_fetchedCatalog = false;
/**
* JAAS Authentication Subject
*/
private final Subject m_subject;
// executor service for ssl encryption/decryption, if ssl is enabled.
private CipherExecutor m_cipherService;
/**
* Handles topology updates for client affinity
*/
class TopoUpdateCallback implements ProcedureCallback {
@Override
public void clientCallback(ClientResponse clientResponse) throws Exception {
if (clientResponse.getStatus() != ClientResponse.SUCCESS) {
return;
}
try {
synchronized (Distributer.this) {
VoltTable results[] = clientResponse.getResults();
if (results != null && results.length > 1) {
updateAffinityTopology(results);
}
}
}
catch (Exception e) {
e.printStackTrace();
}
}
}
/**
* Handles partition updates for client affinity
*/
class PartitionUpdateCallback implements ProcedureCallback {
final CountDownLatch m_latch;
PartitionUpdateCallback(CountDownLatch latch) {
m_latch = latch;
}
@Override
public void clientCallback(ClientResponse clientResponse) throws Exception {
if (clientResponse.getStatus() == ClientResponse.SUCCESS) {
VoltTable results[] = clientResponse.getResults();
if (results != null && results.length > 0) {
updatePartitioning(results[0]);
}
}
m_partitionUpdateStatus.set(clientResponse);
if (m_latch != null) {
m_latch.countDown();
}
}
}
/**
* Handles @Subscribe response
*/
class SubscribeCallback implements ProcedureCallback {
@Override
public void clientCallback(ClientResponse response) throws Exception {
//Pre-4.1 clusters don't know about @Subscribe, don't stress over it.
if (response.getStatusString() != null &&
response.getStatusString().contains("@Subscribe was not found")) {
synchronized (Distributer.this) {
m_subscriptionRequestPending = false;
}
return;
}
//Fast path subscribing retry if the connection was lost before getting a response
if (response.getStatus() == ClientResponse.CONNECTION_LOST && !m_connections.isEmpty()) {
subscribeToNewNode();
return;
} else if (response.getStatus() == ClientResponse.CONNECTION_LOST) {
return;
}
//Slow path, god knows why it didn't succeed, server could be paused and in admin mode. Don't firehose attempts.
if (response.getStatus() != ClientResponse.SUCCESS && !m_ex.isShutdown()) {
//Retry on the off chance that it will work the Nth time, or work at a different node
m_ex.schedule(new Runnable() {
@Override
public void run() {
try {
subscribeToNewNode();
} catch (Throwable t) {
t.printStackTrace();
Throwables.propagate(t);
}
}
}, 2, TimeUnit.MINUTES);
return;
}
//If success, the code in NodeConnection.stopping needs to know it has to handle selecting
//a new node for subscriptions, so set the pending request to false to let that code
//know that the failure won't be handled in the callback
synchronized (Distributer.this) {
m_subscriptionRequestPending = false;
}
}
}
/**
* Handles procedure updates for client affinity
*/
class ProcUpdateCallback implements ProcedureCallback {
@Override
public void clientCallback(ClientResponse clientResponse) throws Exception {
if (clientResponse.getStatus() != ClientResponse.SUCCESS) {
return;
}
try {
synchronized (Distributer.this) {
VoltTable results[] = clientResponse.getResults();
if (results != null && results.length == 1) {
VoltTable vt = results[0];
updateProcedurePartitioning(vt);
}
m_fetchedCatalog = true;
}
}
catch (Exception e) {
e.printStackTrace();
}
}
}
class CallExpiration implements Runnable {
@Override
public void run() {
try {
// make a threadsafe copy of all connections
ArrayList<NodeConnection> connections = new ArrayList<>();
synchronized (Distributer.this) {
connections.addAll(m_connections);
}
final long nowNanos = System.nanoTime();
// for each connection
for (final NodeConnection c : connections) {
// check for connection age
final long sinceLastResponse = Math.max(1, nowNanos - c.m_lastResponseTimeNanos);
// if a ping is outstanding and the timeout has elapsed, close the connection
if (c.m_outstandingPing && (sinceLastResponse > m_connectionResponseTimeoutNanos)) {
// memoize why it's closing
c.m_closeCause = DisconnectCause.TIMEOUT;
// this should trigger NodeConnection.stopping(..)
c.m_connection.unregister();
}
// if 1/3 of the timeout has elapsed since the last response, send a ping
if ((!c.m_outstandingPing) && (sinceLastResponse > (m_connectionResponseTimeoutNanos / 3))) {
c.sendPing();
}
// for each outstanding procedure
for (final Map.Entry<Long, CallbackBookeeping> e : c.m_callbacks.entrySet()) {
final long handle = e.getKey();
final CallbackBookeeping cb = e.getValue();
// if the timeout is expired, call the callback and remove the
// bookkeeping data
final long deltaNanos = Math.max(1, nowNanos - cb.timestampNanos);
if (deltaNanos > cb.procedureTimeoutNanos) {
//For expected long operations don't use the default timeout
//unless it is > MINIMUM_LONG_RUNNING_SYSTEM_CALL_TIMEOUT_MS
final boolean isLongOp = isLongOp(cb.name);
if (isLongOp && (deltaNanos < TimeUnit.MILLISECONDS.toNanos(MINIMUM_LONG_RUNNING_SYSTEM_CALL_TIMEOUT_MS))) {
continue;
}
c.handleTimedoutCallback(handle, nowNanos);
}
}
}
} catch (Throwable t) {
t.printStackTrace();
}
}
}
/*
 * Check if the proc name is a procedure that is expected to run long.
 * Make the minimum timeout for certain long-running system procedures
 * higher than the default 2m.
 * You can still set the default timeout higher than even this value.
 */
private static boolean isLongOp(String procName) {
if (procName.startsWith("@")) {
if (procName.equals("@UpdateApplicationCatalog") || procName.equals("@SnapshotSave")) {
return true;
}
}
return false;
}
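// For example (derived from CallExpiration above, not additional source code):
// with the default 2-minute procedure timeout, a hung @SnapshotSave is not
// reported as timed out until MINIMUM_LONG_RUNNING_SYSTEM_CALL_TIMEOUT_MS
// (30 minutes) has elapsed, while an ordinary procedure call times out after
// the usual 2 minutes.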
class CallbackBookeeping {
public CallbackBookeeping(long timestampNanos, ProcedureCallback callback, String name, long timeoutNanos, boolean ignoreBackpressure) {
assert(callback != null);
this.timestampNanos = timestampNanos;
this.callback = callback;
this.name = name;
this.procedureTimeoutNanos = timeoutNanos;
this.ignoreBackpressure = ignoreBackpressure;
}
long timestampNanos;
//Timeout in nanoseconds; 0 means use the connection's default procedure timeout.
final long procedureTimeoutNanos;
ProcedureCallback callback;
String name;
boolean ignoreBackpressure;
}
class NodeConnection extends VoltProtocolHandler implements org.voltcore.network.QueueMonitor {
private final AtomicInteger m_callbacksToInvoke = new AtomicInteger(0);
private final ConcurrentMap<Long, CallbackBookeeping> m_callbacks = new ConcurrentHashMap<>();
private final NonBlockingHashMap<String, ClientStats> m_stats = new NonBlockingHashMap<>();
private Connection m_connection;
private volatile boolean m_isConnected = true;
volatile long m_lastResponseTimeNanos = System.nanoTime();
boolean m_outstandingPing = false;
ClientStatusListenerExt.DisconnectCause m_closeCause = DisconnectCause.CONNECTION_CLOSED;
public NodeConnection(long ids[]) {}
/*
* NodeConnection uses ignoreBackpressure to get rate limiter to not
* apply any permit tracking or rate limits to transactions that should
* never be rejected such as those submitted from within a callback thread or
* generated internally
*/
public void createWork(final long nowNanos, long handle, String name, ByteBuffer c,
ProcedureCallback callback, boolean ignoreBackpressure, long timeoutNanos) {
assert(callback != null);
//How long from the starting point in time to wait to get this stuff done
timeoutNanos = (timeoutNanos == Distributer.USE_DEFAULT_CLIENT_TIMEOUT) ? m_procedureCallTimeoutNanos : timeoutNanos;
//Trigger the timeout at this point in time no matter what
final long timeoutTime = nowNanos + timeoutNanos;
//What was the time after the rate limiter returned
//Will be the same as nowNanos if it didn't block
long afterRateLimitNanos = 0;
/*
* Do rate limiting or check for max outstanding related backpressure in
* the rate limiter which can block. If it blocks we can still get a timeout
* exception to give prompt timeouts
*/
try {
afterRateLimitNanos = m_rateLimiter.sendTxnWithOptionalBlockAndReturnCurrentTime(
nowNanos, timeoutNanos, ignoreBackpressure);
} catch (TimeoutException e) {
/*
* It's possible we need to timeout because it took too long to get
* the transaction out on the wire due to max outstanding
*/
final long deltaNanos = Math.max(1, System.nanoTime() - nowNanos);
invokeCallbackWithTimeout(name, callback, deltaNanos, afterRateLimitNanos, timeoutNanos, handle, ignoreBackpressure);
return;
}
assert(m_callbacks.containsKey(handle) == false);
//Drain needs to know when all callbacks have been invoked
final int callbacksToInvoke = m_callbacksToInvoke.incrementAndGet();
assert(callbacksToInvoke >= 0);
//Optimistically submit the task
m_callbacks.put(handle, new CallbackBookeeping(nowNanos, callback, name, timeoutNanos, ignoreBackpressure));
//Schedule the timeout to fire relative to the amount of time
//spent getting to this point. Might fire immediately
//some of the time, but that is fine
final long timeoutRemaining = timeoutTime - afterRateLimitNanos;
//Schedule an individual timeout if necessary
//If it is a long op, don't bother scheduling a discrete timeout
if (timeoutNanos < TimeUnit.SECONDS.toNanos(1) && !isLongOp(name)) {
submitDiscreteTimeoutTask(handle, Math.max(0, timeoutRemaining));
}
//Check for disconnect
if (!m_isConnected) {
//Check if the disconnect or expiration already handled the callback
if (m_callbacks.remove(handle) == null) {
return;
}
final ClientResponse r = new ClientResponseImpl(
ClientResponse.CONNECTION_LOST, new VoltTable[0],
"Connection to database host (" + m_connection.getHostnameAndIPAndPort() +
") was lost before a response was received");
try {
callback.clientCallback(r);
} catch (Exception e) {
uncaughtException(callback, r, e);
}
//Drain needs to know when all callbacks have been invoked
final int remainingToInvoke = m_callbacksToInvoke.decrementAndGet();
assert(remainingToInvoke >= 0);
//for bookkeeping, but it feels dishonest to call this here
m_rateLimiter.transactionResponseReceived(nowNanos, -1, ignoreBackpressure);
return;
} else {
m_connection.writeStream().enqueue(c);
}
}
/*
* For high precision timeouts, submit a discrete task to a scheduled
* executor service to time out the transaction. The timeout task
* when run checks if the task is still present in the concurrent map
* of tasks and removes it. If it wins the race to remove the map
* then the transaction will be timed out even if a response is received
* at the same time.
*
* This will race with the periodic task that checks lower resolution timeouts
* and it is fine, the concurrent map makes sure each callback is handled exactly once
*/
void submitDiscreteTimeoutTask(final long handle, long timeoutNanos) {
m_ex.schedule(new Runnable() {
@Override
public void run() {
handleTimedoutCallback(handle, System.nanoTime());
}
}, timeoutNanos, TimeUnit.NANOSECONDS);
}
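// Sketch of the race described above (illustrative only): the response path
// and any timeout task all funnel through m_callbacks.remove(handle), so
// exactly one caller observes a non-null value and owns the callback:
//
//   CallbackBookeeping cb = m_callbacks.remove(handle);
//   if (cb == null) return;   // another thread already handled it
//   ...invoke cb.callback exactly once...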
/*
* Factor out the boilerplate involved in checking whether a timed out callback
* still exists and needs to be invoked, or has already been handled by another thread
*/
void handleTimedoutCallback(long handle, long nowNanos) {
//Callback doesn't have to be there, it may have already
//received a response or been expired by the periodic expiration task, or a discrete expiration task
final CallbackBookeeping cb = m_callbacks.remove(handle);
//It was handled during the race
if (cb == null) {
return;
}
final long deltaNanos = Math.max(1, nowNanos - cb.timestampNanos);
invokeCallbackWithTimeout(cb.name, cb.callback, deltaNanos, nowNanos, cb.procedureTimeoutNanos, handle, cb.ignoreBackpressure);
}
/*
* Factor out the boilerplate involved in invoking a callback with a timeout response
*/
void invokeCallbackWithTimeout(String procName,
ProcedureCallback callback,
long deltaNanos,
long nowNanos,
long timeoutNanos,
long handle,
boolean ignoreBackpressure) {
ClientResponseImpl r = new ClientResponseImpl(
ClientResponse.CONNECTION_TIMEOUT,
ClientResponse.UNINITIALIZED_APP_STATUS_CODE,
"",
new VoltTable[0],
String.format("No response received in the allotted time (set to %d ms).",
TimeUnit.NANOSECONDS.toMillis(timeoutNanos)));
r.setClientHandle(handle);
r.setClientRoundtrip(deltaNanos);
r.setClusterRoundtrip((int)TimeUnit.NANOSECONDS.toMillis(deltaNanos));
try {
callback.clientCallback(r);
} catch (Throwable e1) {
uncaughtException( callback, r, e1);
}
//Drain needs to know when all callbacks have been invoked
final int remainingToInvoke = m_callbacksToInvoke.decrementAndGet();
assert(remainingToInvoke >= 0);
m_rateLimiter.transactionResponseReceived(nowNanos, -1, ignoreBackpressure);
updateStatsForTimeout(procName, r.getClientRoundtripNanos(), r.getClusterRoundtrip());
}
void sendPing() {
ProcedureInvocation invocation = new ProcedureInvocation(PING_HANDLE, "@Ping");
ByteBuffer buf = ByteBuffer.allocate(4 + invocation.getSerializedSize());
buf.putInt(buf.capacity() - 4);
try {
invocation.flattenToBuffer(buf);
buf.flip();
} catch (IOException e) {
throw new RuntimeException(e);
}
m_connection.writeStream().enqueue(buf);
m_outstandingPing = true;
}
private void updateStatsForTimeout(
final String procName,
final long roundTripNanos,
final int clusterRoundTrip) {
m_connection.queueTask(new Runnable() {
@Override
public void run() {
updateStats(procName, roundTripNanos, clusterRoundTrip, false, false, true);
}
});
}
/**
* Update the procedures statistics
* @param procName Name of procedure being updated
* @param clusterRoundTrip round trip measured within the VoltDB cluster
 * @param abort true if the procedure was aborted
* @param failure true if the procedure failed
*/
private void updateStats(
String procName,
long roundTripNanos,
int clusterRoundTrip,
boolean abort,
boolean failure,
boolean timeout) {
ClientStats stats = m_stats.get(procName);
if (stats == null) {
stats = new ClientStats();
stats.m_connectionId = connectionId();
stats.m_hostname = m_connection.getHostnameOrIP();
stats.m_port = m_connection.getRemotePort();
stats.m_procName = procName;
stats.m_startTS = System.currentTimeMillis();
stats.m_endTS = Long.MIN_VALUE;
m_stats.put(procName, stats);
}
stats.update(roundTripNanos, clusterRoundTrip, abort, failure, timeout);
}
@Override
public void handleMessage(ByteBuffer buf, Connection c) {
long nowNanos = System.nanoTime();
ClientResponseImpl response = new ClientResponseImpl();
try {
response.initFromBuffer(buf);
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
// track the timestamp of the most recent read on this connection
m_lastResponseTimeNanos = nowNanos;
final long handle = response.getClientHandle();
// handle ping response and get out
if (handle == PING_HANDLE) {
m_outstandingPing = false;
return;
} else if (handle == ASYNC_TOPO_HANDLE) {
/*
* Really didn't want to add this block because it is not DRY
* for the exception handling, but trying to set + reset the async topo callback
* turned out to be pretty challenging
*/
ProcedureCallback cb = new TopoUpdateCallback();
try {
cb.clientCallback(response);
} catch (Exception e) {
uncaughtException(cb, response, e);
}
return;
}
//Race with expiration thread to be the first to remove the callback
//from the map and process it
final CallbackBookeeping stuff = m_callbacks.remove(response.getClientHandle());
// presumably (hopefully) this is a response for a timed-out message
if (stuff == null) {
// also ignore internal (topology and procedure) calls
if (handle >= 0) {
// notify any listeners of the late response
for (ClientStatusListenerExt listener : m_listeners) {
listener.lateProcedureResponse(
response,
m_connection.getHostnameOrIP(),
m_connection.getRemotePort());
}
}
}
// handle a proper callback
else {
final long callTimeNanos = stuff.timestampNanos;
final long deltaNanos = Math.max(1, nowNanos - callTimeNanos);
final ProcedureCallback cb = stuff.callback;
assert(cb != null);
final byte status = response.getStatus();
boolean abort = false;
boolean error = false;
if (status == ClientResponse.USER_ABORT || status == ClientResponse.GRACEFUL_FAILURE) {
abort = true;
} else if (status != ClientResponse.SUCCESS) {
error = true;
}
int clusterRoundTrip = response.getClusterRoundtrip();
m_rateLimiter.transactionResponseReceived(nowNanos, clusterRoundTrip, stuff.ignoreBackpressure);
updateStats(stuff.name, deltaNanos, clusterRoundTrip, abort, error, false);
response.setClientRoundtrip(deltaNanos);
assert(response.getHash() == null); // make sure it didn't sneak into wire protocol
try {
cb.clientCallback(response);
} catch (Exception e) {
uncaughtException(cb, response, e);
}
//Drain needs to know when all callbacks have been invoked
final int remainingToInvoke = m_callbacksToInvoke.decrementAndGet();
assert(remainingToInvoke >= 0);
}
}
@Override
public int getMaxRead() {
return Integer.MAX_VALUE;
}
public boolean hadBackPressure() {
return m_connection.writeStream().hadBackPressure();
}
@Override
public void stopping(Connection c) {
super.stopping(c);
m_isConnected = false;
//Prevent queueing of new work to this connection
synchronized (Distributer.this) {
/*
* Repair all cluster topology data with the node connection removed
*/
Iterator<Map.Entry<Integer, NodeConnection>> i = m_partitionMasters.entrySet().iterator();
while (i.hasNext()) {
Map.Entry<Integer, NodeConnection> entry = i.next();
if (entry.getValue() == this) {
i.remove();
}
}
i = m_hostIdToConnection.entrySet().iterator();
while (i.hasNext()) {
Map.Entry<Integer, NodeConnection> entry = i.next();
if (entry.getValue() == this) {
i.remove();
}
}
Iterator<Map.Entry<Integer, NodeConnection[]>> i2 = m_partitionReplicas.entrySet().iterator();
List<Pair<Integer, NodeConnection[]>> entriesToRewrite = new ArrayList<>();
while (i2.hasNext()) {
Map.Entry<Integer, NodeConnection[]> entry = i2.next();
for (NodeConnection nc : entry.getValue()) {
if (nc == this) {
entriesToRewrite.add(Pair.of(entry.getKey(), entry.getValue()));
}
}
}
for (Pair<Integer, NodeConnection[]> entry : entriesToRewrite) {
m_partitionReplicas.remove(entry.getFirst());
NodeConnection survivors[] = new NodeConnection[entry.getSecond().length - 1];
if (survivors.length == 0) {
break;
}
int zz = 0;
for (int ii = 0; ii < entry.getSecond().length; ii++) {
if (entry.getSecond()[ii] != this) {
survivors[zz++] = entry.getSecond()[ii];
}
}
m_partitionReplicas.put(entry.getFirst(), survivors);
}
m_connections.remove(this);
//Notify listeners that a connection has been lost
for (ClientStatusListenerExt s : m_listeners) {
s.connectionLost(
m_connection.getHostnameOrIP(),
m_connection.getRemotePort(),
m_connections.size(),
m_closeCause);
}
/*
* Deal with the fact that this may have been the connection that subscriptions were issued
* to. If a subscription request was pending, don't handle selecting a new node here
* let the callback see the failure and retry
*/
if (m_useClientAffinity &&
m_subscribedConnection == this &&
m_subscriptionRequestPending == false &&
!m_ex.isShutdown()) {
//Don't subscribe to a new node immediately
//to somewhat prevent a thundering herd
try {
m_ex.schedule(new Runnable() {
@Override
public void run() {
subscribeToNewNode();
}
}, new Random().nextInt(RESUBSCRIPTION_DELAY_MS),
TimeUnit.MILLISECONDS);
} catch (RejectedExecutionException ree) {
// this is for race if m_ex shuts down in the middle of schedule
return;
}
}
}
//Invoke callbacks for all queued invocations with a failure response
final ClientResponse r =
new ClientResponseImpl(
ClientResponse.CONNECTION_LOST, new VoltTable[0],
"Connection to database host (" + m_connection.getHostnameAndIPAndPort() +
") was lost before a response was received");
for (Map.Entry<Long, CallbackBookeeping> e : m_callbacks.entrySet()) {
//Check for race with other threads
if (m_callbacks.remove(e.getKey()) == null) {
continue;
}
final CallbackBookeeping callBk = e.getValue();
try {
callBk.callback.clientCallback(r);
}
catch (Exception ex) {
uncaughtException(callBk.callback, r, ex);
}
//Drain needs to know when all callbacks have been invoked
final int remainingToInvoke = m_callbacksToInvoke.decrementAndGet();
assert(remainingToInvoke >= 0);
m_rateLimiter.transactionResponseReceived(System.nanoTime(), -1, callBk.ignoreBackpressure);
}
}
@Override
public Runnable offBackPressure() {
return new Runnable() {
@Override
public void run() {
/*
* Synchronization on Distributer.this is critical to ensure that queue
* does not report backpressure AFTER the write stream reports that backpressure
* has ended thus resulting in a lost wakeup.
*/
synchronized (Distributer.this) {
for (final ClientStatusListenerExt csl : m_listeners) {
csl.backpressure(false);
}
}
}
};
}
@Override
public Runnable onBackPressure() {
return null;
}
@Override
public QueueMonitor writestreamMonitor() {
return this;
}
private int m_queuedBytes = 0;
private final int m_maxQueuedBytes = 262144;
@Override
public boolean queue(int bytes) {
m_queuedBytes += bytes;
if (m_queuedBytes > m_maxQueuedBytes) {
return true;
}
return false;
}
public InetSocketAddress getSocketAddress() {
return m_connection.getRemoteSocketAddress();
}
}
void drain() throws InterruptedException {
boolean more;
long sleep = 500;
do {
more = false;
for (NodeConnection cxn : m_connections) {
more = more || cxn.m_callbacksToInvoke.get() > 0;
}
/*
* Back off to spinning at five millis. Try and get drain to be a little
* more prompt. Spinning sucks!
*/
if (more) {
if (Thread.interrupted()) {
throw new InterruptedException();
}
LockSupport.parkNanos(TimeUnit.MICROSECONDS.toNanos(sleep));
if (Thread.interrupted()) {
throw new InterruptedException();
}
if (sleep < 5000) {
sleep += 500;
}
}
} while(more);
}
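// Backoff behavior of drain(), for reference (derived from the loop above):
// the park interval starts at 500 microseconds and grows by 500 microseconds
// per iteration, capping at 5 milliseconds, until no connection has callbacks
// left to invoke.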
Distributer() {
this( false,
ClientConfig.DEFAULT_PROCEDURE_TIMOUT_NANOS,
ClientConfig.DEFAULT_CONNECTION_TIMOUT_MS,
false, false, null, null);
}
Distributer(
boolean useMultipleThreads,
long procedureCallTimeoutNanos,
long connectionResponseTimeoutMS,
boolean useClientAffinity,
boolean sendReadsToReplicasBytDefault,
Subject subject,
SSLContext sslContext) {
m_useMultipleThreads = useMultipleThreads;
m_sslContext = sslContext;
if (m_sslContext != null) {
m_cipherService = CipherExecutor.CLIENT;
m_cipherService.startup();
} else {
m_cipherService = null;
}
m_network = new VoltNetworkPool(
m_useMultipleThreads ? Math.max(1, CoreUtils.availableProcessors() / 4 ) : 1,
1, null, "Client");
m_network.start();
m_procedureCallTimeoutNanos = procedureCallTimeoutNanos;
m_connectionResponseTimeoutNanos = TimeUnit.MILLISECONDS.toNanos(connectionResponseTimeoutMS);
m_useClientAffinity = useClientAffinity;
m_sendReadsToReplicasBytDefaultIfCAEnabled = sendReadsToReplicasBytDefault;
// schedule the task that looks for timed-out proc calls and connections
m_timeoutReaperHandle = m_ex.scheduleAtFixedRate(new CallExpiration(), 1, 1, TimeUnit.SECONDS);
m_subject = subject;
}
void createConnection(String host, String program, String password, int port, ClientAuthScheme scheme)
throws UnknownHostException, IOException
{
byte hashedPassword[] = ConnectionUtil.getHashedPassword(scheme, password);
createConnectionWithHashedCredentials(host, program, hashedPassword, port, scheme);
}
void createConnectionWithHashedCredentials(String host, String program, byte[] hashedPassword, int port, ClientAuthScheme scheme)
throws UnknownHostException, IOException
{
SSLEngine sslEngine = null;
if (m_sslContext != null) {
sslEngine = m_sslContext.createSSLEngine("client", port);
sslEngine.setUseClientMode(true);
Set<String> enabled = ImmutableSet.copyOf(sslEngine.getEnabledCipherSuites());
Set<String> intersection = Sets.intersection(SSLConfiguration.GCM_CIPHERS, enabled);
if (intersection.isEmpty()) {
intersection = Sets.intersection(SSLConfiguration.PREFERRED_CIPHERS, enabled);
}
if (intersection.isEmpty()) {
intersection = enabled;
}
sslEngine.setEnabledCipherSuites(intersection.toArray(new String[0]));
}
final Object socketChannelAndInstanceIdAndBuildString[] =
ConnectionUtil.getAuthenticatedConnection(host, program, hashedPassword, port, m_subject, scheme, sslEngine);
final SocketChannel aChannel = (SocketChannel)socketChannelAndInstanceIdAndBuildString[0];
final long instanceIdWhichIsTimestampAndLeaderIp[] = (long[])socketChannelAndInstanceIdAndBuildString[1];
final int hostId = (int)instanceIdWhichIsTimestampAndLeaderIp[0];
NodeConnection cxn = new NodeConnection(instanceIdWhichIsTimestampAndLeaderIp);
Connection c = null;
try {
if (aChannel != null) {
c = m_network.registerChannel(aChannel, cxn, m_cipherService, sslEngine);
}
}
catch (Exception e) {
// Need to clean up the socket if there was any failure
try {
aChannel.close();
} catch (IOException e1) {
//Don't care connection is already lost anyways
}
Throwables.propagate(e);
}
cxn.m_connection = c;
synchronized (this) {
// If there are no connections, discard any previous connection ids and allow the client
// to connect to a new cluster.
// Careful, this is slightly less safe than the previous behavior.
if (m_connections.size() == 0) {
m_clusterInstanceId = null;
}
if (m_clusterInstanceId == null) {
long timestamp = instanceIdWhichIsTimestampAndLeaderIp[2];
int addr = (int)instanceIdWhichIsTimestampAndLeaderIp[3];
m_clusterInstanceId = new Object[] { timestamp, addr };
} else {
if (!(((Long)m_clusterInstanceId[0]).longValue() == instanceIdWhichIsTimestampAndLeaderIp[2]) ||
!(((Integer)m_clusterInstanceId[1]).longValue() == instanceIdWhichIsTimestampAndLeaderIp[3])) {
// clean up the pre-registered voltnetwork connection/channel
c.unregister();
throw new IOException(
"Cluster instance id mismatch. Current is " + m_clusterInstanceId[0] + "," + m_clusterInstanceId[1] +
" and server's was " + instanceIdWhichIsTimestampAndLeaderIp[2] + "," + instanceIdWhichIsTimestampAndLeaderIp[3]);
}
}
m_buildString = (String)socketChannelAndInstanceIdAndBuildString[2];
m_connections.add(cxn);
}
if (m_useClientAffinity) {
synchronized (this) {
m_hostIdToConnection.put(hostId, cxn);
}
if (m_subscribedConnection == null) {
subscribeToNewNode();
}
}
}
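// Illustrative usage sketch (hypothetical host and credentials; applications
// normally reach this through ClientImpl / ClientFactory rather than calling
// the Distributer directly):
//
//   Distributer dist = new Distributer();
//   dist.createConnection("localhost", "username", "password", 21212,
//                         ClientAuthScheme.HASH_SHA256);
//   // 21212 is VoltDB's default client port; repeat per cluster node.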
/*
* Subscribe to receive async updates on a new node connection. This will set m_subscribed
* connection to the provided connection.
*
* If we are subscribing to a new connection on node failure this will also fetch the topology post node
 * failure. If the cluster hasn't finished resolving the failure it is fine, we will get the new topo through
 * the subscription anyway.
 */
private void subscribeToNewNode() {
//Technically necessary to synchronize for safe publication of this store
NodeConnection cxn = null;
synchronized (Distributer.this) {
m_subscribedConnection = null;
if (!m_connections.isEmpty()) {
cxn = m_connections.get(new Random().nextInt(m_connections.size()));
m_subscriptionRequestPending = true;
m_subscribedConnection = cxn;
} else {
return;
}
}
try {
//Subscribe to topology updates before retrieving the current topo
//so there isn't potential for lost updates
ProcedureInvocation spi = new ProcedureInvocation(m_sysHandle.getAndDecrement(), "@Subscribe", "TOPOLOGY");
cxn.createWork(System.nanoTime(),
spi.getHandle(),
spi.getProcName(),
serializeSPI(spi),
new SubscribeCallback(),
true,
USE_DEFAULT_CLIENT_TIMEOUT);
spi = new ProcedureInvocation(m_sysHandle.getAndDecrement(), "@Statistics", "TOPO", 0);
//The handle is specific to topology updates and has special cased handling
cxn.createWork(System.nanoTime(),
spi.getHandle(),
spi.getProcName(),
serializeSPI(spi),
new TopoUpdateCallback(),
true,
USE_DEFAULT_CLIENT_TIMEOUT);
//Don't need to retrieve procedure updates every time we do a new subscription
//since catalog changes aren't correlated with node failure the same way topo is
if (!m_fetchedCatalog) {
spi = new ProcedureInvocation(m_sysHandle.getAndDecrement(), "@SystemCatalog", "PROCEDURES");
//The handle is specific to procedure updates and has special cased handling
cxn.createWork(System.nanoTime(),
spi.getHandle(),
spi.getProcName(),
serializeSPI(spi),
new ProcUpdateCallback(),
true,
USE_DEFAULT_CLIENT_TIMEOUT);
}
//Partition key update
refreshPartitionKeys(true);
} catch (Exception e) {
e.printStackTrace();
}
}
/**
 * Queue invocation on first node connection without backpressure. If there is none without backpressure
* then return false and don't queue the invocation
* @param invocation
* @param cb
* @param ignoreBackpressure If true the invocation will be queued even if there is backpressure
* @param nowNanos Current time in nanoseconds using System.nanoTime
* @param timeoutNanos nanoseconds from nowNanos where timeout should fire
* @return True if the message was queued and false if the message was not queued due to backpressure
* @throws NoConnectionsException
*/
boolean queue(
ProcedureInvocation invocation,
ProcedureCallback cb,
final boolean ignoreBackpressure, final long nowNanos, final long timeoutNanos)
throws NoConnectionsException {
assert(invocation != null);
assert(cb != null);
NodeConnection cxn = null;
boolean backpressure = true;
/*
* Synchronization is necessary to ensure that m_connections is not modified
* as well as to ensure that backpressure is reported correctly
*/
synchronized (this) {
final int totalConnections = m_connections.size();
if (totalConnections == 0) {
throw new NoConnectionsException("No connections.");
}
/*
* Check if the master for the partition is known. No back pressure check to ensure correct
* routing, but backpressure will be managed anyways. This is where we guess partition based on client
* affinity and known topology (hashinator initialized).
*/
if (m_useClientAffinity && (m_hashinator != null)) {
final Procedure procedureInfo = m_procedureInfo.get(invocation.getProcName());
Integer hashedPartition = -1;
if (procedureInfo != null) {
hashedPartition = Constants.MP_INIT_PID;
if (( ! procedureInfo.multiPart) &&
// User may have passed too few parameters to allow dispatching.
// Avoid an indexing error here to fall through to the proper ProcCallException.
(procedureInfo.partitionParameter < invocation.getPassedParamCount())) {
hashedPartition = m_hashinator.getHashedPartitionForParameter(
procedureInfo.partitionParameterType,
invocation.getPartitionParamValue(procedureInfo.partitionParameter));
}
/*
* If the procedure is read only and single part and the user wants it, load balance across replicas
* This is probably slower for SAFE consistency.
*/
if (!procedureInfo.multiPart && procedureInfo.readOnly && m_sendReadsToReplicasBytDefaultIfCAEnabled) {
NodeConnection partitionReplicas[] = m_partitionReplicas.get(hashedPartition);
if (partitionReplicas != null && partitionReplicas.length > 0) {
cxn = partitionReplicas[ThreadLocalRandom.current().nextInt(partitionReplicas.length)];
if (cxn.hadBackPressure()) {
//See if there is one without backpressure, make sure it's still connected
for (NodeConnection nc : partitionReplicas) {
if (!nc.hadBackPressure() && nc.m_isConnected) {
cxn = nc;
break;
}
}
}
if (!cxn.hadBackPressure() || ignoreBackpressure) {
backpressure = false;
}
}
} else {
/*
* For writes or SAFE reads, this is the best way to go
*/
cxn = m_partitionMasters.get(hashedPartition);
if (cxn != null && !cxn.hadBackPressure() || ignoreBackpressure) {
backpressure = false;
}
}
}
if (cxn != null && !cxn.m_isConnected) {
// Would be nice to log something here
// Client affinity picked a connection that was actually disconnected. Reset to null
// and let the round-robin choice pick a connection
cxn = null;
}
ClientAffinityStats stats = m_clientAffinityStats.get(hashedPartition);
if (stats == null) {
stats = new ClientAffinityStats(hashedPartition, 0, 0, 0, 0);
m_clientAffinityStats.put(hashedPartition, stats);
}
if (cxn != null) {
if (procedureInfo != null && procedureInfo.readOnly) {
stats.addAffinityRead();
}
else {
stats.addAffinityWrite();
}
}
// account these here because we lose the partition ID and procedure info once we
// bust out of this scope.
else {
if (procedureInfo != null && procedureInfo.readOnly) {
stats.addRrRead();
}
else {
stats.addRrWrite();
}
}
}
if (cxn == null) {
for (int i=0; i < totalConnections; ++i) {
cxn = m_connections.get(Math.abs(++m_nextConnection % totalConnections));
if (!cxn.hadBackPressure() || ignoreBackpressure) {
// serialize and queue the invocation
backpressure = false;
break;
}
}
}
if (backpressure) {
cxn = null;
for (ClientStatusListenerExt s : m_listeners) {
s.backpressure(true);
}
}
}
/*
* Do the heavy weight serialization outside the synchronized block.
* createWork synchronizes on an individual connection which allows for more concurrency
*/
if (cxn != null) {
ByteBuffer buf = null;
try {
buf = serializeSPI(invocation);
} catch (Exception e) {
Throwables.propagate(e);
}
cxn.createWork(nowNanos, invocation.getHandle(), invocation.getProcName(), buf, cb, ignoreBackpressure, timeoutNanos);
}
if (m_topologyChangeAware) {
createConnectionsUponTopologyChange();
}
return !backpressure;
}
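// Routing decision summary for queue(), derived from the logic above:
//   1. With client affinity on and a hashinator initialized, hash the
//      partition parameter; single-partition reads may go to a random
//      replica (if enabled), writes and SAFE reads go to the partition master.
//   2. If affinity yields no usable (connected, unbackpressured) connection,
//      fall back to round-robin over m_connections.
//   3. If every candidate reports backpressure and ignoreBackpressure is
//      false, nothing is queued, listeners see backpressure(true), and the
//      method returns false.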
/**
* Shutdown the VoltNetwork allowing the Ports to close and free resources
* like memory pools
* @throws InterruptedException
*/
final void shutdown() throws InterruptedException {
// stop the old proc call reaper
m_timeoutReaperHandle.cancel(false);
m_ex.shutdown();
if (CoreUtils.isJunitTest()) {
m_ex.awaitTermination(1, TimeUnit.SECONDS);
} else {
m_ex.awaitTermination(365, TimeUnit.DAYS);
}
m_network.shutdown();
if (m_cipherService != null) {
m_cipherService.shutdown();
m_cipherService = null;
}
}
void uncaughtException(ProcedureCallback cb, ClientResponse r, Throwable t) {
boolean handledByClient = false;
for (ClientStatusListenerExt csl : m_listeners) {
if (csl instanceof ClientImpl.InternalClientStatusListener) {
continue;
}
try {
csl.uncaughtException(cb, r, t);
handledByClient = true;
} catch (Exception e) {
e.printStackTrace();
}
}
if (!handledByClient) {
t.printStackTrace();
}
}
synchronized void addClientStatusListener(ClientStatusListenerExt listener) {
if (!m_listeners.contains(listener)) {
m_listeners.add(listener);
}
}
synchronized boolean removeClientStatusListener(ClientStatusListenerExt listener) {
return m_listeners.remove(listener);
}
ClientStatsContext createStatsContext() {
return new ClientStatsContext(this, getStatsSnapshot(), getIOStatsSnapshot(),
getAffinityStatsSnapshot());
}
Map<Long, Map<String, ClientStats>> getStatsSnapshot() {
Map<Long, Map<String, ClientStats>> retval =
new TreeMap<>();
for (NodeConnection conn : m_connections) {
Map<String, ClientStats> connMap = new TreeMap<>();
for (Entry<String, ClientStats> e : conn.m_stats.entrySet()) {
connMap.put(e.getKey(), (ClientStats) e.getValue().clone());
}
retval.put(conn.connectionId(), connMap);
}
return retval;
}
Map<Long, ClientIOStats> getIOStatsSnapshot() {
Map<Long, ClientIOStats> retval = new TreeMap<>();
Map<Long, Pair<String, long[]>> ioStats;
try {
ioStats = m_network.getIOStats(false, ImmutableList.<IOStatsIntf>of());
} catch (Exception e) {
return null;
}
for (NodeConnection conn : m_connections) {
Pair<String, long[]> perConnIOStats = ioStats.get(conn.connectionId());
if (perConnIOStats == null) {
continue;
}
long read = perConnIOStats.getSecond()[0];
long write = perConnIOStats.getSecond()[2];
ClientIOStats cios = new ClientIOStats(conn.connectionId(), read, write);
retval.put(conn.connectionId(), cios);
}
return retval;
}
Map<Integer, ClientAffinityStats> getAffinityStatsSnapshot()
{
Map<Integer, ClientAffinityStats> retval = new HashMap<>();
// these get modified under this lock in queue()
synchronized(this) {
for (Entry<Integer, ClientAffinityStats> e : m_clientAffinityStats.entrySet()) {
retval.put(e.getKey(), (ClientAffinityStats)e.getValue().clone());
}
}
return retval;
}
public synchronized Object[] getInstanceId() {
return m_clusterInstanceId;
}
/**
* Not exposed to users for the moment.
*/
public synchronized void resetInstanceId() {
m_clusterInstanceId = null;
}
public String getBuildString() {
return m_buildString;
}
public List<Long> getThreadIds() {
return m_network.getThreadIds();
}
public List<InetSocketAddress> getConnectedHostList() {
ArrayList<InetSocketAddress> addressList = new ArrayList<>();
for (NodeConnection conn : m_connections) {
addressList.add(conn.getSocketAddress());
}
return Collections.unmodifiableList(addressList);
}
public Map<String, Integer> getConnectedHostIPAndPort() {
Map<String, Integer> connectedHostIPAndPortMap = Maps.newHashMap();
for (NodeConnection conn : m_connections) {
connectedHostIPAndPortMap.put(conn.getSocketAddress().getAddress().getHostAddress(), (conn.getSocketAddress().getPort()));
}
return Collections.unmodifiableMap(connectedHostIPAndPortMap);
}
private void updateAffinityTopology(VoltTable tables[]) {
//First table contains the description of partition ids master/slave relationships
VoltTable vt = tables[0];
//In the future, let TOPO return cooked bytes when cooked, and use the correct recipe
boolean cooked = false;
if (tables.length == 1) {
//Just in case the new client connects to the old version of Volt that only returns 1 topology table
// We're going to get the MPI back in this table, so subtract it out from the number of partitions.
int numPartitions = vt.getRowCount() - 1;
m_hashinator = new HashinatorLite(numPartitions); // legacy only
} else {
//Second table contains the hash function
boolean advanced = tables[1].advanceRow();
if (!advanced) {
System.err.println("Topology description received from Volt was incomplete; " +
"performance will be lower because transactions can't be routed at this client");
return;
}
m_hashinator = new HashinatorLite(
HashinatorLiteType.valueOf(tables[1].getString("HASHTYPE")),
tables[1].getVarbinary("HASHCONFIG"),
cooked);
}
m_partitionMasters.clear();
m_partitionReplicas.clear();
// The MPI's partition ID is 16383 (MpInitiator.MP_INIT_PID), so we shouldn't inadvertently
// hash to it. Go ahead and include it in the maps, we can use it at some point to
// route MP transactions directly to the MPI node.
Set<Integer> unconnected = new HashSet<>();
while (vt.advanceRow()) {
Integer partition = (int)vt.getLong("Partition");
ArrayList<NodeConnection> connections = new ArrayList<>();
for (String site : vt.getString("Sites").split(",")) {
site = site.trim();
Integer hostId = Integer.valueOf(site.split(":")[0]);
if (m_hostIdToConnection.containsKey(hostId)) {
connections.add(m_hostIdToConnection.get(hostId));
} else {
unconnected.add(hostId);
}
}
m_partitionReplicas.put(partition, connections.toArray(new NodeConnection[0]));
Integer leaderHostId = Integer.valueOf(vt.getString("Leader").split(":")[0]);
if (m_hostIdToConnection.containsKey(leaderHostId)) {
m_partitionMasters.put(partition, m_hostIdToConnection.get(leaderHostId));
}
}
if (m_topologyChangeAware) {
m_unconnectedHosts.set(ImmutableSet.copyOf(unconnected));
}
refreshPartitionKeys(true);
}
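// Shape of the TOPO tables parsed above (reconstructed from the accessors;
// the row values are illustrative):
//   tables[0]: Partition (e.g. 0) | Sites "hostId:siteId,..." (e.g. "0:2,1:2")
//              | Leader "hostId:siteId" (e.g. "0:2")
//   tables[1]: HASHTYPE (e.g. "ELASTIC") | HASHCONFIG (varbinary), present on
//              servers new enough to describe their hash function.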
private void updateProcedurePartitioning(VoltTable vt) {
m_procedureInfo.clear();
while (vt.advanceRow()) {
try {
//Data embedded in JSON object in remarks column
String jsString = vt.getString(6);
String procedureName = vt.getString(2);
JSONObject jsObj = new JSONObject(jsString);
boolean readOnly = jsObj.getBoolean(Constants.JSON_READ_ONLY);
if (jsObj.getBoolean(Constants.JSON_SINGLE_PARTITION)) {
int partitionParameter = jsObj.getInt(Constants.JSON_PARTITION_PARAMETER);
int partitionParameterType =
jsObj.getInt(Constants.JSON_PARTITION_PARAMETER_TYPE);
m_procedureInfo.put(procedureName,
new Procedure(false,readOnly, partitionParameter, partitionParameterType));
} else {
// Multi Part procedure JSON descriptors omit the partitionParameter
m_procedureInfo.put(procedureName, new Procedure(true, readOnly, Procedure.PARAMETER_NONE,
Procedure.PARAMETER_NONE));
}
} catch (JSONException e) {
e.printStackTrace();
}
}
}
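// Rough shape of the remarks-column JSON parsed above (illustrative; the
// exact key strings live in org.voltdb.common.Constants):
//   {"readOnly":false,"singlePartition":true,
//    "partitionParameter":0,"partitionParameterType":6}
// Multi-partition procedure descriptors omit the partitionParameter fields.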
private void updatePartitioning(VoltTable vt) {
List<Integer> keySet = new ArrayList<>();
while (vt.advanceRow()) {
//check for mock unit test
if (vt.getColumnCount() == 2) {
Integer key = (int)(vt.getLong("PARTITION_KEY"));
keySet.add(key);
}
}
m_partitionKeys.set(ImmutableSet.copyOf(keySet));
}
/**
 * Return whether the hashinator is initialized. This is useful only for non-standard clients.
 * This will only ever return true if client affinity is turned on.
 *
 * @return true if the hashinator is initialized
*/
public boolean isHashinatorInitialized() {
return (m_hashinator != null);
}
/**
* This is used by clients such as CSVLoader which puts processing into buckets.
*
 * @param typeValue the VoltType of the partition parameter, as a byte
 * @param value the representative value
 * @return the partition id, or -1 if the hashinator is not initialized
*/
public long getPartitionForParameter(byte typeValue, Object value) {
if (m_hashinator == null) {
return -1;
}
return m_hashinator.getHashedPartitionForParameter(typeValue, value);
}
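// Illustrative use, e.g. bucketing rows by partition the way CSVLoader does
// (the VoltType constant below is an assumption for the example):
//   long p = dist.getPartitionForParameter(VoltType.BIGINT.getValue(), 12345L);
//   if (p == -1) { /* hashinator not initialized yet */ }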
public HashinatorLiteType getHashinatorType() {
if (m_hashinator == null) {
return HashinatorLiteType.LEGACY;
}
return m_hashinator.getConfigurationType();
}
private ByteBuffer serializeSPI(ProcedureInvocation pi) throws IOException {
ByteBuffer buf = ByteBuffer.allocate(pi.getSerializedSize() + 4);
buf.putInt(buf.capacity() - 4);
pi.flattenToBuffer(buf);
buf.flip();
return buf;
}
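// Framing produced by serializeSPI (and by sendPing above): a 4-byte
// big-endian length prefix counting only the payload, followed by the
// flattened invocation bytes:
//   [ int32: N ][ N bytes: serialized ProcedureInvocation ]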
long getProcedureTimeoutNanos() {
return m_procedureCallTimeoutNanos;
}
ImmutableSet<Integer> getPartitionKeys() throws NoConnectionsException, IOException, ProcCallException {
refreshPartitionKeys(false);
if (m_partitionUpdateStatus.get().getStatus() != ClientResponse.SUCCESS) {
throw new ProcCallException(m_partitionUpdateStatus.get(), null, null);
}
return m_partitionKeys.get();
}
/**
* Set up partitions.
* @param topologyUpdate if true, it is called from topology update
* @throws ProcCallException on any VoltDB specific failure.
* @throws NoConnectionsException if this {@link Client} instance is not connected to any servers.
* @throws IOException if there is a Java network or connection problem.
*/
private void refreshPartitionKeys(boolean topologyUpdate) {
long interval = System.currentTimeMillis() - m_lastPartitionKeyFetched.get();
if (!m_useClientAffinity && interval < PARTITION_KEYS_INFO_REFRESH_FREQUENCY) {
return;
}
try {
ProcedureInvocation invocation = new ProcedureInvocation(m_sysHandle.getAndDecrement(), "@GetPartitionKeys", "INTEGER");
CountDownLatch latch = null;
if (!topologyUpdate) {
latch = new CountDownLatch(1);
}
PartitionUpdateCallback cb = new PartitionUpdateCallback(latch);
if (!queue(invocation, cb, true, System.nanoTime(), USE_DEFAULT_CLIENT_TIMEOUT)) {
m_partitionUpdateStatus.set(new ClientResponseImpl(ClientResponseImpl.SERVER_UNAVAILABLE, new VoltTable[0],
"Fails to queue the partition update query, please try later."));
}
if (!topologyUpdate) {
latch.await();
}
m_lastPartitionKeyFetched.set(System.currentTimeMillis());
} catch (InterruptedException | IOException e) {
m_partitionUpdateStatus.set(new ClientResponseImpl(ClientResponseImpl.SERVER_UNAVAILABLE, new VoltTable[0],
"Fails to fetch partition keys from server:" + e.getMessage()));
}
}
void setTopologyChangeAware(boolean topoAware) {
m_topologyChangeAware = topoAware;
}
void createConnectionsUponTopologyChange() {
if(!m_topologyChangeAware || m_createConnectionUponTopoChangeInProgress.get()) {
return;
}
m_createConnectionUponTopoChangeInProgress.set(true);
ImmutableSet<Integer> unconnected = m_unconnectedHosts.get();
if (unconnected != null && !unconnected.isEmpty()) {
m_unconnectedHosts.compareAndSet(unconnected, ImmutableSet.copyOf(new HashSet<Integer>()));
for (Integer host : unconnected) {
if (!isHostConnected(host)) {
for (ClientStatusListenerExt csl : m_listeners) {
if (csl instanceof ClientImpl.InternalClientStatusListener) {
((ClientImpl.InternalClientStatusListener)csl).createConnectionsUponTopologyChange();
break;
}
}
}
}
}
m_createConnectionUponTopoChangeInProgress.set(false);
}
void setCreateConnectionsUponTopologyChangeComplete() throws NoConnectionsException {
m_createConnectionUponTopoChangeInProgress.set(false);
ProcedureInvocation spi = new ProcedureInvocation(m_sysHandle.getAndDecrement(), "@Statistics", "TOPO", 0);
queue(spi, new TopoUpdateCallback(), true, System.nanoTime(), USE_DEFAULT_CLIENT_TIMEOUT);
}
boolean isHostConnected(Integer hostId) {
return m_hostIdToConnection.containsKey(hostId);
}
}