// Source listing of com.datastax.driver.core.Metrics, as published in the scylla-driver-core
// artifact (Maven / Gradle / Ivy).
/*
* Copyright DataStax, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datastax.driver.core;
import com.codahale.metrics.Counter;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.JmxReporter;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.datastax.driver.core.policies.SpeculativeExecutionPolicy;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
/**
* Metrics exposed by the driver.
*
* The metrics exposed by this class use the Metrics
* library and you should refer its documentation for details on how to handle the exposed metric
* objects.
*
*
By default, metrics are exposed through JMX, which is very useful for development and
* browsing, but for production environments you may want to have a look at the reporters provided by the
* Metrics library which could be more efficient/adapted.
*/
public class Metrics {
private final Cluster.Manager manager;
private final MetricRegistry registry = new MetricRegistry();
private final JmxReporter jmxReporter;
private final Errors errors = new Errors();
private final Timer requests = registry.timer("requests");
private final Meter bytesSent = registry.meter("bytes-sent");
private final Meter bytesReceived = registry.meter("bytes-received");
private final Gauge knownHosts =
registry.register(
"known-hosts",
new Gauge() {
@Override
public Integer getValue() {
return manager.metadata.allHosts().size();
}
});
private final Gauge connectedTo =
registry.register(
"connected-to",
new Gauge() {
@Override
public Integer getValue() {
Set s = new HashSet();
for (SessionManager session : manager.sessions) s.addAll(session.pools.keySet());
return s.size();
}
});
private final Gauge openConnections =
registry.register(
"open-connections",
new Gauge() {
@Override
public Integer getValue() {
int value = manager.controlConnection.isOpen() ? 1 : 0;
for (SessionManager session : manager.sessions)
for (HostConnectionPool pool : session.pools.values()) value += pool.opened();
return value;
}
});
private final Gauge trashedConnections =
registry.register(
"trashed-connections",
new Gauge() {
@Override
public Integer getValue() {
int value = 0;
for (SessionManager session : manager.sessions)
for (HostConnectionPool pool : session.pools.values()) value += pool.trashed();
return value;
}
});
private final Gauge inFlightRequests =
registry.register(
"inflight-requests",
new Gauge() {
@Override
public Integer getValue() {
int value = 0;
for (SessionManager session : manager.sessions)
for (HostConnectionPool pool : session.pools.values())
value += pool.totalInFlight.get();
return value;
}
});
private final Gauge requestQueueDepth =
registry.register(
"request-queue-depth",
new Gauge() {
@Override
public Integer getValue() {
int value = 0;
for (SessionManager session : manager.sessions)
for (HostConnectionPool pool : session.pools.values())
value += pool.pendingBorrowCount.get();
return value;
}
});
private final Gauge executorQueueDepth;
private final Gauge blockingExecutorQueueDepth;
private final Gauge reconnectionSchedulerQueueSize;
private final Gauge taskSchedulerQueueSize;
Metrics(Cluster.Manager manager) {
this.manager = manager;
this.executorQueueDepth =
registry.register("executor-queue-depth", buildQueueSizeGauge(manager.executorQueue));
this.blockingExecutorQueueDepth =
registry.register(
"blocking-executor-queue-depth", buildQueueSizeGauge(manager.blockingExecutorQueue));
this.reconnectionSchedulerQueueSize =
registry.register(
"reconnection-scheduler-task-count",
buildQueueSizeGauge(manager.reconnectionExecutorQueue));
this.taskSchedulerQueueSize =
registry.register(
"task-scheduler-task-count", buildQueueSizeGauge(manager.scheduledTasksExecutorQueue));
if (manager.configuration.getMetricsOptions().isJMXReportingEnabled()) {
this.jmxReporter =
JmxReporter.forRegistry(registry).inDomain(manager.clusterName + "-metrics").build();
this.jmxReporter.start();
} else {
this.jmxReporter = null;
}
}
/**
* Returns the registry containing all metrics.
*
* The metrics registry allows you to easily use the reporters that ship with Metrics or a custom
* written one.
*
*
For instance, if {@code metrics} is {@code this} object, you could export the metrics to csv
* files using:
*
*
* com.codahale.metrics.CsvReporter.forRegistry(metrics.getRegistry()).build(new File("measurements/")).start(1, TimeUnit.SECONDS);
*
*
* If you already have a {@code MetricRegistry} in your application and wish to add the
* driver's metrics to it, the recommended approach is to use a listener:
*
*
* // Your existing registry:
* final com.codahale.metrics.MetricRegistry myRegistry = ...
*
* cluster.getMetrics().getRegistry().addListener(new com.codahale.metrics.MetricRegistryListener() {
* @Override
* public void onGaugeAdded(String name, Gauge<?> gauge) {
* if (myRegistry.getNames().contains(name)) {
* // name is already taken, maybe prefix with a namespace
* ...
* } else {
* myRegistry.register(name, gauge);
* }
* }
*
* ... // Implement other methods in a similar fashion
* });
*
*
* Since reporting is handled by your registry, you'll probably also want to disable JMX reporting
* with {@link Cluster.Builder#withoutJMXReporting()}.
*
* @return the registry containing all metrics.
*/
public MetricRegistry getRegistry() {
return registry;
}
/**
* Returns metrics on the user requests performed on the Cluster.
*
* This metric exposes
*
*
* - the total number of requests.
*
- the requests rate (in requests per seconds), including 1, 5 and 15 minute rates.
*
- the mean, min and max latencies, as well as latency at a given percentile.
*
*
* @return a {@code Timer} metric object exposing the rate and latency for user requests.
*/
public Timer getRequestsTimer() {
return requests;
}
/**
* Returns an object grouping metrics related to the errors encountered.
*
* @return an object grouping metrics related to the errors encountered.
*/
public Errors getErrorMetrics() {
return errors;
}
/**
* Returns the number of Cassandra hosts currently known by the driver (that is whether they are
* currently considered up or down).
*
* @return the number of Cassandra hosts currently known by the driver.
*/
public Gauge getKnownHosts() {
return knownHosts;
}
/**
* Returns the number of Cassandra hosts the driver is currently connected to (that is have at
* least one connection opened to).
*
* @return the number of Cassandra hosts the driver is currently connected to.
*/
public Gauge getConnectedToHosts() {
return connectedTo;
}
/**
* Returns the total number of currently opened connections to Cassandra hosts.
*
* @return The total number of currently opened connections to Cassandra hosts.
*/
public Gauge getOpenConnections() {
return openConnections;
}
/**
* Returns the total number of currently "trashed" connections to Cassandra hosts.
*
* When the load to a host decreases, the driver will reclaim some connections in order to save
* resources. No requests are sent to these connections anymore, but they are kept open for an
* additional amount of time ({@link PoolingOptions#getIdleTimeoutSeconds()}), in case the load
* goes up again. This metric counts connections in that state.
*
* @return The total number of currently trashed connections to Cassandra hosts.
*/
public Gauge getTrashedConnections() {
return trashedConnections;
}
/**
* Returns the total number of in flight requests to Cassandra hosts.
*
* @return The total number of in flight requests to Cassandra hosts.
*/
public Gauge getInFlightRequests() {
return inFlightRequests;
}
/**
* Returns the total number of enqueued requests on all Cassandra hosts.
*
* @see Session.State#getRequestQueueDepth(Host)
* @return The total number of enqueued requests on all Cassandra hosts.
*/
public Gauge getRequestQueueDepth() {
return requestQueueDepth;
}
/**
* Returns the number of queued up tasks in the {@link ThreadingOptions#createExecutor(String)
* main internal executor}.
*
* If the executor's task queue is not accessible – which happens when the executor is not an
* instance of {@link ThreadPoolExecutor} – then this gauge returns -1.
*
* @return The number of queued up tasks in the main internal executor, or -1, if that number is
* unknown.
*/
public Gauge getExecutorQueueDepth() {
return executorQueueDepth;
}
/**
* Returns the number of queued up tasks in the {@link
* ThreadingOptions#createBlockingExecutor(String) blocking executor}.
*
* If the executor's task queue is not accessible – which happens when the executor is not an
* instance of {@link ThreadPoolExecutor} – then this gauge returns -1.
*
* @return The number of queued up tasks in the blocking executor, or -1, if that number is
* unknown.
*/
public Gauge getBlockingExecutorQueueDepth() {
return blockingExecutorQueueDepth;
}
/**
* Returns the number of queued up tasks in the {@link
* ThreadingOptions#createReconnectionExecutor(String) reconnection executor}.
*
* A queue size > 0 does not necessarily indicate a backlog as some tasks may not have been
* scheduled to execute yet.
*
*
If the executor's task queue is not accessible – which happens when the executor is not an
* instance of {@link ThreadPoolExecutor} – then this gauge returns -1.
*
* @return The size of the work queue for the reconnection executor, or -1, if that number is
* unknown.
*/
public Gauge getReconnectionSchedulerQueueSize() {
return reconnectionSchedulerQueueSize;
}
/**
* Returns the number of queued up tasks in the {@link
* ThreadingOptions#createScheduledTasksExecutor(String) scheduled tasks executor}.
*
* A queue size > 0 does not necessarily indicate a backlog as some tasks may not have been
* scheduled to execute yet.
*
*
If the executor's task queue is not accessible – which happens when the executor is not an
* instance of {@link ThreadPoolExecutor} – then this gauge returns -1.
*
* @return The size of the work queue for the scheduled tasks executor, or -1, if that number is
* unknown.
*/
public Gauge getTaskSchedulerQueueSize() {
return taskSchedulerQueueSize;
}
/**
* Returns the number of bytes sent so far.
*
* Note that this measures unencrypted traffic, even if SSL is enabled (the probe is inserted
* before SSL handlers in the Netty pipeline). In practice, SSL overhead should be negligible
* after the initial handshake.
*
* @return the number of bytes sent so far.
*/
public Meter getBytesSent() {
return bytesSent;
}
/**
* Returns the number of bytes received so far.
*
*
Note that this measures unencrypted traffic, even if SSL is enabled (the probe is inserted
* before SSL handlers in the Netty pipeline). In practice, SSL overhead should be negligible
* after the initial handshake.
*
* @return the number of bytes received so far.
*/
public Meter getBytesReceived() {
return bytesReceived;
}
void shutdown() {
if (jmxReporter != null) jmxReporter.stop();
}
private static Gauge buildQueueSizeGauge(final BlockingQueue> queue) {
if (queue != null) {
return new Gauge() {
@Override
public Integer getValue() {
return queue.size();
}
};
} else {
return new Gauge() {
@Override
public Integer getValue() {
return -1;
}
};
}
}
/** Metrics on errors encountered. */
public class Errors {
private final Counter connectionErrors = registry.counter("connection-errors");
private final Counter authenticationErrors = registry.counter("authentication-errors");
private final Counter writeTimeouts = registry.counter("write-timeouts");
private final Counter readTimeouts = registry.counter("read-timeouts");
private final Counter unavailables = registry.counter("unavailables");
private final Counter clientTimeouts = registry.counter("client-timeouts");
private final Counter otherErrors = registry.counter("other-errors");
private final Counter retries = registry.counter("retries");
private final Counter retriesOnWriteTimeout = registry.counter("retries-on-write-timeout");
private final Counter retriesOnReadTimeout = registry.counter("retries-on-read-timeout");
private final Counter retriesOnUnavailable = registry.counter("retries-on-unavailable");
private final Counter retriesOnClientTimeout = registry.counter("retries-on-client-timeout");
private final Counter retriesOnConnectionError =
registry.counter("retries-on-connection-error");
private final Counter retriesOnOtherErrors = registry.counter("retries-on-other-errors");
private final Counter ignores = registry.counter("ignores");
private final Counter ignoresOnWriteTimeout = registry.counter("ignores-on-write-timeout");
private final Counter ignoresOnReadTimeout = registry.counter("ignores-on-read-timeout");
private final Counter ignoresOnUnavailable = registry.counter("ignores-on-unavailable");
private final Counter ignoresOnClientTimeout = registry.counter("ignores-on-client-timeout");
private final Counter ignoresOnConnectionError =
registry.counter("ignores-on-connection-error");
private final Counter ignoresOnOtherErrors = registry.counter("ignores-on-other-errors");
private final Counter speculativeExecutions = registry.counter("speculative-executions");
/**
* Returns the number of errors while connecting to Cassandra nodes.
*
* This represents the number of times that a request to a Cassandra node has failed due to a
* connection problem. This thus also corresponds to how often the driver had to pick a fallback
* host for a request.
*
*
You can expect a few connection errors when a Cassandra node fails (or is stopped) ,but if
* that number grows continuously you likely have a problem.
*
* @return the number of errors while connecting to Cassandra nodes.
*/
public Counter getConnectionErrors() {
return connectionErrors;
}
/**
* Returns the number of authentication errors while connecting to Cassandra nodes.
*
* @return the number of errors.
*/
public Counter getAuthenticationErrors() {
return authenticationErrors;
}
/**
* Returns the number of write requests that returned a timeout (independently of the final
* decision taken by the {@link com.datastax.driver.core.policies.RetryPolicy}).
*
* @return the number of write timeout.
*/
public Counter getWriteTimeouts() {
return writeTimeouts;
}
/**
* Returns the number of read requests that returned a timeout (independently of the final
* decision taken by the {@link com.datastax.driver.core.policies.RetryPolicy}).
*
* @return the number of read timeout.
*/
public Counter getReadTimeouts() {
return readTimeouts;
}
/**
* Returns the number of requests that returned an unavailable exception (independently of the
* final decision taken by the {@link com.datastax.driver.core.policies.RetryPolicy}).
*
* @return the number of unavailable exceptions.
*/
public Counter getUnavailables() {
return unavailables;
}
/**
* Returns the number of requests that timed out before the driver received a response.
*
* @return the number of client timeouts.
*/
public Counter getClientTimeouts() {
return clientTimeouts;
}
/**
* Returns the number of requests that returned errors not accounted for by another metric. This
* includes all types of invalid requests.
*
* @return the number of requests errors not accounted by another metric.
*/
public Counter getOthers() {
return otherErrors;
}
/**
* Returns the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}.
*
* @return the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}.
*/
public Counter getRetries() {
return retries;
}
/**
* Returns the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a read timed out.
*
* @return the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a read timed out.
*/
public Counter getRetriesOnReadTimeout() {
return retriesOnReadTimeout;
}
/**
* Returns the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a write timed out.
*
* @return the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a write timed out.
*/
public Counter getRetriesOnWriteTimeout() {
return retriesOnWriteTimeout;
}
/**
* Returns the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after an unavailable exception.
*
* @return the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after an unavailable exception.
*/
public Counter getRetriesOnUnavailable() {
return retriesOnUnavailable;
}
/**
* Returns the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a client timeout.
*
* @return the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a client timeout.
*/
public Counter getRetriesOnClientTimeout() {
return retriesOnClientTimeout;
}
/**
* Returns the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a connection error.
*
* @return the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a connection error.
*/
public Counter getRetriesOnConnectionError() {
return retriesOnConnectionError;
}
/**
* Returns the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after an unexpected error.
*
* @return the number of times a request was retried due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after an unexpected error.
*/
public Counter getRetriesOnOtherErrors() {
return retriesOnOtherErrors;
}
/**
* Returns the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, for example due to timeouts or
* unavailability.
*
* @return the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}.
*/
public Counter getIgnores() {
return ignores;
}
/**
* Returns the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a read timed out.
*
* @return the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a read timed out.
*/
public Counter getIgnoresOnReadTimeout() {
return ignoresOnReadTimeout;
}
/**
* Returns the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a write timed out.
*
* @return the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a write timed out.
*/
public Counter getIgnoresOnWriteTimeout() {
return ignoresOnWriteTimeout;
}
/**
* Returns the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after an unavailable exception.
*
* @return the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after an unavailable exception.
*/
public Counter getIgnoresOnUnavailable() {
return ignoresOnUnavailable;
}
/**
* Returns the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a client timeout.
*
* @return the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a client timeout.
*/
public Counter getIgnoresOnClientTimeout() {
return ignoresOnClientTimeout;
}
/**
* Returns the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a connection error.
*
* @return the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after a connection error.
*/
public Counter getIgnoresOnConnectionError() {
return ignoresOnConnectionError;
}
/**
* Returns the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after an unexpected error.
*
* @return the number of times a request was ignored due to the {@link
* com.datastax.driver.core.policies.RetryPolicy}, after an unexpected error.
*/
public Counter getIgnoresOnOtherErrors() {
return ignoresOnOtherErrors;
}
/**
* Returns the number of times a speculative execution was started because a previous execution
* did not complete within the delay specified by {@link SpeculativeExecutionPolicy}.
*
* @return the number of speculative executions.
*/
public Counter getSpeculativeExecutions() {
return speculativeExecutions;
}
}
}