// com.datastax.driver.core.Metrics (Maven / Gradle / Ivy)
//
// Go to download
/*
 * Copyright DataStax, Inc.
 *
 * This software can be used solely with DataStax Enterprise. Please consult the license at
 * http://www.datastax.com/terms/datastax-dse-driver-license-terms
 */
package com.datastax.driver.core;

import com.codahale.metrics.Counter;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.JmxReporter;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.datastax.driver.core.policies.SpeculativeExecutionPolicy;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;

/**
 * Metrics exposed by the driver.
 *
 * <p>The metrics exposed by this class use the Codahale (Dropwizard) Metrics library and you
 * should refer to its documentation for details on how to handle the exposed metric objects.
 *
 * <p>By default, metrics are exposed through JMX, which is very useful for development and
 * browsing, but for production environments you may want to have a look at the reporters provided
 * by the Metrics library which could be more efficient/adapted.
 */
public class Metrics {

  // NOTE: the gauges below are created by field initializers, i.e. before the constructor assigns
  // `manager`. This is safe because each gauge only dereferences `manager` inside getValue(),
  // which is invoked lazily by whoever reads the metric.
  private final Cluster.Manager manager;
  // Must be declared before every field that registers a metric (including `errors`).
  private final MetricRegistry registry = new MetricRegistry();
  // Null when JMX reporting is disabled in the metrics options.
  private final JmxReporter jmxReporter;
  private final Errors errors = new Errors();

  private final Timer requests = registry.timer("requests");
  private final Meter bytesSent = registry.meter("bytes-sent");
  private final Meter bytesReceived = registry.meter("bytes-received");

  /** Number of hosts present in the cluster metadata. */
  private final Gauge<Integer> knownHosts =
      registry.register(
          "known-hosts",
          new Gauge<Integer>() {
            @Override
            public Integer getValue() {
              return manager.metadata.allHosts().size();
            }
          });

  /** Number of distinct pool keys (hosts) across the pools of all sessions. */
  private final Gauge<Integer> connectedTo =
      registry.register(
          "connected-to",
          new Gauge<Integer>() {
            @Override
            public Integer getValue() {
              // A set de-duplicates hosts that several sessions hold a pool for.
              Set<Object> connectedHosts = new HashSet<Object>();
              for (SessionManager session : manager.sessions) {
                connectedHosts.addAll(session.pools.keySet());
              }
              return connectedHosts.size();
            }
          });

  /** Opened connections over all pools, plus one if the control connection is open. */
  private final Gauge<Integer> openConnections =
      registry.register(
          "open-connections",
          new Gauge<Integer>() {
            @Override
            public Integer getValue() {
              int value = manager.controlConnection.isOpen() ? 1 : 0;
              for (SessionManager session : manager.sessions) {
                for (HostConnectionPool pool : session.pools.values()) {
                  value += pool.opened();
                }
              }
              return value;
            }
          });

  /** Trashed connections summed over all pools of all sessions. */
  private final Gauge<Integer> trashedConnections =
      registry.register(
          "trashed-connections",
          new Gauge<Integer>() {
            @Override
            public Integer getValue() {
              int value = 0;
              for (SessionManager session : manager.sessions) {
                for (HostConnectionPool pool : session.pools.values()) {
                  value += pool.trashed();
                }
              }
              return value;
            }
          });

  /** In-flight request count summed over all pools of all sessions. */
  private final Gauge<Integer> inFlightRequests =
      registry.register(
          "inflight-requests",
          new Gauge<Integer>() {
            @Override
            public Integer getValue() {
              int value = 0;
              for (SessionManager session : manager.sessions) {
                for (HostConnectionPool pool : session.pools.values()) {
                  value += pool.totalInFlight.get();
                }
              }
              return value;
            }
          });

  /** Pending borrow count summed over all pools of all sessions. */
  private final Gauge<Integer> requestQueueDepth =
      registry.register(
          "request-queue-depth",
          new Gauge<Integer>() {
            @Override
            public Integer getValue() {
              int value = 0;
              for (SessionManager session : manager.sessions) {
                for (HostConnectionPool pool : session.pools.values()) {
                  value += pool.pendingBorrowCount.get();
                }
              }
              return value;
            }
          });

  // These gauges need the manager's queues, so they are registered in the constructor.
  private final Gauge<Integer> executorQueueDepth;
  private final Gauge<Integer> blockingExecutorQueueDepth;
  private final Gauge<Integer> reconnectionSchedulerQueueSize;
  private final Gauge<Integer> taskSchedulerQueueSize;

  /**
   * Creates the metrics for the given cluster manager, registers the executor queue gauges and,
   * if enabled in the metrics options, starts a JMX reporter in the domain {@code
   * <clusterName>-metrics}.
   *
   * @param manager the cluster manager whose state is exposed by the gauges.
   */
  Metrics(Cluster.Manager manager) {
    this.manager = manager;
    this.executorQueueDepth =
        registry.register("executor-queue-depth", buildQueueSizeGauge(manager.executorQueue));
    this.blockingExecutorQueueDepth =
        registry.register(
            "blocking-executor-queue-depth", buildQueueSizeGauge(manager.blockingExecutorQueue));
    this.reconnectionSchedulerQueueSize =
        registry.register(
            "reconnection-scheduler-task-count",
            buildQueueSizeGauge(manager.reconnectionExecutorQueue));
    this.taskSchedulerQueueSize =
        registry.register(
            "task-scheduler-task-count", buildQueueSizeGauge(manager.scheduledTasksExecutorQueue));
    if (manager.configuration.getMetricsOptions().isJMXReportingEnabled()) {
      this.jmxReporter =
          JmxReporter.forRegistry(registry).inDomain(manager.clusterName + "-metrics").build();
      this.jmxReporter.start();
    } else {
      this.jmxReporter = null;
    }
  }

  /**
   * Returns the registry containing all metrics.
   *
   * <p>The metrics registry allows you to easily use the reporters that ship with Metrics or a
   * custom written one.
   *
   * <p>For instance, if {@code metrics} is {@code this} object, you could export the metrics to
   * csv files using:
   *
   * <pre>
   *     com.codahale.metrics.CsvReporter.forRegistry(metrics.getRegistry()).build(new File("measurements/")).start(1, TimeUnit.SECONDS);
   * </pre>
   *
   * <p>If you already have a {@code MetricRegistry} in your application and wish to add the
   * driver's metrics to it, the recommended approach is to use a listener:
   *
   * <pre>
   *     // Your existing registry:
   *     final com.codahale.metrics.MetricRegistry myRegistry = ...
   *
   *     cluster.getMetrics().getRegistry().addListener(new com.codahale.metrics.MetricRegistryListener() {
   *         &#64;Override
   *         public void onGaugeAdded(String name, Gauge&lt;?&gt; gauge) {
   *             if (myRegistry.getNames().contains(name)) {
   *                 // name is already taken, maybe prefix with a namespace
   *                 ...
   *             } else {
   *                 myRegistry.register(name, gauge);
   *             }
   *         }
   *
   *         ... // Implement other methods in a similar fashion
   *     });
   * </pre>
   *
   * <p>Since reporting is handled by your registry, you'll probably also want to disable JMX
   * reporting with {@link Cluster.Builder#withoutJMXReporting()}.
   *
   * @return the registry containing all metrics.
   */
  public MetricRegistry getRegistry() {
    return registry;
  }

  /**
   * Returns metrics on the user requests performed on the Cluster.
   *
   * <p>This metric exposes
   *
   * <ul>
   *   <li>the total number of requests.
   *   <li>the requests rate (in requests per second), including 1, 5 and 15 minute rates.
   *   <li>the mean, min and max latencies, as well as latency at a given percentile.
   * </ul>
   *
   * @return a {@code Timer} metric object exposing the rate and latency for user requests.
   */
  public Timer getRequestsTimer() {
    return requests;
  }

  /**
   * Returns an object grouping metrics related to the errors encountered.
   *
   * @return an object grouping metrics related to the errors encountered.
   */
  public Errors getErrorMetrics() {
    return errors;
  }

  /**
   * Returns the number of Cassandra hosts currently known by the driver (that is whether they are
   * currently considered up or down).
   *
   * @return the number of Cassandra hosts currently known by the driver.
   */
  public Gauge<Integer> getKnownHosts() {
    return knownHosts;
  }

  /**
   * Returns the number of Cassandra hosts the driver is currently connected to (that is have at
   * least one connection opened to).
   *
   * @return the number of Cassandra hosts the driver is currently connected to.
   */
  public Gauge<Integer> getConnectedToHosts() {
    return connectedTo;
  }

  /**
   * Returns the total number of currently opened connections to Cassandra hosts.
   *
   * @return The total number of currently opened connections to Cassandra hosts.
   */
  public Gauge<Integer> getOpenConnections() {
    return openConnections;
  }

  /**
   * Returns the total number of currently "trashed" connections to Cassandra hosts.
   *
   * <p>When the load to a host decreases, the driver will reclaim some connections in order to
   * save resources. No requests are sent to these connections anymore, but they are kept open for
   * an additional amount of time ({@link PoolingOptions#getIdleTimeoutSeconds()}), in case the
   * load goes up again. This metric counts connections in that state.
   *
   * @return The total number of currently trashed connections to Cassandra hosts.
   */
  public Gauge<Integer> getTrashedConnections() {
    return trashedConnections;
  }

  /**
   * Returns the total number of in flight requests to Cassandra hosts.
   *
   * @return The total number of in flight requests to Cassandra hosts.
   */
  public Gauge<Integer> getInFlightRequests() {
    return inFlightRequests;
  }

  /**
   * Returns the total number of enqueued requests on all Cassandra hosts.
   *
   * @return The total number of enqueued requests on all Cassandra hosts.
   * @see Session.State#getRequestQueueDepth(Host)
   */
  public Gauge<Integer> getRequestQueueDepth() {
    return requestQueueDepth;
  }

  /**
   * Returns the number of queued up tasks in the {@link ThreadingOptions#createExecutor(String)
   * main internal executor}.
   *
   * <p>If the executor's task queue is not accessible (which happens when the executor is not an
   * instance of {@link ThreadPoolExecutor}), then this gauge returns -1.
   *
   * @return The number of queued up tasks in the main internal executor, or -1, if that number is
   *     unknown.
   */
  public Gauge<Integer> getExecutorQueueDepth() {
    return executorQueueDepth;
  }

  /**
   * Returns the number of queued up tasks in the {@link
   * ThreadingOptions#createBlockingExecutor(String) blocking executor}.
   *
   * <p>If the executor's task queue is not accessible (which happens when the executor is not an
   * instance of {@link ThreadPoolExecutor}), then this gauge returns -1.
   *
   * @return The number of queued up tasks in the blocking executor, or -1, if that number is
   *     unknown.
   */
  public Gauge<Integer> getBlockingExecutorQueueDepth() {
    return blockingExecutorQueueDepth;
  }

  /**
   * Returns the number of queued up tasks in the {@link
   * ThreadingOptions#createReconnectionExecutor(String) reconnection executor}.
   *
   * <p>A queue size &gt; 0 does not necessarily indicate a backlog as some tasks may not have been
   * scheduled to execute yet.
   *
   * <p>If the executor's task queue is not accessible (which happens when the executor is not an
   * instance of {@link ThreadPoolExecutor}), then this gauge returns -1.
   *
   * @return The size of the work queue for the reconnection executor, or -1, if that number is
   *     unknown.
   */
  public Gauge<Integer> getReconnectionSchedulerQueueSize() {
    return reconnectionSchedulerQueueSize;
  }

  /**
   * Returns the number of queued up tasks in the {@link
   * ThreadingOptions#createScheduledTasksExecutor(String) scheduled tasks executor}.
   *
   * <p>A queue size &gt; 0 does not necessarily indicate a backlog as some tasks may not have been
   * scheduled to execute yet.
   *
   * <p>If the executor's task queue is not accessible (which happens when the executor is not an
   * instance of {@link ThreadPoolExecutor}), then this gauge returns -1.
   *
   * @return The size of the work queue for the scheduled tasks executor, or -1, if that number is
   *     unknown.
   */
  public Gauge<Integer> getTaskSchedulerQueueSize() {
    return taskSchedulerQueueSize;
  }

  /**
   * Returns the number of bytes sent so far.
   *
   * <p>Note that this measures unencrypted traffic, even if SSL is enabled (the probe is inserted
   * before SSL handlers in the Netty pipeline). In practice, SSL overhead should be negligible
   * after the initial handshake.
   *
   * @return the number of bytes sent so far.
   */
  public Meter getBytesSent() {
    return bytesSent;
  }

  /**
   * Returns the number of bytes received so far.
   *
   * <p>Note that this measures unencrypted traffic, even if SSL is enabled (the probe is inserted
   * before SSL handlers in the Netty pipeline). In practice, SSL overhead should be negligible
   * after the initial handshake.
   *
   * @return the number of bytes received so far.
   */
  public Meter getBytesReceived() {
    return bytesReceived;
  }

  /** Stops the JMX reporter, if one was started by the constructor. */
  void shutdown() {
    if (jmxReporter != null) {
      jmxReporter.stop();
    }
  }

  /**
   * Builds a gauge reporting the current size of the given queue.
   *
   * @param queue the queue to observe; may be {@code null} when the queue is not accessible.
   * @return a gauge returning {@code queue.size()}, or a gauge that always returns -1 when {@code
   *     queue} is {@code null}.
   */
  private static Gauge<Integer> buildQueueSizeGauge(final BlockingQueue<?> queue) {
    if (queue != null) {
      return new Gauge<Integer>() {
        @Override
        public Integer getValue() {
          return queue.size();
        }
      };
    } else {
      // -1 is the documented sentinel for "queue size unknown".
      return new Gauge<Integer>() {
        @Override
        public Integer getValue() {
          return -1;
        }
      };
    }
  }

  /**
   * Metrics on errors encountered.
   *
   * <p>This is deliberately a (non-static) inner class: its counters are registered in the
   * enclosing instance's registry.
   */
  public class Errors {

    private final Counter connectionErrors = registry.counter("connection-errors");
    private final Counter authenticationErrors = registry.counter("authentication-errors");

    private final Counter writeTimeouts = registry.counter("write-timeouts");
    private final Counter readTimeouts = registry.counter("read-timeouts");
    private final Counter unavailables = registry.counter("unavailables");
    private final Counter clientTimeouts = registry.counter("client-timeouts");

    private final Counter otherErrors = registry.counter("other-errors");

    private final Counter retries = registry.counter("retries");
    private final Counter retriesOnWriteTimeout = registry.counter("retries-on-write-timeout");
    private final Counter retriesOnReadTimeout = registry.counter("retries-on-read-timeout");
    private final Counter retriesOnUnavailable = registry.counter("retries-on-unavailable");
    private final Counter retriesOnClientTimeout = registry.counter("retries-on-client-timeout");
    private final Counter retriesOnConnectionError =
        registry.counter("retries-on-connection-error");
    private final Counter retriesOnOtherErrors = registry.counter("retries-on-other-errors");

    private final Counter ignores = registry.counter("ignores");
    private final Counter ignoresOnWriteTimeout = registry.counter("ignores-on-write-timeout");
    private final Counter ignoresOnReadTimeout = registry.counter("ignores-on-read-timeout");
    private final Counter ignoresOnUnavailable = registry.counter("ignores-on-unavailable");
    private final Counter ignoresOnClientTimeout = registry.counter("ignores-on-client-timeout");
    private final Counter ignoresOnConnectionError =
        registry.counter("ignores-on-connection-error");
    private final Counter ignoresOnOtherErrors = registry.counter("ignores-on-other-errors");

    private final Counter speculativeExecutions = registry.counter("speculative-executions");

    /**
     * Returns the number of errors while connecting to Cassandra nodes.
     *
     * <p>This represents the number of times that a request to a Cassandra node has failed due to
     * a connection problem. This thus also corresponds to how often the driver had to pick a
     * fallback host for a request.
     *
     * <p>You can expect a few connection errors when a Cassandra node fails (or is stopped), but
     * if that number grows continuously you likely have a problem.
     *
     * @return the number of errors while connecting to Cassandra nodes.
     */
    public Counter getConnectionErrors() {
      return connectionErrors;
    }

    /**
     * Returns the number of authentication errors while connecting to Cassandra nodes.
     *
     * @return the number of errors.
     */
    public Counter getAuthenticationErrors() {
      return authenticationErrors;
    }

    /**
     * Returns the number of write requests that returned a timeout (independently of the final
     * decision taken by the {@link com.datastax.driver.core.policies.RetryPolicy}).
     *
     * @return the number of write timeout.
     */
    public Counter getWriteTimeouts() {
      return writeTimeouts;
    }

    /**
     * Returns the number of read requests that returned a timeout (independently of the final
     * decision taken by the {@link com.datastax.driver.core.policies.RetryPolicy}).
     *
     * @return the number of read timeout.
     */
    public Counter getReadTimeouts() {
      return readTimeouts;
    }

    /**
     * Returns the number of requests that returned an unavailable exception (independently of the
     * final decision taken by the {@link com.datastax.driver.core.policies.RetryPolicy}).
     *
     * @return the number of unavailable exceptions.
     */
    public Counter getUnavailables() {
      return unavailables;
    }

    /**
     * Returns the number of requests that timed out before the driver received a response.
     *
     * @return the number of client timeouts.
     */
    public Counter getClientTimeouts() {
      return clientTimeouts;
    }

    /**
     * Returns the number of requests that returned errors not accounted for by another metric. This
     * includes all types of invalid requests.
     *
     * @return the number of requests errors not accounted by another metric.
     */
    public Counter getOthers() {
      return otherErrors;
    }

    /**
     * Returns the number of times a request was retried due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}.
     *
     * @return the number of times a request was retried due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}.
     */
    public Counter getRetries() {
      return retries;
    }

    /**
     * Returns the number of times a request was retried due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after a read timed out.
     *
     * @return the number of times a request was retried due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after a read timed out.
     */
    public Counter getRetriesOnReadTimeout() {
      return retriesOnReadTimeout;
    }

    /**
     * Returns the number of times a request was retried due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after a write timed out.
     *
     * @return the number of times a request was retried due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after a write timed out.
     */
    public Counter getRetriesOnWriteTimeout() {
      return retriesOnWriteTimeout;
    }

    /**
     * Returns the number of times a request was retried due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after an unavailable exception.
     *
     * @return the number of times a request was retried due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after an unavailable exception.
     */
    public Counter getRetriesOnUnavailable() {
      return retriesOnUnavailable;
    }

    /**
     * Returns the number of times a request was retried due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after a client timeout.
     *
     * @return the number of times a request was retried due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after a client timeout.
     */
    public Counter getRetriesOnClientTimeout() {
      return retriesOnClientTimeout;
    }

    /**
     * Returns the number of times a request was retried due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after a connection error.
     *
     * @return the number of times a request was retried due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after a connection error.
     */
    public Counter getRetriesOnConnectionError() {
      return retriesOnConnectionError;
    }

    /**
     * Returns the number of times a request was retried due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after an unexpected error.
     *
     * @return the number of times a request was retried due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after an unexpected error.
     */
    public Counter getRetriesOnOtherErrors() {
      return retriesOnOtherErrors;
    }

    /**
     * Returns the number of times a request was ignored due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, for example due to timeouts or
     * unavailability.
     *
     * @return the number of times a request was ignored due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}.
     */
    public Counter getIgnores() {
      return ignores;
    }

    /**
     * Returns the number of times a request was ignored due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after a read timed out.
     *
     * @return the number of times a request was ignored due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after a read timed out.
     */
    public Counter getIgnoresOnReadTimeout() {
      return ignoresOnReadTimeout;
    }

    /**
     * Returns the number of times a request was ignored due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after a write timed out.
     *
     * @return the number of times a request was ignored due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after a write timed out.
     */
    public Counter getIgnoresOnWriteTimeout() {
      return ignoresOnWriteTimeout;
    }

    /**
     * Returns the number of times a request was ignored due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after an unavailable exception.
     *
     * @return the number of times a request was ignored due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after an unavailable exception.
     */
    public Counter getIgnoresOnUnavailable() {
      return ignoresOnUnavailable;
    }

    /**
     * Returns the number of times a request was ignored due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after a client timeout.
     *
     * @return the number of times a request was ignored due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after a client timeout.
     */
    public Counter getIgnoresOnClientTimeout() {
      return ignoresOnClientTimeout;
    }

    /**
     * Returns the number of times a request was ignored due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after a connection error.
     *
     * @return the number of times a request was ignored due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after a connection error.
     */
    public Counter getIgnoresOnConnectionError() {
      return ignoresOnConnectionError;
    }

    /**
     * Returns the number of times a request was ignored due to the {@link
     * com.datastax.driver.core.policies.RetryPolicy}, after an unexpected error.
     *
     * @return the number of times a request was ignored due to the {@link
     *     com.datastax.driver.core.policies.RetryPolicy}, after an unexpected error.
     */
    public Counter getIgnoresOnOtherErrors() {
      return ignoresOnOtherErrors;
    }

    /**
     * Returns the number of times a speculative execution was started because a previous execution
     * did not complete within the delay specified by {@link SpeculativeExecutionPolicy}.
     *
     * @return the number of speculative executions.
     */
    public Counter getSpeculativeExecutions() {
      return speculativeExecutions;
    }
  }
}