com.netflix.loadbalancer.ServerStats Maven / Gradle / Ivy
/*
*
* Copyright 2013 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.netflix.loadbalancer;
import com.google.common.annotations.VisibleForTesting;
import com.netflix.client.config.CommonClientConfigKey;
import com.netflix.client.config.IClientConfigKey;
import com.netflix.client.config.Property;
import com.netflix.client.config.UnboxedIntProperty;
import com.netflix.servo.annotations.DataSourceType;
import com.netflix.servo.annotations.Monitor;
import com.netflix.stats.distribution.DataDistribution;
import com.netflix.stats.distribution.DataPublisher;
import com.netflix.stats.distribution.Distribution;
import com.netflix.util.MeasuredRate;
import java.util.Date;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
/**
* Capture various stats per Server(node) in the LoadBalancer
* @author stonse
*
*/
public class ServerStats {
private static final int DEFAULT_PUBLISH_INTERVAL = 60 * 1000; // = 1 minute
private static final int DEFAULT_BUFFER_SIZE = 60 * 1000; // = 1000 requests/sec for 1 minute
private final UnboxedIntProperty connectionFailureThreshold;
private final UnboxedIntProperty circuitTrippedTimeoutFactor;
private final UnboxedIntProperty maxCircuitTrippedTimeout;
private final UnboxedIntProperty activeRequestsCountTimeout;
private static final double[] PERCENTS = makePercentValues();
private DataDistribution dataDist = new DataDistribution(1, PERCENTS); // in case
private DataPublisher publisher = null;
private final Distribution responseTimeDist = new Distribution();
int bufferSize = DEFAULT_BUFFER_SIZE;
int publishInterval = DEFAULT_PUBLISH_INTERVAL;
long failureCountSlidingWindowInterval = 1000;
private MeasuredRate serverFailureCounts = new MeasuredRate(failureCountSlidingWindowInterval);
private MeasuredRate requestCountInWindow = new MeasuredRate(300000L);
Server server;
AtomicLong totalRequests = new AtomicLong();
@VisibleForTesting
AtomicInteger successiveConnectionFailureCount = new AtomicInteger(0);
@VisibleForTesting
AtomicInteger activeRequestsCount = new AtomicInteger(0);
@VisibleForTesting
AtomicInteger openConnectionsCount = new AtomicInteger(0);
private volatile long lastConnectionFailedTimestamp;
private volatile long lastActiveRequestsCountChangeTimestamp;
private AtomicLong totalCircuitBreakerBlackOutPeriod = new AtomicLong(0);
private volatile long lastAccessedTimestamp;
private volatile long firstConnectionTimestamp = 0;
public ServerStats() {
connectionFailureThreshold = new UnboxedIntProperty(Property.of(LoadBalancerStats.CONNECTION_FAILURE_COUNT_THRESHOLD.defaultValue()));
circuitTrippedTimeoutFactor = new UnboxedIntProperty(LoadBalancerStats.CIRCUIT_TRIP_TIMEOUT_FACTOR_SECONDS.defaultValue());
maxCircuitTrippedTimeout = new UnboxedIntProperty(LoadBalancerStats.CIRCUIT_TRIP_MAX_TIMEOUT_SECONDS.defaultValue());
activeRequestsCountTimeout = new UnboxedIntProperty(LoadBalancerStats.ACTIVE_REQUESTS_COUNT_TIMEOUT.defaultValue());
}
public ServerStats(LoadBalancerStats lbStats) {
maxCircuitTrippedTimeout = lbStats.getCircuitTripMaxTimeoutSeconds();
circuitTrippedTimeoutFactor = lbStats.getCircuitTrippedTimeoutFactor();
connectionFailureThreshold = lbStats.getConnectionFailureCountThreshold();
activeRequestsCountTimeout = lbStats.getActiveRequestsCountTimeout();
}
/**
* Initializes the object, starting data collection and reporting.
*/
public void initialize(Server server) {
serverFailureCounts = new MeasuredRate(failureCountSlidingWindowInterval);
requestCountInWindow = new MeasuredRate(300000L);
if (publisher == null) {
dataDist = new DataDistribution(getBufferSize(), PERCENTS);
publisher = new DataPublisher(dataDist, getPublishIntervalMillis());
publisher.start();
}
this.server = server;
}
public void close() {
if (publisher != null)
publisher.stop();
}
public Server getServer() {
return server;
}
private int getBufferSize() {
return bufferSize;
}
private long getPublishIntervalMillis() {
return publishInterval;
}
public void setBufferSize(int bufferSize) {
this.bufferSize = bufferSize;
}
public void setPublishInterval(int publishInterval) {
this.publishInterval = publishInterval;
}
/**
* The supported percentile values.
* These correspond to the various Monitor methods defined below.
* No, this is not pretty, but that's the way it is.
*/
private static enum Percent {
TEN(10), TWENTY_FIVE(25), FIFTY(50), SEVENTY_FIVE(75), NINETY(90),
NINETY_FIVE(95), NINETY_EIGHT(98), NINETY_NINE(99), NINETY_NINE_POINT_FIVE(99.5);
private double val;
Percent(double val) {
this.val = val;
}
public double getValue() {
return val;
}
}
private static double[] makePercentValues() {
Percent[] percents = Percent.values();
double[] p = new double[percents.length];
for (int i = 0; i < percents.length; i++) {
p[i] = percents[i].getValue();
}
return p;
}
public long getFailureCountSlidingWindowInterval() {
return failureCountSlidingWindowInterval;
}
public void setFailureCountSlidingWindowInterval(
long failureCountSlidingWindowInterval) {
this.failureCountSlidingWindowInterval = failureCountSlidingWindowInterval;
}
// run time methods
/**
* Increment the count of failures for this Server
*
*/
public void addToFailureCount(){
serverFailureCounts.increment();
}
/**
* Returns the count of failures in the current window
*
*/
public long getFailureCount(){
return serverFailureCounts.getCurrentCount();
}
/**
* Call this method to note the response time after every request
* @param msecs
*/
public void noteResponseTime(double msecs){
dataDist.noteValue(msecs);
responseTimeDist.noteValue(msecs);
}
public void incrementNumRequests(){
totalRequests.incrementAndGet();
}
public void incrementActiveRequestsCount() {
activeRequestsCount.incrementAndGet();
requestCountInWindow.increment();
long currentTime = System.currentTimeMillis();
lastActiveRequestsCountChangeTimestamp = currentTime;
lastAccessedTimestamp = currentTime;
if (firstConnectionTimestamp == 0) {
firstConnectionTimestamp = currentTime;
}
}
public void incrementOpenConnectionsCount() {
openConnectionsCount.incrementAndGet();
}
public void decrementActiveRequestsCount() {
activeRequestsCount.getAndUpdate(current -> Math.max(0, current - 1));
lastActiveRequestsCountChangeTimestamp = System.currentTimeMillis();
}
public void decrementOpenConnectionsCount() {
openConnectionsCount.getAndUpdate(current -> Math.max(0, current - 1));
}
public int getActiveRequestsCount() {
return getActiveRequestsCount(System.currentTimeMillis());
}
public int getActiveRequestsCount(long currentTime) {
int count = activeRequestsCount.get();
if (count == 0) {
return 0;
} else if (currentTime - lastActiveRequestsCountChangeTimestamp > activeRequestsCountTimeout.get() * 1000 || count < 0) {
activeRequestsCount.set(0);
return 0;
} else {
return count;
}
}
public int getOpenConnectionsCount() {
return openConnectionsCount.get();
}
public long getMeasuredRequestsCount() {
return requestCountInWindow.getCount();
}
@Monitor(name="ActiveRequestsCount", type = DataSourceType.GAUGE)
public int getMonitoredActiveRequestsCount() {
return activeRequestsCount.get();
}
@Monitor(name="CircuitBreakerTripped", type = DataSourceType.INFORMATIONAL)
public boolean isCircuitBreakerTripped() {
return isCircuitBreakerTripped(System.currentTimeMillis());
}
public boolean isCircuitBreakerTripped(long currentTime) {
long circuitBreakerTimeout = getCircuitBreakerTimeout();
if (circuitBreakerTimeout <= 0) {
return false;
}
return circuitBreakerTimeout > currentTime;
}
private long getCircuitBreakerTimeout() {
long blackOutPeriod = getCircuitBreakerBlackoutPeriod();
if (blackOutPeriod <= 0) {
return 0;
}
return lastConnectionFailedTimestamp + blackOutPeriod;
}
private long getCircuitBreakerBlackoutPeriod() {
int failureCount = successiveConnectionFailureCount.get();
int threshold = connectionFailureThreshold.get();
if (failureCount < threshold) {
return 0;
}
int diff = Math.min(failureCount - threshold, 16);
int blackOutSeconds = (1 << diff) * circuitTrippedTimeoutFactor.get();
if (blackOutSeconds > maxCircuitTrippedTimeout.get()) {
blackOutSeconds = maxCircuitTrippedTimeout.get();
}
return blackOutSeconds * 1000L;
}
public void incrementSuccessiveConnectionFailureCount() {
lastConnectionFailedTimestamp = System.currentTimeMillis();
successiveConnectionFailureCount.incrementAndGet();
totalCircuitBreakerBlackOutPeriod.addAndGet(getCircuitBreakerBlackoutPeriod());
}
public void clearSuccessiveConnectionFailureCount() {
successiveConnectionFailureCount.set(0);
}
@Monitor(name="SuccessiveConnectionFailureCount", type = DataSourceType.GAUGE)
public int getSuccessiveConnectionFailureCount() {
return successiveConnectionFailureCount.get();
}
/*
* Response total times
*/
/**
* Gets the average total amount of time to handle a request, in milliseconds.
*/
@Monitor(name = "OverallResponseTimeMillisAvg", type = DataSourceType.INFORMATIONAL,
description = "Average total time for a request, in milliseconds")
public double getResponseTimeAvg() {
return responseTimeDist.getMean();
}
/**
* Gets the maximum amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "OverallResponseTimeMillisMax", type = DataSourceType.INFORMATIONAL,
description = "Max total time for a request, in milliseconds")
public double getResponseTimeMax() {
return responseTimeDist.getMaximum();
}
/**
* Gets the minimum amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "OverallResponseTimeMillisMin", type = DataSourceType.INFORMATIONAL,
description = "Min total time for a request, in milliseconds")
public double getResponseTimeMin() {
return responseTimeDist.getMinimum();
}
/**
* Gets the standard deviation in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "OverallResponseTimeMillisStdDev", type = DataSourceType.INFORMATIONAL,
description = "Standard Deviation in total time to handle a request, in milliseconds")
public double getResponseTimeStdDev() {
return responseTimeDist.getStdDev();
}
/*
* QOS percentile performance data for most recent period
*/
/**
* Gets the number of samples used to compute the various response-time percentiles.
*/
@Monitor(name = "ResponseTimePercentileNumValues", type = DataSourceType.GAUGE,
description = "The number of data points used to compute the currently reported percentile values")
public int getResponseTimePercentileNumValues() {
return dataDist.getSampleSize();
}
/**
* Gets the time when the varios percentile data was last updated.
*/
@Monitor(name = "ResponseTimePercentileWhen", type = DataSourceType.INFORMATIONAL,
description = "The time the percentile values were computed")
public String getResponseTimePercentileTime() {
return dataDist.getTimestamp();
}
/**
* Gets the time when the varios percentile data was last updated,
* in milliseconds since the epoch.
*/
@Monitor(name = "ResponseTimePercentileWhenMillis", type = DataSourceType.COUNTER,
description = "The time the percentile values were computed in milliseconds since the epoch")
public long getResponseTimePercentileTimeMillis() {
return dataDist.getTimestampMillis();
}
/**
* Gets the average total amount of time to handle a request
* in the recent time-slice, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillisAvg", type = DataSourceType.GAUGE,
description = "Average total time for a request in the recent time slice, in milliseconds")
public double getResponseTimeAvgRecent() {
return dataDist.getMean();
}
/**
* Gets the 10-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis10Percentile", type = DataSourceType.INFORMATIONAL,
description = "10th percentile in total time to handle a request, in milliseconds")
public double getResponseTime10thPercentile() {
return getResponseTimePercentile(Percent.TEN);
}
/**
* Gets the 25-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis25Percentile", type = DataSourceType.INFORMATIONAL,
description = "25th percentile in total time to handle a request, in milliseconds")
public double getResponseTime25thPercentile() {
return getResponseTimePercentile(Percent.TWENTY_FIVE);
}
/**
* Gets the 50-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis50Percentile", type = DataSourceType.INFORMATIONAL,
description = "50th percentile in total time to handle a request, in milliseconds")
public double getResponseTime50thPercentile() {
return getResponseTimePercentile(Percent.FIFTY);
}
/**
* Gets the 75-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis75Percentile", type = DataSourceType.INFORMATIONAL,
description = "75th percentile in total time to handle a request, in milliseconds")
public double getResponseTime75thPercentile() {
return getResponseTimePercentile(Percent.SEVENTY_FIVE);
}
/**
* Gets the 90-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis90Percentile", type = DataSourceType.INFORMATIONAL,
description = "90th percentile in total time to handle a request, in milliseconds")
public double getResponseTime90thPercentile() {
return getResponseTimePercentile(Percent.NINETY);
}
/**
* Gets the 95-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis95Percentile", type = DataSourceType.GAUGE,
description = "95th percentile in total time to handle a request, in milliseconds")
public double getResponseTime95thPercentile() {
return getResponseTimePercentile(Percent.NINETY_FIVE);
}
/**
* Gets the 98-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis98Percentile", type = DataSourceType.INFORMATIONAL,
description = "98th percentile in total time to handle a request, in milliseconds")
public double getResponseTime98thPercentile() {
return getResponseTimePercentile(Percent.NINETY_EIGHT);
}
/**
* Gets the 99-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis99Percentile", type = DataSourceType.GAUGE,
description = "99th percentile in total time to handle a request, in milliseconds")
public double getResponseTime99thPercentile() {
return getResponseTimePercentile(Percent.NINETY_NINE);
}
/**
* Gets the 99.5-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis99_5Percentile", type = DataSourceType.GAUGE,
description = "99.5th percentile in total time to handle a request, in milliseconds")
public double getResponseTime99point5thPercentile() {
return getResponseTimePercentile(Percent.NINETY_NINE_POINT_FIVE);
}
public long getTotalRequestsCount() {
return totalRequests.get();
}
private double getResponseTimePercentile(Percent p) {
return dataDist.getPercentiles()[p.ordinal()];
}
public String toString(){
StringBuilder sb = new StringBuilder();
sb.append("[Server:" + server + ";");
sb.append("\tZone:" + server.getZone() + ";");
sb.append("\tTotal Requests:" + totalRequests + ";");
sb.append("\tSuccessive connection failure:" + getSuccessiveConnectionFailureCount() + ";");
if (isCircuitBreakerTripped()) {
sb.append("\tBlackout until: " + new Date(getCircuitBreakerTimeout()) + ";");
}
sb.append("\tTotal blackout seconds:" + totalCircuitBreakerBlackOutPeriod.get() / 1000 + ";");
sb.append("\tLast connection made:" + new Date(lastAccessedTimestamp) + ";");
if (lastConnectionFailedTimestamp > 0) {
sb.append("\tLast connection failure: " + new Date(lastConnectionFailedTimestamp) + ";");
}
sb.append("\tFirst connection made: " + new Date(firstConnectionTimestamp) + ";");
sb.append("\tActive Connections:" + getMonitoredActiveRequestsCount() + ";");
sb.append("\ttotal failure count in last (" + failureCountSlidingWindowInterval + ") msecs:" + getFailureCount() + ";");
sb.append("\taverage resp time:" + getResponseTimeAvg() + ";");
sb.append("\t90 percentile resp time:" + getResponseTime90thPercentile() + ";");
sb.append("\t95 percentile resp time:" + getResponseTime95thPercentile() + ";");
sb.append("\tmin resp time:" + getResponseTimeMin() + ";");
sb.append("\tmax resp time:" + getResponseTimeMax() + ";");
sb.append("\tstddev resp time:" + getResponseTimeStdDev());
sb.append("]\n");
return sb.toString();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy