/*-
* -\-\-
* bigtable-autoscaler
* --
* Copyright (C) 2018 Spotify AB
* --
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* -/-/-
*/
package com.spotify.autoscaler.metric;

import com.codahale.metrics.Gauge;
import com.spotify.autoscaler.Application;
import com.spotify.autoscaler.LoggerContext;
import com.spotify.autoscaler.ScalingEvent;
import com.spotify.autoscaler.db.BigtableCluster;
import com.spotify.autoscaler.db.Database;
import com.spotify.autoscaler.db.ErrorCode;
import com.spotify.metrics.core.MetricId;
import com.spotify.metrics.core.SemanticMetricRegistry;
import com.sun.management.UnixOperatingSystemMXBean;
import java.lang.management.ManagementFactory;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Helper class containing methods to register and measure autoscaler metrics. */
public class AutoscalerMetrics {

  public static final MetricId APP_PREFIX = MetricId.build("key", Application.SERVICE_NAME);
  private static final Logger LOG = LoggerFactory.getLogger(AutoscalerMetrics.class);
  private static final Duration CLEANUP_INTERVAL = Duration.ofMinutes(1);

  private final SemanticMetricRegistry registry;
  private final Map<String, ClusterData> registeredClusters = new ConcurrentHashMap<>();

  public AutoscalerMetrics(final SemanticMetricRegistry registry) {
    this.registry = registry;
  }
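
  /**
   * Registers per-cluster gauges the first time a cluster is seen; on later calls only refreshes
   * the cached {@link ClusterData} snapshot that those gauges read from.
   */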
  public void registerClusterDataMetrics(
      final BigtableCluster cluster, final int currentNodes, final Database db) {
    final ClusterData clusterData =
        new ClusterDataBuilder()
            .cluster(cluster)
            .currentNodeCount(currentNodes)
            .minNodeCount(cluster.minNodes())
            .maxNodeCount(cluster.maxNodes())
            .effectiveMinNodeCount(cluster.effectiveMinNodes())
            .consecutiveFailureCount(cluster.consecutiveFailureCount())
            .lastErrorCode(cluster.errorCode())
            .build();
    if (registeredClusters.putIfAbsent(cluster.clusterName(), clusterData) == null) {
      // First time we saw this cluster; register its gauges
      for (final ClusterDataGauges metric : ClusterDataGauges.values()) {
        registry.register(
            baseMetric(cluster).tagged("what", metric.getTag()),
            metric.getMetricValue(registeredClusters, cluster.clusterName(), db));
      }
      for (final ErrorCode code : ErrorCode.values()) {
        registry.register(
            baseMetric(cluster)
                .tagged("what", ErrorGauges.CONSECUTIVE_FAILURE_COUNT.getTag())
                .tagged("latest-error-code", code.name()),
            ErrorGauges.CONSECUTIVE_FAILURE_COUNT.getMetricValue(
                registeredClusters, cluster.clusterName(), code));
      }
    } else {
      // Already registered; just refresh the cached snapshot the gauges read from
      registeredClusters.put(cluster.clusterName(), clusterData);
    }
  }
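
  /**
   * Starts a single-threaded scheduler that periodically removes metrics for clusters that are no
   * longer active in the database.
   */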
  public void scheduleCleanup(final Database database) {
    final ScheduledExecutorService cleanupExecutor =
        new ScheduledThreadPoolExecutor(1, r -> new Thread(r, "Cluster-Metrics-Cleaner"));
    cleanupExecutor.scheduleAtFixedRate(
        () -> {
          try {
            LOG.info("Cleanup running");
            unregisterInactiveClustersMetrics(registry, database);
          } catch (final Throwable t) {
            LOG.error("Cleanup task failed", t);
          }
        },
        CLEANUP_INTERVAL.toMillis(),
        CLEANUP_INTERVAL.toMillis(),
        TimeUnit.MILLISECONDS);
  }
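
  /** Drops cached data and unregisters all gauges for clusters no longer in the database. */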
  private void unregisterInactiveClustersMetrics(
      final SemanticMetricRegistry registry, final Database database) {
    final Set<String> bigtableClusters = database.getActiveClusterKeys();
    for (final Map.Entry<String, ClusterData> entry : registeredClusters.entrySet()) {
      if (!bigtableClusters.contains(entry.getKey())) {
        registeredClusters.remove(entry.getKey());
        final BigtableCluster cluster = entry.getValue().cluster();
        LoggerContext.pushContext(cluster);
        registry.removeMatching(
            (name, m) -> {
              final Map<String, String> tags = name.getTags();
              return tags.getOrDefault("project-id", "").equals(cluster.projectId())
                  && tags.getOrDefault("instance-id", "").equals(cluster.instanceId())
                  && tags.getOrDefault("cluster-id", "").equals(cluster.clusterId())
                  && getAllMetrics().contains(tags.getOrDefault("what", ""));
            });
        LOG.info("Metrics unregistered");
        LoggerContext.clearContext();
      }
    }
  }
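
  /**
   * Caches the latest CPU or storage utilization for an already-registered cluster and registers
   * the matching load gauge on first use.
   */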
  public void registerClusterLoadMetrics(
      final BigtableCluster cluster, final double load, final ClusterLoadGauges type) {
    if (registeredClusters.get(cluster.clusterName()) == null) {
      return;
    }
    final ClusterDataBuilder clusterDataBuilder =
        ClusterDataBuilder.from(registeredClusters.get(cluster.clusterName()));
    switch (type) {
      case CPU:
        clusterDataBuilder.cpuUtil(load);
        break;
      case STORAGE:
        clusterDataBuilder.storageUtil(load);
        break;
      default:
        throw new IllegalArgumentException(String.format("Undefined ClusterLoadGauges %s", type));
    }
    registeredClusters.put(cluster.clusterName(), clusterDataBuilder.build());
    final MetricId metricId = baseMetric(cluster).tagged("what", type.getTag());
    if (!registry.getGauges().containsKey(metricId)) {
      registry.register(metricId, type.getMetricValue(registeredClusters, cluster.clusterName()));
    }
  }
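
  /** Builds the base MetricId tagged with the cluster's project, cluster, and instance ids. */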
  private MetricId baseMetric(final BigtableCluster cluster) {
    return APP_PREFIX
        .tagged("project-id", cluster.projectId())
        .tagged("cluster-id", cluster.clusterId())
        .tagged("instance-id", cluster.instanceId());
  }
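
  /** Lists the "what" tags of every gauge this class registers; used to match metrics on cleanup. */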
  private static List<String> getAllMetrics() {
    final List<String> metrics = new ArrayList<>();
    Arrays.stream(ClusterDataGauges.values()).map(ClusterDataGauges::getTag).forEach(metrics::add);
    Arrays.stream(ClusterLoadGauges.values()).map(ClusterLoadGauges::getTag).forEach(metrics::add);
    Arrays.stream(ErrorGauges.values()).map(ErrorGauges::getTag).forEach(metrics::add);
    return metrics;
  }
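
  /** Marks that the desired node count was overridden by the storage constraint. */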
  public void markStorageConstraint(
      final BigtableCluster cluster, final int desiredNodes, final int targetNodes) {
    registry
        .meter(
            constraintMetric(cluster, desiredNodes, targetNodes)
                .tagged("reason", "storage-constraint"))
        .mark();
  }
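
  /** Marks that the desired node count was overridden by a scaling event, tagged with its reason. */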
  public void markScalingEventConstraint(
      final BigtableCluster cluster, final int desiredNodes, final ScalingEvent event) {
    registry
        .meter(
            constraintMetric(cluster, desiredNodes, event.getDesiredNodeCount())
                .tagged("reason", event.getReason()))
        .mark();
  }
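
  /** Marks whichever min/effective-min/max node-count constraint clamped the desired size. */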
  public void markSizeConstraint(
      final int desiredNodes, final int finalNodes, final BigtableCluster cluster) {
    final MetricId metric = constraintMetric(cluster, desiredNodes, finalNodes);
    if (cluster.minNodes() > desiredNodes) {
      registry.meter(metric.tagged("reason", "min-nodes-constraint")).mark();
    }
    if (cluster.effectiveMinNodes() > desiredNodes) {
      registry.meter(metric.tagged("reason", "effective-min-nodes-constraint")).mark();
    }
    if (cluster.maxNodes() < desiredNodes) {
      registry.meter(metric.tagged("reason", "max-nodes-constraint")).mark();
    }
  }
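
  /** Builds the meter id shared by all constraint markers, carrying the node counts as tags. */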
  private MetricId constraintMetric(
      final BigtableCluster cluster, final int desiredNodes, final int targetNodes) {
    return baseMetric(cluster)
        .tagged("what", "overridden-desired-node-count")
        .tagged("desired-nodes", String.valueOf(desiredNodes))
        .tagged("min-nodes", String.valueOf(cluster.effectiveMinNodes()))
        .tagged("target-nodes", String.valueOf(targetNodes))
        .tagged("max-nodes", String.valueOf(cluster.maxNodes()));
  }
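
  // Service-level meters: each call marks one occurrence of the named event.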
  public void markClusterCheck() {
    registry.meter(APP_PREFIX.tagged("what", "clusters-checked")).mark();
  }

  public void markCallToGetSize() {
    registry.meter(APP_PREFIX.tagged("what", "call-to-get-size")).mark();
  }

  public void markCallToSetSize() {
    registry.meter(APP_PREFIX.tagged("what", "call-to-set-size")).mark();
  }

  public void markClusterChanged() {
    registry.meter(APP_PREFIX.tagged("what", "clusters-changed")).mark();
  }

  public void markSetSizeError() {
    registry.meter(APP_PREFIX.tagged("what", "set-size-transport-error")).mark();
  }

  public void markHeartBeat() {
    registry.meter(APP_PREFIX.tagged("what", "autoscale-heartbeat")).mark();
  }
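
  /** Registers a gauge exposing the number of open database connections. */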
  public void registerOpenDatabaseConnections(final Database database) {
    registry.register(
        APP_PREFIX.tagged("what", "open-db-connections"),
        (Gauge<Integer>) database::getTotalConnections);
  }
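
  /** Registers gauges counting enabled and disabled clusters stored in the database. */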
  public void registerActiveClusters(final Database database) {
    registry.register(
        APP_PREFIX.tagged("what", "enabled-clusters"),
        (Gauge<Long>)
            () -> database.getBigtableClusters().stream().filter(BigtableCluster::enabled).count());
    registry.register(
        APP_PREFIX.tagged("what", "disabled-clusters"),
        (Gauge<Long>)
            () -> database.getBigtableClusters().stream().filter(p -> !p.enabled()).count());
  }
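
  /** Registers a gauge exposing this process's open file descriptor count (Unix JVMs only). */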
  public void registerOpenFileDescriptors() {
    registry.register(
        APP_PREFIX.tagged("what", "open-file-descriptors"),
        (Gauge<Long>)
            () ->
                ((UnixOperatingSystemMXBean) ManagementFactory.getOperatingSystemMXBean())
                    .getOpenFileDescriptorCount());
  }
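
  /** Registers a gauge exposing the number of resizes performed during the current day. */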
  public void registerDailyResizeCount(final Database database) {
    registry.register(
        APP_PREFIX.tagged("what", "daily-resize-count"),
        (Gauge<Long>) database::getDailyResizeCount);
  }
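
  /**
   * Registers, for each error code, a gauge counting enabled clusters whose latest failure has
   * that code and at least one consecutive failure.
   */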
  public void registerFailureCount(final Database database) {
    for (final ErrorCode code : ErrorCode.values()) {
      registry.register(
          APP_PREFIX.tagged("what", "failing-cluster-count").tagged("error-code", code.name()),
          (Gauge<Long>)
              () ->
                  database
                      .getBigtableClusters()
                      .stream()
                      .filter(BigtableCluster::enabled)
                      .filter(p -> p.errorCode().orElse(ErrorCode.OK) == code)
                      .filter(p -> p.consecutiveFailureCount() > 0)
                      .count());
    }
  }
}
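
// Minimal wiring sketch (illustrative only, not part of the original source; the Database
// instance and the particular registrations chosen below are assumptions):
//
//   final SemanticMetricRegistry registry = new SemanticMetricRegistry();
//   final AutoscalerMetrics metrics = new AutoscalerMetrics(registry);
//   metrics.registerOpenFileDescriptors();
//   metrics.registerOpenDatabaseConnections(database);
//   metrics.registerActiveClusters(database);
//   metrics.scheduleCleanup(database);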