/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kylin.rest.config.initialize;

import static java.util.stream.Collectors.toSet;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.lang.management.MemoryUsage;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.sql.DataSource;
import org.apache.commons.dbcp2.BasicDataSource;
import org.apache.commons.lang3.StringUtils;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.event.ModelAddEvent;
import org.apache.kylin.common.metrics.MetricsCategory;
import org.apache.kylin.common.metrics.MetricsGroup;
import org.apache.kylin.common.metrics.MetricsName;
import org.apache.kylin.common.metrics.MetricsTag;
import org.apache.kylin.common.metrics.prometheus.PrometheusMetrics;
import org.apache.kylin.common.persistence.metadata.JdbcDataSource;
import org.apache.kylin.common.scheduler.EventBusFactory;
import org.apache.kylin.guava30.shaded.common.collect.Lists;
import org.apache.kylin.guava30.shaded.common.collect.Maps;
import org.apache.kylin.job.JobContext;
import org.apache.kylin.job.dao.ExecutablePO;
import org.apache.kylin.job.execution.AbstractExecutable;
import org.apache.kylin.job.execution.ExecutableManager;
import org.apache.kylin.job.execution.ExecutableState;
import org.apache.kylin.job.execution.JobTypeEnum;
import org.apache.kylin.job.util.JobContextUtil;
import org.apache.kylin.metadata.cube.model.NDataflowManager;
import org.apache.kylin.metadata.cube.storage.ProjectStorageInfoCollector;
import org.apache.kylin.metadata.cube.storage.StorageInfoEnum;
import org.apache.kylin.metadata.cube.storage.StorageVolumeInfo;
import org.apache.kylin.metadata.model.NDataModel;
import org.apache.kylin.metadata.model.NDataModelManager;
import org.apache.kylin.metadata.model.NTableMetadataManager;
import org.apache.kylin.metadata.model.TableDesc;
import org.apache.kylin.metadata.project.NProjectManager;
import org.apache.kylin.metadata.project.ProjectInstance;
import org.apache.kylin.metadata.user.ManagedUser;
import org.apache.kylin.metadata.user.NKylinUserManager;
import org.apache.kylin.query.util.LoadCounter;
import org.apache.kylin.query.util.LoadDesc;
import org.apache.kylin.rest.service.ProjectService;
import org.apache.kylin.rest.util.SpringContext;
import org.apache.spark.sql.SparderEnv;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codahale.metrics.RatioGauge;
import com.codahale.metrics.jvm.GarbageCollectorMetricSet;
import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.Meter;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tags;
import lombok.val;
import lombok.extern.slf4j.Slf4j;
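
/**
 * Central registry for Kylin's monitoring metrics: dropwizard-style gauges, counters
 * and histograms via {@link MetricsGroup}, plus Micrometer/Prometheus meters via the
 * Spring-managed {@link MeterRegistry}, at global, host, project and model scope.
 *
 * A minimal usage sketch (illustrative only; the real call order is wired up by the
 * initialization beans, and the host and project names below are made up):
 *
 * <pre>{@code
 * KylinConfig config = KylinConfig.getInstanceFromEnv();
 * MetricsRegistry.registerGlobalMetrics(config, "localhost:7070");
 * MetricsRegistry.registerHostMetrics("localhost:7070");
 * MetricsRegistry.registerProjectMetrics(config, "my_project", "localhost:7070");
 * MetricsRegistry.registerProjectPrometheusMetrics(config, "my_project");
 * }</pre>
 */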
@Slf4j
public class MetricsRegistry {
private MetricsRegistry() {
// static utility class, not meant to be instantiated
}
private static final String GLOBAL = "global";
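// Per-project caches backing the gauges registered below: the total storage size
// (refreshed by refreshTotalStorageSize()) and snapshots of long-pending and
// long-running job counts keyed by timeout threshold (refreshed by
// refreshProjectLongRunningJobs()).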
private static final Map<String, Long> totalStorageSizeMap = Maps.newHashMap();
private static volatile Map<String, Map<Integer, Long>> projectPendingJobMap = Maps.newHashMap();
private static volatile Map<String, Map<Double, Long>> projectRunningJobMap = Maps.newHashMap();
private static final int[] PENDING_JOB_TIMEOUT_MINUTE = new int[] { 5, 10, 15, 30 };
private static final double[] RUNNING_JOB_TIMEOUT_HOUR = new double[] { 0.5d, 1d, 1.5d, 2d, 3d };
private static final Logger logger = LoggerFactory.getLogger(MetricsRegistry.class);
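/**
 * Recomputes the cached total storage size for every project already tracked in
 * totalStorageSizeMap; projects enter the map via registerStorageMetrics().
 */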
public static void refreshTotalStorageSize() {
val projectService = SpringContext.getBean(ProjectService.class);
totalStorageSizeMap.forEach((project, totalStorageSize) -> {
val storageVolumeInfoResponse = projectService.getStorageVolumeInfoResponse(project);
totalStorageSizeMap.put(project, storageVolumeInfoResponse.getTotalStorageSize());
});
}
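/**
 * Rebuilds the long-pending and long-running job snapshots for the given projects.
 * Both maps are assembled in temporaries and only then swapped into the volatile
 * fields, so concurrent gauge reads never observe a half-built snapshot.
 */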
public static void refreshProjectLongRunningJobs(KylinConfig kylinConfig, Set<String> projects) {
Map<String, Map<Integer, Long>> tempProjectPendingJobMap = Maps.newHashMap();
Map<String, Map<Double, Long>> tempProjectRunningJobMap = Maps.newHashMap();
for (String project : projects) {
final ExecutableManager executableManager = ExecutableManager.getInstance(kylinConfig, project);
tempProjectPendingJobMap.put(project, collectTimeoutToPendingJobsMap(executableManager));
tempProjectRunningJobMap.put(project, collectTimeoutToRunningJobsMap(executableManager));
}
projectPendingJobMap = tempProjectPendingJobMap;
projectRunningJobMap = tempProjectRunningJobMap;
}
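/**
 * For each pending-time threshold (in minutes), counts the READY jobs that have
 * been waiting longer than the threshold.
 */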
private static Map<Integer, Long> collectTimeoutToPendingJobsMap(ExecutableManager executableManager) {
Map<Integer, Long> timeoutToPendingJobsMap = Maps.newHashMap();
List<AbstractExecutable> pendingJobs = executableManager.getAllJobs().stream()
.filter(e -> ExecutableState.READY.name().equals(e.getOutput().getStatus()))
.map(executableManager::fromPO).collect(Collectors.toList());
for (int pendingJobMin : PENDING_JOB_TIMEOUT_MINUTE) {
timeoutToPendingJobsMap.put(pendingJobMin,
pendingJobs.stream().filter(e -> e.getWaitTime() / 1000.0 > pendingJobMin * 60).count());
}
return timeoutToPendingJobsMap;
}
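/**
 * For each running-time threshold (in hours), counts the RUNNING jobs whose
 * duration already exceeds the threshold.
 */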
private static Map<Double, Long> collectTimeoutToRunningJobsMap(ExecutableManager executableManager) {
Map<Double, Long> timeoutToRunningJobsMap = Maps.newHashMap();
List<AbstractExecutable> runningJobs = executableManager.getAllJobs().stream()
.filter(e -> ExecutableState.RUNNING.name().equals(e.getOutput().getStatus()))
.map(executableManager::fromPO).collect(Collectors.toList());
for (double runningJobHour : RUNNING_JOB_TIMEOUT_HOUR) {
timeoutToRunningJobsMap.put(runningJobHour,
runningJobs.stream().filter(e -> e.getDuration() / 1000.0 > runningJobHour * 3600).count());
}
return timeoutToRunningJobsMap;
}
public static void removeProjectFromStorageSizeMap(String project) {
totalStorageSizeMap.remove(project);
}
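/**
 * Registers instance-wide gauges (project and user counts) plus the global
 * counters and histogram for storage cleaning, metadata backup/ops and
 * transaction retries/latency.
 */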
public static void registerGlobalMetrics(KylinConfig config, String host) {
final NProjectManager projectManager = NProjectManager.getInstance(config);
MetricsGroup.newGauge(MetricsName.PROJECT_GAUGE, MetricsCategory.GLOBAL, GLOBAL, () -> {
List<ProjectInstance> list = projectManager.listAllProjects();
if (list == null) {
return 0;
}
return list.size();
});
final NKylinUserManager userManager = NKylinUserManager.getInstance(config);
MetricsGroup.newGauge(MetricsName.USER_GAUGE, MetricsCategory.GLOBAL, GLOBAL, () -> {
List<ManagedUser> list = userManager.list();
if (list == null) {
return 0;
}
return list.size();
});
Map<String, String> tags = MetricsGroup.getHostTagMap(host, GLOBAL);
MetricsGroup.newCounter(MetricsName.STORAGE_CLEAN, MetricsCategory.GLOBAL, GLOBAL, tags);
MetricsGroup.newCounter(MetricsName.STORAGE_CLEAN_DURATION, MetricsCategory.GLOBAL, GLOBAL, tags);
MetricsGroup.newCounter(MetricsName.STORAGE_CLEAN_FAILED, MetricsCategory.GLOBAL, GLOBAL, tags);
MetricsGroup.newCounter(MetricsName.METADATA_BACKUP, MetricsCategory.GLOBAL, GLOBAL, tags);
MetricsGroup.newCounter(MetricsName.METADATA_BACKUP_DURATION, MetricsCategory.GLOBAL, GLOBAL, tags);
MetricsGroup.newCounter(MetricsName.METADATA_BACKUP_FAILED, MetricsCategory.GLOBAL, GLOBAL, tags);
MetricsGroup.newCounter(MetricsName.METADATA_OPS_CRON, MetricsCategory.GLOBAL, GLOBAL, tags);
MetricsGroup.newCounter(MetricsName.METADATA_OPS_CRON_SUCCESS, MetricsCategory.GLOBAL, GLOBAL, tags);
MetricsGroup.newCounter(MetricsName.TRANSACTION_RETRY_COUNTER, MetricsCategory.GLOBAL, GLOBAL, tags);
MetricsGroup.newHistogram(MetricsName.TRANSACTION_LATENCY, MetricsCategory.GLOBAL, GLOBAL, tags);
}
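/**
 * Registers global Micrometer gauges: metastore connection pools (grouped by JDBC
 * driver class, split into idle/active), Sparder availability and Spark task load.
 * All gauges use strongReference(true) so the registry keeps the measured state
 * objects alive.
 */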
public static void registerGlobalPrometheusMetrics() {
MeterRegistry meterRegistry = SpringContext.getBean(MeterRegistry.class);
for (String state : Lists.newArrayList("idle", "active")) {
JdbcDataSource.getDataSources().stream()
.collect(Collectors.groupingBy(ds -> ((BasicDataSource) ds).getDriverClassName()))
.forEach((driver, sources) -> Gauge
.builder(PrometheusMetrics.JVM_DB_CONNECTIONS.getValue(), sources, dataSources -> {
int count = 0;
for (DataSource dataSource : dataSources) {
BasicDataSource basicDataSource = (BasicDataSource) dataSource;
if (state.equals("idle")) {
count += basicDataSource.getNumIdle();
} else {
count += basicDataSource.getNumActive();
}
}
return count;
})
.tags(MetricsTag.STATE.getVal(), state, MetricsTag.POOL.getVal(), "dbcp2",
MetricsTag.TYPE.getVal(), driver)
.description("Number of Metastore(RDBMS) connections").strongReference(true)
.register(meterRegistry));
}
Gauge.builder(PrometheusMetrics.SPARDER_UP.getValue(), () -> SparderEnv.isSparkAvailable() ? 1 : 0)
.description("Health status of spark context(query engine)").strongReference(true)
.register(meterRegistry);
Gauge.builder(PrometheusMetrics.SPARK_TASKS.getValue(), LoadCounter.getInstance(),
e -> SparderEnv.isSparkAvailable() ? e.getPendingTaskCount() : 0)
.tags(MetricsTag.STATE.getVal(), MetricsTag.PENDING.getVal()).strongReference(true)
.description("Number of pending spark tasks of query engine").register(meterRegistry);
Gauge.builder(PrometheusMetrics.SPARK_TASKS.getValue(), LoadCounter.getInstance(),
e -> SparderEnv.isSparkAvailable() ? e.getRunningTaskCount() : 0)
.tags(MetricsTag.STATE.getVal(), MetricsTag.RUNNING.getVal()).strongReference(true)
.description("Number of running spark tasks of query engine").register(meterRegistry);
Gauge.builder(PrometheusMetrics.SPARK_TASK_UTILIZATION.getValue(), LoadCounter.getInstance(),
e -> SparderEnv.isSparkAvailable() ? e.getRunningTaskCount() * 1.0 / e.getSlotCount() : 0)
.description("Ratio of spark cores utilization.").strongReference(true).register(meterRegistry);
}
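/**
 * Registers per-project Prometheus gauges; a no-op unless prometheus metrics are
 * enabled. Covers the running job count (on job/data-loading nodes only) and the
 * long-running and long-pending job counts per timeout threshold, served from the
 * snapshots maintained by refreshProjectLongRunningJobs().
 */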
public static void registerProjectPrometheusMetrics(KylinConfig kylinConfig, String project) {
if (!kylinConfig.isPrometheusMetricsEnabled()) {
return;
}
MeterRegistry meterRegistry = SpringContext.getBean(MeterRegistry.class);
Tags projectTag = Tags.of(MetricsTag.PROJECT.getVal(), project);
if (kylinConfig.isJobNode() || kylinConfig.isDataLoadingNode()) {
Gauge.builder(PrometheusMetrics.JOB_COUNTS.getValue(), () -> {
JobContext jobContext = JobContextUtil.getJobContext(kylinConfig);
return Objects.isNull(jobContext) ? 0
: jobContext.getJobScheduler().getRunningJob().values().stream().map(pair -> pair.getFirst())
.filter(jobExecutable -> project.equals(jobExecutable.getProject())).count();
}).tags(projectTag).tags(MetricsTag.STATE.getVal(), MetricsTag.RUNNING.getVal())
.description("Number of spark job by build engine").register(meterRegistry);
}
for (double runningTimeoutHour : RUNNING_JOB_TIMEOUT_HOUR) {
Gauge.builder(PrometheusMetrics.JOB_LONG_RUNNING.getValue(),
() -> MetricsRegistry.projectRunningJobMap
.getOrDefault(project, Maps.newHashMap()).getOrDefault(runningTimeoutHour, 0L))
.tags(projectTag)
.tags(MetricsTag.STATE.getVal(), MetricsTag.RUNNING.getVal(), MetricsTag.TIMEOUT.getVal(),
runningTimeoutHour + "h")
.description("Number of spark job by query engine").register(meterRegistry);
}
for (int waitTimeoutMin : PENDING_JOB_TIMEOUT_MINUTE) {
Gauge.builder(PrometheusMetrics.JOB_LONG_RUNNING.getValue(),
() -> MetricsRegistry.projectPendingJobMap.getOrDefault(project, Maps.newHashMap())
.getOrDefault(waitTimeoutMin, 0L))
.tags(projectTag)
.tags(MetricsTag.STATE.getVal(), MetricsTag.WAITING.getVal(), MetricsTag.TIMEOUT.getVal(),
waitTimeoutMin + "m")
.description("Number of spark job by build engine which ").register(meterRegistry);
}
}
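/**
 * Registers per-host counters (sparder restarts, query counts/bytes), a query-time
 * histogram and JVM gauges: heap max/used/usage ratio, GC metrics, available CPUs
 * and the query-engine load reported by LoadCounter.
 */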
public static void registerHostMetrics(String host) {
MetricsGroup.newCounter(MetricsName.SPARDER_RESTART, MetricsCategory.HOST, host);
MetricsGroup.newCounter(MetricsName.QUERY_HOST, MetricsCategory.HOST, host);
MetricsGroup.newCounter(MetricsName.QUERY_SCAN_BYTES_HOST, MetricsCategory.HOST, host);
MetricsGroup.newHistogram(MetricsName.QUERY_TIME_HOST, MetricsCategory.HOST, host);
MemoryMXBean mxBean = ManagementFactory.getMemoryMXBean();
MetricsGroup.newGauge(MetricsName.HEAP_MAX, MetricsCategory.HOST, host,
() -> mxBean.getHeapMemoryUsage().getMax());
MetricsGroup.newGauge(MetricsName.HEAP_USED, MetricsCategory.HOST, host,
() -> mxBean.getHeapMemoryUsage().getUsed());
MetricsGroup.newGauge(MetricsName.HEAP_USAGE, MetricsCategory.HOST, host, () -> {
final MemoryUsage usage = mxBean.getHeapMemoryUsage();
return RatioGauge.Ratio.of(usage.getUsed(), usage.getMax()).getValue();
});
MetricsGroup.newMetricSet(MetricsName.JVM_GC, MetricsCategory.HOST, host, new GarbageCollectorMetricSet());
MetricsGroup.newGauge(MetricsName.JVM_AVAILABLE_CPU, MetricsCategory.HOST, host,
() -> Runtime.getRuntime().availableProcessors());
MetricsGroup.newGauge(MetricsName.QUERY_LOAD, MetricsCategory.HOST, host, () -> {
LoadDesc loadDesc = LoadCounter.getInstance().getLoadDesc();
return loadDesc.getLoad();
});
MetricsGroup.newGauge(MetricsName.CPU_CORES, MetricsCategory.HOST, host, () -> {
LoadDesc loadDesc = LoadCounter.getInstance().getLoadDesc();
return loadDesc.getCoreNum();
});
}
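/**
 * Registers per-project job status gauges, computed from the job metadata store on
 * every read: ERROR jobs, in-flight jobs (RUNNING/READY/PENDING) and pending jobs
 * (READY/PENDING).
 */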
static void registerJobMetrics(KylinConfig config, String project) {
final ExecutableManager executableManager = ExecutableManager.getInstance(config, project);
MetricsGroup.newGauge(MetricsName.JOB_ERROR_GAUGE, MetricsCategory.PROJECT, project, () -> {
List<ExecutablePO> list = executableManager.getAllJobs();
return list == null ? 0
: list.stream().filter(e -> ExecutableState.ERROR.name().equals(e.getOutput().getStatus())).count();
});
MetricsGroup.newGauge(MetricsName.JOB_RUNNING_GAUGE, MetricsCategory.PROJECT, project, () -> {
List<ExecutablePO> list = executableManager.getAllJobs();
return list == null ? 0 : list.stream().filter(e -> {
String status = e.getOutput().getStatus();
return ExecutableState.RUNNING.name().equals(status) || ExecutableState.READY.name().equals(status)
|| ExecutableState.PENDING.name().equals(status);
}).count();
});
MetricsGroup.newGauge(MetricsName.JOB_PENDING_GAUGE, MetricsCategory.PROJECT, project, () -> {
List<ExecutablePO> list = executableManager.getAllJobs();
return list == null ? 0
: list.stream().filter(e -> ExecutableState.READY.name().equals(e.getOutput().getStatus())
|| ExecutableState.PENDING.name().equals(e.getOutput().getStatus())).count();
});
}
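/**
 * Registers per-project storage gauges. The total size is served from the
 * totalStorageSizeMap cache, while the garbage size is collected on demand.
 */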
static void registerStorageMetrics(String project) {
val projectService = SpringContext.getBean(ProjectService.class);
totalStorageSizeMap.put(project, projectService.getStorageVolumeInfoResponse(project).getTotalStorageSize());
MetricsGroup.newGauge(MetricsName.PROJECT_STORAGE_SIZE, MetricsCategory.PROJECT, project,
() -> totalStorageSizeMap.getOrDefault(project, 0L));
MetricsGroup.newGauge(MetricsName.PROJECT_GARBAGE_SIZE, MetricsCategory.PROJECT, project, () -> {
val collector = new ProjectStorageInfoCollector(Lists.newArrayList(StorageInfoEnum.GARBAGE_STORAGE));
StorageVolumeInfo storageVolumeInfo = collector.getStorageVolumeInfo(KylinConfig.getInstanceFromEnv(),
project);
return storageVolumeInfo.getGarbageStorageSize();
});
}
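/**
 * Entry point for project-scoped metrics: non-gauge metrics are delegated to
 * MetricsGroup, then gauges for model, table and database counts are registered,
 * followed by the job, storage, model and job-statistics helpers.
 */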
public static void registerProjectMetrics(KylinConfig config, String project, String host) {
// for non-gauges
MetricsGroup.registerProjectMetrics(project, host);
//for gauges
final NDataModelManager dataModelManager = NDataModelManager.getInstance(config, project);
MetricsGroup.newGauge(MetricsName.MODEL_GAUGE, MetricsCategory.PROJECT, project, () -> {
List<NDataModel> list = dataModelManager.listAllModels();
return list == null ? 0 : list.size();
});
boolean streamingEnabled = config.isStreamingEnabled();
final NDataflowManager dataflowManager = NDataflowManager.getInstance(config, project);
MetricsGroup.newGauge(MetricsName.HEALTHY_MODEL_GAUGE, MetricsCategory.PROJECT, project, () -> {
List<NDataModel> list = dataflowManager.listUnderliningDataModels().stream()
.filter(model -> model.isAccessible(streamingEnabled)).collect(Collectors.toList());
return list.size();
});
registerStorageMetrics(project);
registerJobMetrics(config, project);
final NTableMetadataManager tableMetadataManager = NTableMetadataManager.getInstance(config, project);
MetricsGroup.newGauge(MetricsName.TABLE_GAUGE, MetricsCategory.PROJECT, project, () -> {
final List<TableDesc> list = tableMetadataManager.listAllTables().stream()
.filter(table -> table.isAccessible(streamingEnabled)).collect(Collectors.toList());
return list.size();
});
MetricsGroup.newGauge(MetricsName.DB_GAUGE, MetricsCategory.PROJECT, project, () -> {
final List<TableDesc> list = tableMetadataManager.listAllTables();
return list == null ? 0
: list.stream().filter(table -> table.isAccessible(streamingEnabled))
.map(TableDesc::getCaseSensitiveDatabase).collect(toSet()).size();
});
registerModelMetrics(config, project);
registerJobStatisticsMetrics(project, host);
}
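/**
 * Posts a ModelAddEvent for every model accessible in the project; the event
 * handler is expected to register the model-level metrics.
 */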
static void registerModelMetrics(KylinConfig config, String project) {
NDataModelManager modelManager = NDataModelManager.getInstance(config, project);
boolean streamingEnabled = config.isStreamingEnabled();
modelManager.listAllModels().stream().filter(model -> model.isAccessible(streamingEnabled))
.forEach(model -> registerModelMetrics(project, model.getId(), model.getAlias()));
}
static void registerModelMetrics(String project, String modelId, String modelAlias) {
EventBusFactory.getInstance().postSync(new ModelAddEvent(project, modelId, modelAlias));
}
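/**
 * Removes every Micrometer meter tagged with the given project, e.g. after the
 * project has been dropped.
 */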
public static void deletePrometheusProjectMetrics(String project) {
if (StringUtils.isEmpty(project)) {
throw new IllegalArgumentException("Remove prometheus project metrics, project shouldn't be empty.");
}
MeterRegistry meterRegistry = SpringContext.getBean(MeterRegistry.class);
meterRegistry.getMeters().stream().map(Meter::getId)
.filter(id -> project.equals(id.getTag(MetricsTag.PROJECT.getVal()))).forEach(meterRegistry::remove);
logger.info("Remove project prometheus metrics for {} success.", project);
}
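/**
 * Removes the model-level meters for the given model, e.g. after the model has
 * been dropped or renamed.
 */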
public static void removePrometheusModelMetrics(String project, String modelName) {
if (StringUtils.isBlank(project) || StringUtils.isBlank(modelName)) {
throw new IllegalArgumentException(
"Project and modelName shouldn't be blank when removing prometheus model metrics.");
}
Set<PrometheusMetrics> modelMetrics = PrometheusMetrics.listModelMetrics();
modelMetrics.forEach(metricName -> doRemoveMetric(metricName,
Tags.of(MetricsTag.PROJECT.getVal(), project, MetricsTag.MODEL.getVal(), modelName)));
}
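/**
 * Removes a single meter by exact id (model metrics are registered as distribution
 * summaries) and logs a warning if no matching meter was found.
 */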
private static void doRemoveMetric(PrometheusMetrics metricName, Tags tags) {
MeterRegistry meterRegistry = SpringContext.getBean(MeterRegistry.class);
Meter result = meterRegistry
.remove(new Meter.Id(metricName.getValue(), tags, null, null, Meter.Type.DISTRIBUTION_SUMMARY));
if (Objects.isNull(result)) {
logger.warn("Remove prometheus metric failed, metric name: {}, tags: {}", metricName.getValue(), tags);
}
}
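/**
 * Pre-registers, for every job type, the per-project job statistics counters:
 * totals by outcome, duration-bucketed counts (LT_5 ... GT_60) and the accumulated
 * job duration. They are presumably incremented elsewhere as jobs finish.
 */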
private static void registerJobStatisticsMetrics(String project, String host) {
List<JobTypeEnum> types = Stream.of(JobTypeEnum.values()).collect(Collectors.toList());
Map<String, String> tags;
for (JobTypeEnum type : types) {
tags = Maps.newHashMap();
tags.put(MetricsTag.HOST.getVal(), host);
tags.put(MetricsTag.JOB_TYPE.getVal(), type.name());
MetricsGroup.newCounter(MetricsName.JOB_COUNT, MetricsCategory.PROJECT, project, tags);
MetricsGroup.newCounter(MetricsName.SUCCESSFUL_JOB_COUNT, MetricsCategory.PROJECT, project, tags);
MetricsGroup.newCounter(MetricsName.ERROR_JOB_COUNT, MetricsCategory.PROJECT, project, tags);
MetricsGroup.newCounter(MetricsName.TERMINATED_JOB_COUNT, MetricsCategory.PROJECT, project, tags);
MetricsGroup.newCounter(MetricsName.JOB_COUNT_LT_5, MetricsCategory.PROJECT, project, tags);
MetricsGroup.newCounter(MetricsName.JOB_COUNT_5_10, MetricsCategory.PROJECT, project, tags);
MetricsGroup.newCounter(MetricsName.JOB_COUNT_10_30, MetricsCategory.PROJECT, project, tags);
MetricsGroup.newCounter(MetricsName.JOB_COUNT_30_60, MetricsCategory.PROJECT, project, tags);
MetricsGroup.newCounter(MetricsName.JOB_COUNT_GT_60, MetricsCategory.PROJECT, project, tags);
MetricsGroup.newCounter(MetricsName.JOB_TOTAL_DURATION, MetricsCategory.PROJECT, project, tags);
}
}
}