org.apache.hadoop.yarn.util.Log4jWarningErrorMetricsAppender Maven / Gradle / Ivy
The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.util;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.log4j.AppenderSkeleton;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.spi.LoggingEvent;
import java.util.*;
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class Log4jWarningErrorMetricsAppender extends AppenderSkeleton {
public static final String LOG_METRICS_APPENDER = "RM_LOG_METRICS_APPENDER";
static final int MAX_MESSAGE_SIZE = 2048;
static public class Element {
public Long count;
public Long timestampSeconds;
Element(Long count, Long timestampSeconds) {
this.count = count;
this.timestampSeconds = timestampSeconds;
}
}
static class PurgeElement implements Comparable {
String message;
Long timestamp;
PurgeElement(String message, Long timestamp) {
this.message = message;
this.timestamp = timestamp;
}
public int compareTo(PurgeElement e) {
if (e == null) {
throw new NullPointerException("Null element passed to compareTo");
}
int ret = this.timestamp.compareTo(e.timestamp);
if (ret != 0) {
return ret;
}
return this.message.compareTo(e.message);
}
@Override
public boolean equals(Object e) {
if (e == null || !(e instanceof PurgeElement)) {
return false;
}
if (e == this) {
return true;
}
PurgeElement el = (PurgeElement) e;
return (this.message.equals(el.message))
&& (this.timestamp.equals(el.timestamp));
}
@Override
public int hashCode() {
return this.timestamp.hashCode();
}
}
Map> errors;
Map> warnings;
SortedMap errorsTimestampCount;
SortedMap warningsTimestampCount;
SortedSet errorsPurgeInformation;
SortedSet warningsPurgeInformation;
Timer cleanupTimer;
long cleanupInterval;
long messageAgeLimitSeconds;
int maxUniqueMessages;
final Object lock = new Object();
/**
* Create an appender to keep track of the errors and warnings logged by the
* system. It will keep purge messages older than 2 days. It will store upto
* the last 500 unique errors and the last 500 unique warnings. The thread to
* purge message will run every 5 minutes, unless the 500 message limit is hit
* earlier.
*/
public Log4jWarningErrorMetricsAppender() {
this(5 * 60, 24 * 60 * 60, 250);
}
/**
* Create an appender to keep track of the errors and warnings logged by the
* system.
*
* @param cleanupIntervalSeconds
* the interval at which old messages are purged to prevent the
* message stores from growing unbounded
* @param messageAgeLimitSeconds
* the maximum age of a message in seconds before it is purged from
* the store
* @param maxUniqueMessages
* the maximum number of unique messages of each type we keep before
* we start purging
*/
public Log4jWarningErrorMetricsAppender(int cleanupIntervalSeconds,
long messageAgeLimitSeconds, int maxUniqueMessages) {
super();
errors = new HashMap<>();
warnings = new HashMap<>();
errorsTimestampCount = new TreeMap<>();
warningsTimestampCount = new TreeMap<>();
errorsPurgeInformation = new TreeSet<>();
warningsPurgeInformation = new TreeSet<>();
cleanupTimer = new Timer();
cleanupInterval = cleanupIntervalSeconds * 1000;
cleanupTimer.schedule(new ErrorAndWarningsCleanup(), cleanupInterval);
this.messageAgeLimitSeconds = messageAgeLimitSeconds;
this.maxUniqueMessages = maxUniqueMessages;
this.setName(LOG_METRICS_APPENDER);
this.setThreshold(Level.WARN);
}
/**
* {@inheritDoc}
*/
@Override
protected void append(LoggingEvent event) {
String message = event.getRenderedMessage();
String[] throwableStr = event.getThrowableStrRep();
if (throwableStr != null) {
message = message + "\n" + StringUtils.join("\n", throwableStr);
message =
org.apache.commons.lang3.StringUtils.left(message, MAX_MESSAGE_SIZE);
}
int level = event.getLevel().toInt();
if (level == Level.WARN_INT || level == Level.ERROR_INT) {
// store second level information
Long eventTimeSeconds = event.getTimeStamp() / 1000;
Map> map;
SortedMap timestampsCount;
SortedSet purgeInformation;
if (level == Level.WARN_INT) {
map = warnings;
timestampsCount = warningsTimestampCount;
purgeInformation = warningsPurgeInformation;
} else {
map = errors;
timestampsCount = errorsTimestampCount;
purgeInformation = errorsPurgeInformation;
}
updateMessageDetails(message, eventTimeSeconds, map, timestampsCount,
purgeInformation);
}
}
private void updateMessageDetails(String message, Long eventTimeSeconds,
Map> map,
SortedMap timestampsCount,
SortedSet purgeInformation) {
synchronized (lock) {
if (map.containsKey(message)) {
SortedMap tmp = map.get(message);
Long lastMessageTime = tmp.lastKey();
int value = 1;
if (tmp.containsKey(eventTimeSeconds)) {
value = tmp.get(eventTimeSeconds) + 1;
}
tmp.put(eventTimeSeconds, value);
purgeInformation.remove(new PurgeElement(message, lastMessageTime));
} else {
SortedMap value = new TreeMap<>();
value.put(eventTimeSeconds, 1);
map.put(message, value);
if (map.size() > maxUniqueMessages * 2) {
cleanupTimer.cancel();
cleanupTimer = new Timer();
cleanupTimer.schedule(new ErrorAndWarningsCleanup(), 0);
}
}
purgeInformation.add(new PurgeElement(message, eventTimeSeconds));
int newValue = 1;
if (timestampsCount.containsKey(eventTimeSeconds)) {
newValue = timestampsCount.get(eventTimeSeconds) + 1;
}
timestampsCount.put(eventTimeSeconds, newValue);
}
}
/**
* {@inheritDoc}
*/
@Override
public void close() {
cleanupTimer.cancel();
}
/**
* {@inheritDoc}
*/
@Override
public boolean requiresLayout() {
return false;
}
/**
* Get the counts of errors in the time periods provided. Note that the counts
* provided by this function may differ from the ones provided by
* getErrorMessagesAndCounts since the message store is purged at regular
* intervals to prevent it from growing without bounds, while the store for
* the counts is purged less frequently.
*
* @param cutoffs
* list of timestamp cutoffs(in seconds) for which the counts are
* desired
* @return list of error counts in the time periods corresponding to cutoffs
*/
public List getErrorCounts(List cutoffs) {
return this.getCounts(errorsTimestampCount, cutoffs);
}
/**
* Get the counts of warnings in the time periods provided. Note that the
* counts provided by this function may differ from the ones provided by
* getWarningMessagesAndCounts since the message store is purged at regular
* intervals to prevent it from growing without bounds, while the store for
* the counts is purged less frequently.
*
* @param cutoffs
* list of timestamp cutoffs(in seconds) for which the counts are
* desired
* @return list of warning counts in the time periods corresponding to cutoffs
*/
public List getWarningCounts(List cutoffs) {
return this.getCounts(warningsTimestampCount, cutoffs);
}
private List getCounts(SortedMap map,
List cutoffs) {
List ret = new ArrayList<>();
Long largestCutoff = Collections.min(cutoffs);
for (int i = 0; i < cutoffs.size(); ++i) {
ret.add(0);
}
synchronized (lock) {
Map submap = map.tailMap(largestCutoff);
for (Map.Entry entry : submap.entrySet()) {
for (int i = 0; i < cutoffs.size(); ++i) {
if (entry.getKey() >= cutoffs.get(i)) {
int tmp = ret.get(i);
ret.set(i, tmp + entry.getValue());
}
}
}
}
return ret;
}
/**
* Get the errors and the number of occurrences for each of the errors for the
* time cutoffs provided. Note that the counts provided by this function may
* differ from the ones provided by getErrorCounts since the message store is
* purged at regular intervals to prevent it from growing without bounds,
* while the store for the counts is purged less frequently.
*
* @param cutoffs
* list of timestamp cutoffs(in seconds) for which the counts are
* desired
* @return list of maps corresponding for each cutoff provided; each map
* contains the error and the number of times the error occurred in
* the time period
*/
public List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy