/**
* This file is part of Graylog.
*
* Graylog is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Graylog is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Graylog. If not, see <http://www.gnu.org/licenses/>.
*/
package org.graylog2.shared.journal;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.Meter;
import com.codahale.metrics.Metric;
import com.codahale.metrics.MetricFilter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.github.joschi.jadconfig.util.Size;
import com.google.common.base.Charsets;
import com.google.common.base.Throwables;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.Files;
import com.google.common.primitives.Ints;
import com.google.common.util.concurrent.AbstractIdleService;
import com.google.common.util.concurrent.Uninterruptibles;
import kafka.common.KafkaException;
import kafka.common.OffsetOutOfRangeException;
import kafka.common.TopicAndPartition;
import kafka.log.CleanerConfig;
import kafka.log.Log;
import kafka.log.LogConfig;
import kafka.log.LogManager;
import kafka.log.LogSegment;
import kafka.message.ByteBufferMessageSet;
import kafka.message.Message;
import kafka.message.MessageAndOffset;
import kafka.message.MessageSet;
import kafka.server.BrokerState;
import kafka.server.RunningAsBroker;
import kafka.utils.KafkaScheduler;
import kafka.utils.Time;
import kafka.utils.Utils;
import org.graylog2.plugin.GlobalMetricNames;
import org.graylog2.plugin.ThrottleState;
import org.graylog2.shared.metrics.HdrTimer;
import org.joda.time.DateTimeUtils;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;
import scala.collection.Iterator;
import scala.collection.JavaConversions;
import scala.collection.Map$;
import scala.runtime.AbstractFunction1;
import javax.inject.Inject;
import javax.inject.Named;
import javax.inject.Singleton;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.SyncFailedException;
import java.nio.channels.ClosedByInterruptException;
import java.nio.file.AccessDeniedException;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import static com.codahale.metrics.MetricRegistry.name;
import static com.github.joschi.jadconfig.util.Size.megabytes;
import static java.util.concurrent.TimeUnit.DAYS;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.NANOSECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.graylog2.plugin.Tools.bytesToHex;
@Singleton
public class KafkaJournal extends AbstractIdleService implements Journal {
private static final Logger LOG = LoggerFactory.getLogger(KafkaJournal.class);
private static final int NUM_IO_THREADS = 1;
public static final long DEFAULT_COMMITTED_OFFSET = Long.MIN_VALUE;
public static final int NOTIFY_ON_UTILIZATION_PERCENTAGE = 95;
// this exists so we can use JodaTime's millis provider in tests.
// kafka really only cares about the milliseconds() method in here
private static final Time JODA_TIME = new Time() {
@Override
public long milliseconds() {
return DateTimeUtils.currentTimeMillis();
}
@Override
public long nanoseconds() {
return System.nanoTime();
}
@Override
public void sleep(long ms) {
Uninterruptibles.sleepUninterruptibly(ms, MILLISECONDS);
}
};
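// Illustrative sketch (test-only usage, an assumption rather than code in this class): since segment
// rolling and retention consult JODA_TIME, a test can pin the clock through Joda-Time's millis provider:
//   DateTimeUtils.setCurrentMillisFixed(fixedInstantMillis); // fixedInstantMillis is a hypothetical value
//   // ... exercise the journal; time-based decisions now see the fixed instant ...
//   DateTimeUtils.setCurrentMillisSystem();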
private final LogManager logManager;
private final Log kafkaLog;
private final File committedReadOffsetFile;
private final AtomicLong committedOffset = new AtomicLong(DEFAULT_COMMITTED_OFFSET);
private final ScheduledExecutorService scheduler;
private final Timer writeTime;
private final Timer readTime;
private final KafkaScheduler kafkaScheduler;
private final Meter writtenMessages;
private final Meter readMessages;
private final OffsetFileFlusher offsetFlusher;
private final DirtyLogFlusher dirtyLogFlusher;
private final RecoveryCheckpointFlusher recoveryCheckpointFlusher;
private final LogRetentionCleaner logRetentionCleaner;
private long nextReadOffset = 0L;
private ScheduledFuture<?> checkpointFlusherFuture;
private ScheduledFuture<?> dirtyLogFlushFuture;
private ScheduledFuture<?> logRetentionFuture;
private ScheduledFuture<?> offsetFlusherFuture;
private volatile boolean shuttingDown;
private final AtomicReference<ThrottleState> throttleState = new AtomicReference<>();
private final AtomicInteger purgedSegmentsInLastRetention = new AtomicInteger();
@Inject
public KafkaJournal(@Named("message_journal_dir") File journalDirectory,
@Named("scheduler") ScheduledExecutorService scheduler,
@Named("message_journal_segment_size") Size segmentSize,
@Named("message_journal_segment_age") Duration segmentAge,
@Named("message_journal_max_size") Size retentionSize,
@Named("message_journal_max_age") Duration retentionAge,
@Named("message_journal_flush_interval") long flushInterval,
@Named("message_journal_flush_age") Duration flushAge,
MetricRegistry metricRegistry) {
this.scheduler = scheduler;
this.writtenMessages = metricRegistry.meter(name(this.getClass(), "writtenMessages"));
this.readMessages = metricRegistry.meter(name(this.getClass(), "readMessages"));
registerUncommittedGauge(metricRegistry, name(this.getClass(), "uncommittedMessages"));
// the registerHdrTimer helper doesn't throw on existing metrics
this.writeTime = registerHdrTimer(metricRegistry, name(this.getClass(), "writeTime"));
this.readTime = registerHdrTimer(metricRegistry, name(this.getClass(), "readTime"));
// these are the default values as per kafka 0.8.1.1
final LogConfig defaultConfig =
new LogConfig(
// segmentSize: The soft maximum for the size of a segment file in the log
Ints.saturatedCast(segmentSize.toBytes()),
// segmentMs: The soft maximum on the amount of time before a new log segment is rolled
segmentAge.getMillis(),
// segmentJitterMs: The maximum random jitter subtracted from segmentMs to avoid thundering herds of segment rolling
0,
// flushInterval: The number of messages that can be written to the log before a flush is forced
flushInterval,
// flushMs: The amount of time the log can have dirty data before a flush is forced
flushAge.getMillis(),
// retentionSize: The approximate total number of bytes this log can use
retentionSize.toBytes(),
// retentionMs: The approximate maximum age of the last segment that is retained
retentionAge.getMillis(),
// maxMessageSize: The maximum size of a message in the log
Integer.MAX_VALUE,
// maxIndexSize: The maximum size of an index file
Ints.saturatedCast(megabytes(1L).toBytes()),
// indexInterval: The approximate number of bytes between index entries
4096,
// fileDeleteDelayMs: The time to wait before deleting a file from the filesystem
MINUTES.toMillis(1L),
// deleteRetentionMs: The time to retain delete markers in the log. Only applicable for logs that are being compacted.
DAYS.toMillis(1L),
// minCleanableRatio: The ratio of bytes that are available for cleaning to the bytes already cleaned
0.5,
// compact: Should old segments in this log be deleted or de-duplicated?
false,
// uncleanLeaderElectionEnable: Indicates whether unclean leader election is enabled; actually a controller-level property
// but included here for topic-specific configuration validation purposes
true,
// minInSyncReplicas: If the number of in-sync replicas drops below this number, we stop accepting writes with -1 (or all) required acks
0
);
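// Worked example (hypothetical sizes, not the defaults): with message_journal_segment_size = 100mb and
// message_journal_max_size = 5gb, a new segment is rolled roughly every 100 MB written and retention
// deletes the oldest segments once the total size approaches 5 GB, i.e. on the order of 50 segments are kept.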
// these are the default values as per kafka 0.8.1.1, except we don't turn on the cleaner
// Cleaner really is log compaction with respect to "deletes" in the log.
// we never insert a message twice, at least not on purpose, so we do not "clean" logs, ever.
final CleanerConfig cleanerConfig =
new CleanerConfig(
// numThreads: The number of cleaner threads to run
1,
// dedupeBufferSize: The total memory used for log deduplication
megabytes(4L).toBytes(),
// dedupeBufferLoadFactor: The maximum percent full for the deduplication buffer
0.9d,
// ioBufferSize: The total memory used for log cleaner I/O buffers
Ints.saturatedCast(megabytes(1L).toBytes()),
// maxMessageSize: The maximum size of a message that can appear in the log
Ints.saturatedCast(megabytes(32L).toBytes()),
// maxIoBytesPerSecond: The maximum read and write I/O that all cleaner threads are allowed to do
Ints.saturatedCast(megabytes(5L).toBytes()),
// backOffMs: The amount of time to wait before rechecking if no logs are eligible for cleaning
SECONDS.toMillis(15L),
// enableCleaner: Allows completely disabling the log cleaner; we keep it off (see above)
false,
// hashAlgorithm: The hash algorithm to use in key comparison
"MD5");
if (!journalDirectory.exists() && !journalDirectory.mkdirs()) {
LOG.error("Cannot create journal directory at {}, please check the permissions",
journalDirectory.getAbsolutePath());
Throwables.propagate(new AccessDeniedException(journalDirectory.getAbsolutePath(), null, "Could not create journal directory."));
}
// TODO add check for directory, etc
committedReadOffsetFile = new File(journalDirectory, "graylog2-committed-read-offset");
try {
if (!committedReadOffsetFile.createNewFile()) {
final String line = Files.readFirstLine(committedReadOffsetFile, Charsets.UTF_8);
// the file contains the last offset graylog2 has successfully processed.
// thus the nextReadOffset is one beyond that number
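// e.g. a file containing "41" yields committedOffset = 41 and nextReadOffset = 42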
if (line != null) {
committedOffset.set(Long.parseLong(line.trim()));
nextReadOffset = committedOffset.get() + 1;
}
}
} catch (IOException e) {
LOG.error("Cannot access offset file: {}", e.getMessage());
Throwables.propagate(new AccessDeniedException(committedReadOffsetFile.getAbsolutePath(),
null,
e.getMessage()));
}
try {
final BrokerState brokerState = new BrokerState();
brokerState.newState(RunningAsBroker.state());
kafkaScheduler = new KafkaScheduler(2, "kafka-journal-scheduler-", false); // TODO make thread count configurable
kafkaScheduler.startup();
logManager = new LogManager(
new File[]{journalDirectory},
Map$.MODULE$.<String, LogConfig>empty(),
defaultConfig,
cleanerConfig,
NUM_IO_THREADS,
SECONDS.toMillis(60L),
SECONDS.toMillis(60L),
SECONDS.toMillis(60L),
kafkaScheduler,
// Broker state
brokerState,
JODA_TIME);
final TopicAndPartition topicAndPartition = new TopicAndPartition("messagejournal", 0);
final Option<Log> messageLog = logManager.getLog(topicAndPartition);
if (messageLog.isEmpty()) {
kafkaLog = logManager.createLog(topicAndPartition, logManager.defaultConfig());
} else {
kafkaLog = messageLog.get();
}
LOG.info("Initialized Kafka based journal at {}", journalDirectory);
setupKafkaLogMetrics(metricRegistry);
offsetFlusher = new OffsetFileFlusher();
dirtyLogFlusher = new DirtyLogFlusher();
recoveryCheckpointFlusher = new RecoveryCheckpointFlusher();
logRetentionCleaner = new LogRetentionCleaner();
} catch (KafkaException e) {
// most likely failed to grab lock
LOG.error("Unable to start logmanager.", e);
throw new RuntimeException(e);
}
}
private Timer registerHdrTimer(MetricRegistry metricRegistry, final String metricName) {
Timer timer;
try {
timer = metricRegistry.register(metricName, new HdrTimer(1, TimeUnit.MINUTES, 1));
} catch (IllegalArgumentException e) {
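// a metric with this name is already registered (e.g. when the journal is instantiated more than
// once against the same registry); look up and reuse the existing timer instead of failing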
final SortedMap<String, Timer> timers = metricRegistry.getTimers(new MetricFilter() {
@Override
public boolean matches(String name, Metric metric) {
return metricName.equals(name);
}
});
timer = Iterables.getOnlyElement(timers.values());
}
return timer;
}
private void registerUncommittedGauge(MetricRegistry metricRegistry, String name) {
try {
metricRegistry.register(name,
new Gauge<Long>() {
@Override
public Long getValue() {
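// logEndOffset points at the next offset to be written, so the last written offset is
// logEndOffset - 1; everything past the committed offset is still uncommitted
// (clamped to 0 for an empty or freshly created journal)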
return Math.max(0, getLogEndOffset() - 1 - committedOffset.get());
}
});
} catch (IllegalArgumentException ignored) {
// already registered, we'll ignore that.
}
}
public int getPurgedSegmentsInLastRetention() {
return purgedSegmentsInLastRetention.get();
}
private void setupKafkaLogMetrics(final MetricRegistry metricRegistry) {
metricRegistry.register(name(KafkaJournal.class, "size"), new Gauge<Long>() {
@Override
public Long getValue() {
return kafkaLog.size();
}
});
metricRegistry.register(name(KafkaJournal.class, "logEndOffset"), new Gauge<Long>() {
@Override
public Long getValue() {
return kafkaLog.logEndOffset();
}
});
metricRegistry.register(name(KafkaJournal.class, "numberOfSegments"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return kafkaLog.numberOfSegments();
}
});
metricRegistry.register(name(KafkaJournal.class, "unflushedMessages"), new Gauge<Long>() {
@Override
public Long getValue() {
return kafkaLog.unflushedMessages();
}
});
metricRegistry.register(name(KafkaJournal.class, "recoveryPoint"), new Gauge<Long>() {
@Override
public Long getValue() {
return kafkaLog.recoveryPoint();
}
});
metricRegistry.register(name(KafkaJournal.class, "lastFlushTime"), new Gauge<Long>() {
@Override
public Long getValue() {
return kafkaLog.lastFlushTime();
}
});
metricRegistry.register(GlobalMetricNames.JOURNAL_OLDEST_SEGMENT, new Gauge<Date>() {
@Override
public Date getValue() {
long oldestSegment = Long.MAX_VALUE;
for (final LogSegment segment : getSegments()) {
oldestSegment = Math.min(oldestSegment, segment.created());
}
return new Date(oldestSegment);
}
});
}
/**
* Creates an opaque object which can be passed to {@link #write(java.util.List)} for a bulk journal write.
*
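* <p>Illustrative usage (a sketch; {@code journal} is a {@link KafkaJournal} instance and the byte arrays
* are assumed to be produced by the caller):
* <pre>{@code
* final Entry entry = journal.createEntry(idBytes, messageBytes);
* final long lastOffset = journal.write(Collections.singletonList(entry));
* }</pre>
*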
* @param idBytes a byte array which represents the key for the entry
* @param messageBytes the journal entry's payload, i.e. the message itself
* @return a journal entry to be passed to {@link #write(java.util.List)}
*/
@Override
public Entry createEntry(byte[] idBytes, byte[] messageBytes) {
return new Entry(idBytes, messageBytes);
}
/**
* Writes the list of entries to the journal.
*
* @param entries journal entries to be written
* @return the last position written to in the journal
*/
@Override
public long write(List<Entry> entries) {
try (Timer.Context ignored = writeTime.time()) {
long payloadSize = 0L;
final List<Message> messages = Lists.newArrayListWithCapacity(entries.size());
for (final Entry entry : entries) {
final byte[] messageBytes = entry.getMessageBytes();
final byte[] idBytes = entry.getIdBytes();
payloadSize += messageBytes.length;
messages.add(new Message(messageBytes, idBytes));
if (LOG.isTraceEnabled()) {
LOG.trace("Message {} contains bytes {}", bytesToHex(idBytes), bytesToHex(messageBytes));
}
}
final ByteBufferMessageSet messageSet = new ByteBufferMessageSet(JavaConversions.asScalaBuffer(messages));
final Log.LogAppendInfo appendInfo = kafkaLog.append(messageSet, true);
long lastWriteOffset = appendInfo.lastOffset();
LOG.debug("Wrote {} messages to journal: {} bytes, log position {} to {}",
entries.size(), payloadSize, appendInfo.firstOffset(), lastWriteOffset);
writtenMessages.mark(entries.size());
return lastWriteOffset;
}
}
/**
* Writes a single message to the journal and returns the new write position
*
* @param idBytes byte array containing the message id
* @param messageBytes encoded message payload
* @return the last position written to in the journal
*/
@Override
public long write(byte[] idBytes, byte[] messageBytes) {
final Entry journalEntry = createEntry(idBytes, messageBytes);
return write(Collections.singletonList(journalEntry));
}
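/**
 * Reads up to {@code requestedMaximumCount} entries from the journal, starting at the current read offset.
 * At least one entry is requested even if a smaller maximum is passed; an empty list is returned while the
 * journal is shutting down.
 *
 * @param requestedMaximumCount the maximum number of entries to read
 * @return the entries read from the journal, possibly empty
 */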
@Override
public List<JournalReadEntry> read(long requestedMaximumCount) {
return read(nextReadOffset, requestedMaximumCount);
}
public List<JournalReadEntry> read(long readOffset, long requestedMaximumCount) {
// Always read at least one!
final long maximumCount = Math.max(1, requestedMaximumCount);
long maxOffset = readOffset + maximumCount;
final List<JournalReadEntry> messages = Lists.newArrayListWithCapacity((int) (maximumCount));
if (shuttingDown) {
return messages;
}
try (Timer.Context ignored = readTime.time()) {
final long logStartOffset = getLogStartOffset();
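// the log start offset advances whenever retention deletes old segments; if our read position has
// fallen behind it (e.g. we committed offset 50 but the oldest retained offset is now 100),
// skip ahead and continue reading from the start of the log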
if (readOffset < logStartOffset) {
LOG.error(
"Read offset {} before start of log at {}, starting to read from the beginning of the journal.",
readOffset,
logStartOffset);
readOffset = logStartOffset;
maxOffset = readOffset + maximumCount;
}
LOG.debug("Requesting to read a maximum of {} messages (or 5MB) from the journal, offset interval [{}, {})",
maximumCount, readOffset, maxOffset);
// TODO benchmark and make read-ahead strategy configurable for performance tuning
final MessageSet messageSet = kafkaLog.read(readOffset,
5 * 1024 * 1024,
Option.