// Source listing: io.atomix.copycat.server.storage.Log (Maven / Gradle / Ivy artifact page)
//
// Go to download
/*
 * Copyright 2015 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.atomix.copycat.server.storage;

import io.atomix.catalyst.serializer.Serializer;
import io.atomix.catalyst.util.Assert;
import io.atomix.catalyst.util.concurrent.CatalystThreadFactory;
import io.atomix.copycat.server.storage.compaction.Compactor;
import io.atomix.copycat.server.storage.entry.Entry;
import io.atomix.copycat.server.storage.entry.TypedEntryPool;

import java.util.concurrent.Executors;

/**
 * Stores Raft log entries in a segmented log in memory or on disk.
 * <p>
 * The log is the primary vehicle for storing state changes and managing replication in Raft. The log is used to verify
 * consistency between members and manage cluster configurations, client sessions, state machine operations, and other
 * tasks.
 * <p>
 * State changes are written to the log as {@link Entry} objects. Each entry is associated with an {@code index} and
 * {@code term}. The {@code index} is a 1-based entry index from the start of the log. The {@code term} is used for
 * various consistency checks in the Raft algorithm. Raft guarantees that a {@link #commit(long) committed} entry at any
 * index {@code i} that has term {@code t} will also be present in the logs on all other servers in the cluster at the
 * same index {@code i} with term {@code t}. However, note that log compaction may break this contract. Considering log
 * compaction, it's more accurate to say that iff committed entry {@code i} is present in another server's log, that
 * entry has term {@code t} and the same value.
 * <p>
 * Entries are written to the log via the {@link #append(Entry)} method. When an entry is appended, it's written to the
 * next sequential index in the log after {@link #lastIndex()}. Entries can be created from a typed entry pool with the
 * {@link #create(Class)} method.
 * <pre>
 *   {@code
 *   long index;
 *   try (CommandEntry entry = log.create(CommandEntry.class)) {
 *     entry.setTerm(2)
 *       .setSequence(5)
 *       .setCommand(new PutCommand());
 *     index = log.append(entry);
 *   }
 *   }
 * </pre>
 * <p>
 * {@link Entry entries} are appended to {@link Segment}s in the log. Segments are individual file or memory based
 * groups of sequential entries. Each segment has a fixed capacity in terms of either number of entries or size in
 * bytes. Once the capacity of a segment has been reached, the log rolls over to a new segment for the next entry that's
 * appended.
 * <p>
 * Internally, each segment maintains an in-memory index of entries. The index stores the offset and position of each
 * entry within the segment's internal {@link io.atomix.catalyst.buffer.Buffer}. For entries that are appended to the
 * log sequentially, the index has an O(1) lookup time. For instances where entries in a segment have been skipped (due
 * to log compaction), the lookup time is O(log n) due to binary search. However, due to the nature of the Raft
 * consensus algorithm, readers should typically benefit from O(1) lookups.
 * <p>
 * In order to prevent exhausting disk space, the log manages a set of background threads that periodically rewrite and
 * combine segments to free disk space. This is known as log compaction. As entries are committed to the log and applied
 * to the Raft state machine as {@link io.atomix.copycat.server.Commit} objects, state machines {@link #clean(long)}
 * entries that no longer apply to the state machine state. Internally, each log {@link Segment} maintains a compact
 * {@link io.atomix.catalyst.buffer.util.BitArray} to track cleaned entries. When an entry is cleaned, the entry's
 * offset is set in the bit array for the associated segment. The bit array represents the state of entries waiting to
 * be compacted from the log.
 * <p>
 * As entries are written to the log, segments reach their capacity and the log rolls over into new segments. Once a
 * segment is full and all of its entries have been {@link #commit(long) committed}, indicating they cannot be removed,
 * the segment becomes eligible for compaction. Log compaction processes come in two forms:
 * {@link io.atomix.copycat.server.storage.compaction.Compaction#MINOR} and
 * {@link io.atomix.copycat.server.storage.compaction.Compaction#MAJOR}, which can be configured in the {@link Storage}
 * configuration. Minor and major compaction serve to remove normal entries and tombstones from the log respectively.
 * <p>
 * Minor compaction is the more frequent and lightweight process. Periodically, according to the configured
 * {@link Storage#minorCompactionInterval()}, a background thread will evaluate the log for minor compaction. The minor
 * compaction process iterates through segments and selects compactable segments based on the ratio of entries that have
 * been {@link #clean(long) cleaned}. Minor compaction is generational. The
 * {@link io.atomix.copycat.server.storage.compaction.MinorCompactionManager} is more likely to select segments that haven't
 * yet been compacted than ones that have. Once a set of segments has been selected, for each segment a
 * {@link io.atomix.copycat.server.storage.compaction.MinorCompactionTask} rewrites the segment without cleaned entries.
 * This rewriting results in a segment with missing entries, and Copycat's Raft implementation accounts for that. For
 * instance, a segment with entries {@code {1, 2, 3}} can become {@code {1, 3}} after being cleaned, and any attempt to
 * {@link #get(long) read} entry {@code 2} will result in a {@code null} entry.
 * <p>
 * However, note that minor compaction only applies to non-tombstone entries. Tombstones are entries that represent the
 * removal of state from the system, and that requires a more careful and costly compaction process to ensure consistency
 * in the event of a failure. Consider a state machine with the following two commands in the log:
 * <ul>
 * <li>{@code put 1}</li>
 * <li>{@code remove 1}</li>
 * </ul>
 * If the first command is written to segment {@code 1}, and the second command is written to segment {@code 2},
 * compacting segment {@code 2} (minor compaction may compact segments in any order) without removing the first command
 * from segment {@code 1} would effectively result in the undoing of the {@code remove 1} command. If the {@code remove 1}
 * command is removed from the log before {@code put 1}, a restart and replay of the log will result in the application of
 * {@code put 1} to the state machine, but not {@code remove 1}, thus resulting in an inconsistent state machine state.
 * <p>
 * Copycat handles tombstones by allowing tombstone entries to be flagged with the {@link Entry#isTombstone()} boolean.
 * The minor compaction process plainly ignores tombstone entries and leaves them up to the major compaction process to
 * handle. Major compaction works similarly to minor compaction in that the configured
 * {@link Storage#majorCompactionInterval()} dictates the interval at which the major compaction process runs. During
 * major compaction, the {@link io.atomix.copycat.server.storage.compaction.MajorCompactionManager} iterates through
 * all {@link #commit(long) committed} segments and rewrites them sequentially with all cleaned entries
 * removed, including tombstones. This ensures that earlier segments are compacted before later segments, and so
 * stateful entries that were {@link #clean(long) cleaned} prior to related tombstones are guaranteed to be removed
 * first.
 * <p>
 * As entries are removed from the log during minor and major compaction, log segment files begin to shrink. Copycat
 * does not want to have a thousand file pointers open, so some mechanism is required to combine segments as disk space
 * is freed. To that end, as the major compaction process iterates through the set of committed segments and rewrites
 * live entries, it combines multiple segments up to the configured segment capacity. When a segment becomes full during
 * major compaction, the compaction process rolls over to a new segment and continues compaction. This results in a
 * significantly smaller number of files.
 *
 * @author Jordan Halterman
 */
public class Log implements AutoCloseable {
  // Manages the log's segments. Declared without an access modifier, so it is visible to the whole package.
  final SegmentManager segments;
  // Compactor built in the constructor from the storage configuration and a scheduled thread pool.
  private final Compactor compactor;
  // Pool from which create(Class) acquires typed entries.
  private final TypedEntryPool entryPool = new TypedEntryPool();
  // Starts out true and is set to false by close(). Plain (non-volatile) field.
  private boolean open = true;

  /**
   * @throws NullPointerException if {@code name} or {@code storage} is null
   */
  protected Log(String name, Storage storage) {
    this.segments = new SegmentManager(name, storage);
    this.compactor = new Compactor(storage, segments, Executors.newScheduledThreadPool(storage.compactionThreads(),
        new CatalystThreadFactory("copycat-compactor-%d")));
  }

  /**
   * Exposes the compactor that was created together with this log.
   *
   * @return The log compactor.
   */
  public Compactor compactor() {
    return this.compactor;
  }

  /**
   * Exposes the serializer held by the underlying segment manager.
   *
   * @return The log entry serializer.
   */
  public Serializer serializer() {
    return this.segments.serializer();
  }

  /**
   * Reports whether the log is currently open.
   *
   * @return {@code true} while the log is open, {@code false} otherwise.
   */
  public boolean isOpen() {
    return this.open;
  }

  /**
   * Guards operations that require an open log.
   *
   * @throws IllegalStateException if the log is not open
   */
  private void assertIsOpen() {
    boolean opened = isOpen();
    Assert.state(opened, "log is not open");
  }

  /**
   * Guards operations that require an index inside the bounds of the log, as decided by {@link #validIndex(long)}.
   *
   * @param index The index to check.
   * @throws IndexOutOfBoundsException if the {@code index} is out of bounds
   */
  private void assertValidIndex(long index) {
    boolean inBounds = validIndex(index);
    Assert.index(inBounds, "invalid log index: %d", index);
  }

  /**
   * Reports whether the log is empty, which is decided by asking the first segment whether it is empty.
   *
   * @return Indicates whether the log is empty.
   * @throws IllegalStateException If the log is not open.
   */
  public boolean isEmpty() {
    assertIsOpen();
    Segment head = segments.firstSegment();
    return head.isEmpty();
  }

  /**
   * Adds up the size reported by every {@link Segment segment} of the log.
   *
   * @return The total size of all {@link Segment segments} of the log in bytes.
   * @throws IllegalStateException If the log is not open.
   */
  public long size() {
    assertIsOpen();
    long total = 0;
    for (Segment segment : segments.segments()) {
      total += segment.size();
    }
    return total;
  }

  /**
   * Adds up the length reported by every {@link Segment segment} of the log.
   * <p>
   * The length is the total number of {@link Entry entries} represented by the log. This includes entries that have
   * been compacted from the log, so the length describes the total range of indexes rather than the live entries.
   *
   * @return The number of entries in the log.
   * @throws IllegalStateException If the log is not open.
   */
  public long length() {
    assertIsOpen();
    long total = 0;
    for (Segment segment : segments.segments()) {
      total += segment.length();
    }
    return total;
  }

  /**
   * Returns the log's current first index.
   * <p>
   * An empty log reports {@code 0}. Otherwise the value is the index recorded in the first segment's descriptor.
   *
   * @return The index of the first entry in the log or {@code 0} if the log is empty.
   * @throws IllegalStateException If the log is not open.
   */
  public long firstIndex() {
    if (isEmpty()) {
      return 0;
    }
    return segments.firstSegment().descriptor().index();
  }

  /**
   * Returns the index of the last entry in the log.
   * <p>
   * An empty log reports {@code 0}. Otherwise the value is the last index reported by the last segment.
   *
   * @return The index of the last entry in the log or {@code 0} if the log is empty.
   * @throws IllegalStateException If the log is not open.
   */
  public long lastIndex() {
    if (isEmpty()) {
      return 0;
    }
    return segments.lastSegment().lastIndex();
  }

  /**
   * Rolls over to the next segment when the current segment reports that it is full.
   */
  private void checkRoll() {
    Segment active = segments.currentSegment();
    if (active.isFull()) {
      segments.nextSegment();
    }
  }

  /**
   * Creates a new log entry.
   * 

   * Users should ensure that the returned {@link Entry} is closed once the write is complete. Closing the entry will
   * result in its contents being persisted to the log. Only a single {@link Entry} instance may be open via the this
   * method at any given time.
   *
   * @param type The entry type.
   * @return The log entry.
   * @throws IllegalStateException If the log is not open
   * @throws NullPointerException If the {@code type} is {@code null}
   */
  public > T create(Class type) {
    Assert.notNull(type, "type");
    assertIsOpen();
    checkRoll();
    return entryPool.acquire(type, segments.currentSegment().nextIndex());
  }

  /**
   * Appends an entry to the log.
   * <p>
   * If the current segment is full the log rolls over to a new segment first; the entry is then appended to the
   * current segment.
   *
   * @param entry The entry to append.
   * @return The appended entry index.
   * @throws IllegalStateException If the log is not open
   * @throws NullPointerException If {@code entry} is {@code null}
   * @throws IndexOutOfBoundsException If the entry's index does not match the expected next log index.
   */
  public long append(Entry entry) {
    Assert.notNull(entry, "entry");
    assertIsOpen();
    checkRoll();

    // After a possible roll-over the current segment is the one that receives the entry.
    Segment target = segments.currentSegment();
    return target.append(entry);
  }

  /**
   * Gets an entry from the log at the given index.
   * <p>
   * If the given index is outside of the bounds of the log then a {@link IndexOutOfBoundsException} will be thrown. If
   * the entry at the given index has been compacted then the returned entry will be {@code null}.
   * <p>
   * Entries returned by this method are pooled and {@link io.atomix.catalyst.util.ReferenceCounted reference counted}.
   * In order to ensure the entry is released back to the internal entry pool call {@link Entry#close()} or load the
   * entry in a try-with-resources statement.
   * <pre>
   *   {@code
   *   try (RaftEntry entry = log.get(123)) {
   *     // Do some stuff...
   *   }
   *   }
   * </pre>
   *
   * @param index The index of the entry to get.
   * @param <T> The expected entry type.
   * @return The entry at the given index or {@code null} if the entry doesn't exist.
   * @throws IllegalStateException If the log is not open.
   * @throws IndexOutOfBoundsException If the given index is not within the bounds of the log.
   */
  public <T extends Entry> T get(long index) {
    assertIsOpen();
    assertValidIndex(index);

    Segment segment = segments.segment(index);
    // An index inside the log bounds may still have no segment; treat that as out of bounds too.
    Assert.index(segment != null, "invalid index: %d", index);
    return segment.get(index);
  }

  /**
   * Returns a boolean value indicating whether the given index is within the bounds of the log.
   * <p>
   * An empty log has no valid indexes. Otherwise the index is valid when it lies between {@link Log#firstIndex()} and
   * {@link Log#lastIndex()}, both inclusive.
   *
   * @param index The index to check.
   * @return Indicates whether the given index is within the bounds of the log.
   * @throws IllegalStateException If the log is not open.
   */
  private boolean validIndex(long index) {
    long lowest = firstIndex();
    long highest = lastIndex();
    if (isEmpty()) {
      return false;
    }
    return index >= lowest && index <= highest;
  }

  /**
   * Returns a boolean value indicating whether the log contains a live entry at the given index.
   * <p>
   * The index must be within the bounds of the log, a segment must exist for it, and that segment must report that it
   * contains the index.
   *
   * @param index The index to check.
   * @return Indicates whether the log contains a live entry at the given index.
   * @throws IllegalStateException If the log is not open.
   */
  public boolean contains(long index) {
    if (validIndex(index)) {
      Segment owner = segments.segment(index);
      return owner != null && owner.contains(index);
    }
    return false;
  }

  /**
   * Cleans the entry at the given index by delegating to the segment that holds the index.
   *
   * @param index The index of the entry to clean.
   * @return The log.
   * @throws IllegalStateException If the log is not open.
   * @throws IndexOutOfBoundsException If the given index is not within the bounds of the log.
   */
  public Log clean(long index) {
    assertIsOpen();
    assertValidIndex(index);

    Segment owner = segments.segment(index);
    boolean found = owner != null;
    Assert.index(found, "invalid index: " + index);
    owner.clean(index);
    return this;
  }

  /**
   * Commits entries up to the given index to the log by recording the index as the segment manager's commit index.
   *
   * @param index The index up to which to commit entries.
   * @return The log.
   * @throws IllegalStateException If the log is not open.
   */
  public Log commit(long index) {
    assertIsOpen();
    this.segments.commitIndex(index);
    return this;
  }

  /**
   * Skips the given number of entries.
   * <p>
   * This method essentially advances the log's {@link Log#lastIndex()} without writing any entries at the interim
   * indices. Note that calling {@link #truncate(long)} after {@code skip()} will result in the skipped entries
   * being partially or completely reverted.
   * <p>
   * NOTE(review): this method performs no argument validation itself; the argument and bounds exceptions listed
   * below would have to come from {@code Segment#skip} - confirm against that class.
   *
   * @param entries The number of entries to skip.
   * @return The log.
   * @throws IllegalStateException If the log is not open.
   * @throws IllegalArgumentException If the number of entries is less than {@code 1}
   * @throws IndexOutOfBoundsException If skipping the given number of entries places the index out of the bounds of the
   *           log.
   */
  public Log skip(long entries) {
    assertIsOpen();
    segments.currentSegment().skip(entries);
    return this;
  }

  /**
   * Truncates the log to the given index.
   * <p>
   * An index of {@code 0} is accepted without a bounds check; any positive index must be within the bounds of the log.
   * The index may not be lower than the segment manager's commit index. If the index already equals
   * {@link #lastIndex()} nothing is changed. Otherwise the segment that holds the index (or, for index {@code 0}, the
   * first segment) is truncated to the index, and segments whose descriptor index is greater than the index are
   * removed.
   *
   * @param index The index at which to truncate the log.
   * @return The updated log.
   * @throws IllegalStateException If the log is not open.
   * @throws IndexOutOfBoundsException If the given index is not within the bounds of the log, or is lower than the
   *           commit index.
   */
  public Log truncate(long index) {
    assertIsOpen();
    // Index 0 is exempt from the bounds check (validIndex would reject it when firstIndex() is 1 or more).
    if (index > 0)
      assertValidIndex(index);
    Assert.index(index >= segments.commitIndex(), "cannot truncate committed entries");

    // Nothing to do when the log already ends at the requested index.
    if (lastIndex() == index)
      return this;

    boolean first = true;
    // NOTE(review): segments are removed while iterating segments.segments(); this relies on that collection
    // tolerating removal during iteration - confirm against SegmentManager.
    for (Segment segment : segments.segments()) {
      // && binds tighter than ||: truncate the first visited segment when index == 0, or any segment holding index.
      if (first && index == 0 || segment.validIndex(index)) {
        segment.truncate(index);
        first = false;
      } else if (segment.descriptor().index() > index) {
        // The segment starts after the truncation point, so it is dropped.
        segments.removeSegment(segment);
      }
    }
    return this;
  }

  /**
   * Flushes the log by flushing the current segment.
   *
   * @throws IllegalStateException If the log is not open.
   */
  public void flush() {
    assertIsOpen();
    Segment active = segments.currentSegment();
    active.flush();
  }

  /**
   * Closes the log.
   * <p>
   * The current segment is flushed, then the segment manager and the compactor are closed and the log is marked as
   * closed. Each step runs even if an earlier one throws, so a failure while flushing or closing segments does not
   * leave the compactor's thread pool running. The first exception thrown is propagated unless a later step also
   * throws, in which case the later exception replaces it.
   *
   * @throws IllegalStateException If the log is not open.
   */
  @Override
  public void close() {
    assertIsOpen();
    try {
      flush();
    } finally {
      try {
        segments.close();
      } finally {
        try {
          compactor.close();
        } finally {
          // Mark the log closed even on failure; its resources have been torn down as far as possible.
          open = false;
        }
      }
    }
  }

  /**
   * Reports whether the log has been closed; this is the negation of the open flag.
   *
   * @return {@code true} once the log is closed, {@code false} while it is open.
   */
  public boolean isClosed() {
    return !this.open;
  }

  /**
   * Deletes the log by delegating to the segment manager.
   * <p>
   * This method does not check whether the log is open and does not touch the compactor.
   */
  public void delete() {
    this.segments.delete();
  }

  /**
   * Renders the log as its simple class name followed by its segment manager, e.g. {@code Log[segments=...]}.
   */
  @Override
  public String toString() {
    String typeName = getClass().getSimpleName();
    return String.format("%s[segments=%s]", typeName, segments);
  }

}