/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kafka.raft;
import org.apache.kafka.raft.errors.BufferAllocationException;
import org.apache.kafka.raft.errors.NotLeaderException;
import org.apache.kafka.raft.errors.UnexpectedBaseOffsetException;
import org.apache.kafka.snapshot.SnapshotReader;
import org.apache.kafka.snapshot.SnapshotWriter;
import java.util.List;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.OptionalLong;
import java.util.concurrent.CompletableFuture;
public interface RaftClient<T> extends AutoCloseable {
interface Listener<T> {
/**
* Callback which is invoked for all records committed to the log.
* It is the responsibility of this implementation to invoke {@link BatchReader#close()}
* after consuming the reader.
*
* Note that there is not a one-to-one correspondence between writes through
* {@link #scheduleAppend(int, List)} or {@link #scheduleAtomicAppend(int, OptionalLong, List)}
* and this callback. The Raft implementation is free to batch together the records
* from multiple append calls provided that batch boundaries are respected. Records
* specified through {@link #scheduleAtomicAppend(int, OptionalLong, List)} are guaranteed to be a
* subset of a batch provided by the {@link BatchReader}. Records specified through
* {@link #scheduleAppend(int, List)} are guaranteed to be in the same order but
* they can map to any number of batches provided by the {@link BatchReader}.
*
* @param reader reader instance which must be iterated and closed
*/
void handleCommit(BatchReader<T> reader);
/**
* Callback which is invoked when the Listener needs to load a snapshot.
* It is the responsibility of this implementation to invoke {@link SnapshotReader#close()}
* after consuming the reader.
*
* When handling this call, the implementation must assume that all previous calls
* to {@link #handleCommit} contain invalid data.
*
* @param reader snapshot reader instance which must be iterated and closed
*/
void handleLoadSnapshot(SnapshotReader<T> reader);
/**
* Called on any change to leadership. This includes both when a leader is elected and
* when a leader steps down or fails.
*
* If this node is the leader, then the notification of leadership will be delayed until
* the implementation of this interface has caught up to the high-watermark through calls to
* {@link #handleLoadSnapshot(SnapshotReader)} and {@link #handleCommit(BatchReader)}.
*
* If this node is not the leader, then this method will be called as soon as possible. In
* this case the leader may or may not be known for the current epoch.
*
* Subsequent calls to this method will expose a monotonically increasing epoch. For a
* given epoch the leader may be unknown ({@code leader.leaderId} is {@code OptionalInt.empty()})
* or known ({@code leader.leaderId} is present). Once a leader is known for
* a given epoch it will remain the leader for that epoch. In other words, the implementation
* of this interface should expect this method to be called at most twice per epoch: once if
* the epoch changed but the leader is not yet known, and once when the leader becomes known
* for the current epoch.
*
* @param leader the current leader and epoch
*/
default void handleLeaderChange(LeaderAndEpoch leader) {}
/**
 * Callback which is invoked when the client is beginning an orderly shutdown.
 */
default void beginShutdown() {}
}
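/*
 * A minimal sketch (not part of the interface) of a Listener implementation that
 * honors the contracts above: each reader is fully drained and then closed, and a
 * snapshot load discards all state built from earlier handleCommit calls. The
 * RecordType and StateMachineListener names are hypothetical; Batch is the
 * org.apache.kafka.raft.Batch type yielded by both readers.
 *
 *   class StateMachineListener implements RaftClient.Listener<RecordType> {
 *       private final List<RecordType> state = new ArrayList<>();
 *
 *       @Override
 *       public void handleCommit(BatchReader<RecordType> reader) {
 *           try (reader) {                       // close the reader when done
 *               while (reader.hasNext()) {
 *                   Batch<RecordType> batch = reader.next();
 *                   state.addAll(batch.records());
 *               }
 *           }
 *       }
 *
 *       @Override
 *       public void handleLoadSnapshot(SnapshotReader<RecordType> reader) {
 *           try (reader) {
 *               state.clear();                   // prior commits are now invalid
 *               while (reader.hasNext()) {
 *                   state.addAll(reader.next().records());
 *               }
 *           }
 *       }
 *   }
 */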
/**
* Register a listener to get commit, snapshot and leader notifications.
*
* The implementation of this interface assumes that each call to {@code register} uses
* a different {@code Listener} instance. If the same instance is used for multiple calls
* to this method, then only one {@code Listener} will be registered.
*
* @param listener the listener to register
*/
void register(Listener<T> listener);
/**
* Unregisters a listener.
*
* To distinguish events delivered before a call to {@code unregister} from those delivered
* after a subsequent call to {@code register}, a different {@code Listener} instance must be used.
*
* If the {@code Listener} provided was never registered then the unregistration is ignored.
*
* @param listener the listener to unregister
*/
void unregister(Listener<T> listener);
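/*
 * Usage sketch for the registration lifecycle. Per the contract above, a listener
 * re-registered after an unregister should be a fresh instance so that stale
 * notifications can be told apart from new ones. StateMachineListener is the
 * hypothetical implementation sketched earlier.
 *
 *   StateMachineListener listener = new StateMachineListener();
 *   raftClient.register(listener);
 *   // ... consume commit, snapshot, and leader-change notifications ...
 *   raftClient.unregister(listener);
 *   raftClient.register(new StateMachineListener());  // fresh instance, never reused
 */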
/**
* Returns the current high watermark, or {@code OptionalLong.empty()} if it is not known.
*/
OptionalLong highWatermark();
/**
* Return the current {@link LeaderAndEpoch}.
*
* @return the current leader and epoch
*/
LeaderAndEpoch leaderAndEpoch();
/**
* Get local nodeId if one is defined. This may be absent when the client is used
* as an anonymous observer, as in the case of the metadata shell.
*
* @return optional node id
*/
OptionalInt nodeId();
/**
* Append a list of records to the log. The write will be scheduled for some time
* in the future. There is no guarantee that appended records will be written to
* the log and eventually committed. While the order of the records is preserved, they can
* be appended to the log using one or more batches. Each record may be committed independently.
* If a record is committed, then all records scheduled for append during this epoch
* and prior to this record are also committed.
*
* If the provided current leader epoch does not match the current epoch, which
* is possible when the state machine has yet to observe the epoch change, then
* this method will throw a {@link NotLeaderException} to indicate that the leader
* should resign its leadership. The state machine is expected to discard all
* uncommitted entries after observing an epoch change.
*
* @param epoch the current leader epoch
* @param records the list of records to append
* @return the expected offset of the last record if the append succeeds
* @throws org.apache.kafka.common.errors.RecordBatchTooLargeException if the size of the records is greater than the maximum
* batch size; if this exception is thrown, none of the given records were
* committed
* @throws NotLeaderException if we are not the current leader or the epoch doesn't match the leader epoch
* @throws BufferAllocationException if we failed to allocate memory for the records
*/
long scheduleAppend(int epoch, List<T> records);
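/*
 * Usage sketch for scheduleAppend, assuming a client parameterized with the
 * hypothetical RecordType and a prepared List<RecordType> records. The epoch comes
 * from leaderAndEpoch(); a NotLeaderException means this node should stop acting
 * as leader for that epoch.
 *
 *   LeaderAndEpoch current = raftClient.leaderAndEpoch();
 *   try {
 *       long lastOffset = raftClient.scheduleAppend(current.epoch(), records);
 *       // records are only durable once handleCommit() delivers them back
 *   } catch (NotLeaderException e) {
 *       // stale epoch or not the leader: discard uncommitted state and retry later
 *   }
 */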
/**
* Append a list of records to the log. The write will be scheduled for some time
* in the future. There is no guarantee that appended records will be written to
* the log and eventually committed. However, it is guaranteed that if any of the
* records become committed, then all of them will be.
*
* If the provided current leader epoch does not match the current epoch, which
* is possible when the state machine has yet to observe the epoch change, then
* this method will throw a {@link NotLeaderException} to indicate that the leader
* should resign its leadership. The state machine is expected to discard all
* uncommitted entries after observing an epoch change.
*
* If the current base offset does not match the supplied required base offset,
* then this method will throw {@link UnexpectedBaseOffsetException}.
*
* @param epoch the current leader epoch
* @param requiredBaseOffset if this is set, it is the offset we must use as the base offset.
* @param records the list of records to append
* @return the expected offset of the last record if the append succeeds
* @throws org.apache.kafka.common.errors.RecordBatchTooLargeException if the size of the records is greater than the maximum
* batch size; if this exception is thrown, none of the given records were
* committed
* @throws NotLeaderException if we are not the current leader or the epoch doesn't match the leader epoch
* @throws BufferAllocationException if we failed to allocate memory for the records
* @throws UnexpectedBaseOffsetException if the requested base offset could not be obtained
*/
long scheduleAtomicAppend(int epoch, OptionalLong requiredBaseOffset, List<T> records);
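/*
 * Usage sketch for scheduleAtomicAppend: either every record in the call commits
 * or none does, and the batch can be pinned to a required base offset. Variable
 * names are the same hypothetical ones as in the scheduleAppend sketch.
 *
 *   try {
 *       raftClient.scheduleAtomicAppend(current.epoch(),
 *           OptionalLong.of(expectedBaseOffset), records);
 *   } catch (UnexpectedBaseOffsetException e) {
 *       // the log end offset moved; re-read state before retrying
 *   } catch (NotLeaderException e) {
 *       // handle exactly as for scheduleAppend
 *   }
 */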
/**
* Attempt a graceful shutdown of the client. This allows the leader to proactively
* resign and help a new leader to get elected rather than forcing the remaining
* voters to wait for the fetch timeout.
*
* Note that if the client has hit an unexpected exception which has left it in an
* indeterminate state, then the call to shutdown should be skipped. However, it
* is still expected that {@link #close()} will be used to clean up any resources
* in use.
*
* @param timeoutMs How long to wait for graceful completion of pending operations.
* @return A future which is completed when shutdown completes successfully or the timeout expires.
*/
CompletableFuture<Void> shutdown(int timeoutMs);
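/*
 * Usage sketch for a graceful shutdown. The returned future completes when the
 * shutdown finishes or the timeout expires; close() should still be called
 * afterwards (it is declared by AutoCloseable and may throw).
 *
 *   try {
 *       raftClient.shutdown(5000).join();  // wait up to the 5 second timeout
 *   } finally {
 *       raftClient.close();                // always release remaining resources
 *   }
 */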
/**
* Resign the leadership. The leader will give up its leadership in the passed epoch
* (if it matches the current epoch), and a new election will be held. Note that nothing
* prevents this node from being reelected as the leader.
*
* Notification of successful resignation can be observed through
* {@link Listener#handleLeaderChange(LeaderAndEpoch)}.
*
* @param epoch the epoch to resign from. If this epoch is smaller than the current epoch, this
* call will be ignored.
*
* @throws IllegalArgumentException if the passed epoch is invalid (negative or greater than the
* current epoch), or if the listener is not the leader associated with this epoch.
*/
void resign(int epoch);
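/*
 * Usage sketch for resign, e.g. to shed leadership before maintenance. Assumes
 * that comparing the OptionalInt values is sufficient to detect local leadership.
 *
 *   OptionalInt localId = raftClient.nodeId();
 *   LeaderAndEpoch current = raftClient.leaderAndEpoch();
 *   if (localId.isPresent() && current.leaderId().equals(localId)) {
 *       raftClient.resign(current.epoch());  // a new election will follow
 *   }
 */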
/**
* Create a writable snapshot file for a committed offset and epoch.
*
* The RaftClient assumes that the snapshot returned will contain the records up to, but not
* including, the committed offset and epoch. If no records have been committed, it is possible
* to generate an empty snapshot using 0 for both the offset and epoch.
*
* See {@link SnapshotWriter} for details on how to use this object. If a snapshot already
* exists, this method returns {@link Optional#empty()}.
*
* @param snapshotId The ID of the new snapshot, which includes the (exclusive) last committed offset
* and the last committed epoch.
* @param lastContainedLogTime The append time of the highest record contained in this snapshot
* @return a writable snapshot if it doesn't already exist
* @throws IllegalArgumentException if the committed offset is greater than the high-watermark
* or less than the log start offset.
*/
Optional<SnapshotWriter<T>> createSnapshot(OffsetAndEpoch snapshotId, long lastContainedLogTime);
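/*
 * Usage sketch for snapshot generation, following the SnapshotWriter pattern of
 * appending records and then freezing the snapshot. committedOffset,
 * committedEpoch, lastContainedLogTime, and stateRecords are hypothetical values
 * supplied by the state machine.
 *
 *   OffsetAndEpoch snapshotId = new OffsetAndEpoch(committedOffset, committedEpoch);
 *   raftClient.createSnapshot(snapshotId, lastContainedLogTime).ifPresent(writer -> {
 *       try (writer) {
 *           writer.append(stateRecords);  // one or more lists of records
 *           writer.freeze();              // make the snapshot immutable and visible
 *       }
 *   });
 */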
/**
* The snapshot id for the latest snapshot.
*
* Returns the snapshot id of the latest snapshot, if it exists. If no snapshot exists, this
* method returns {@link Optional#empty()}.
*
* @return the id of the latest snapshot, if it exists
*/
Optional<OffsetAndEpoch> latestSnapshotId();
/**
* Returns the current end of the log. This method is thread-safe.
*
* @return the log end offset, which is one greater than the offset of the last record written,
* or 0 if there have not been any records written.
*/
long logEndOffset();
}