All downloads are free. Search and download functionality uses the official Maven repository.

org.jgroups.protocols.raft.RAFT Maven / Gradle / Ivy

The newest version!
package org.jgroups.protocols.raft;

import org.jgroups.Address;
import org.jgroups.BytesMessage;
import org.jgroups.EmptyMessage;
import org.jgroups.Event;
import org.jgroups.JChannel;
import org.jgroups.Message;
import org.jgroups.ObjectMessage;
import org.jgroups.View;
import org.jgroups.annotations.MBean;
import org.jgroups.annotations.ManagedAttribute;
import org.jgroups.annotations.ManagedOperation;
import org.jgroups.annotations.Property;
import org.jgroups.conf.AttributeType;
import org.jgroups.conf.ClassConfigurator;
import org.jgroups.protocols.raft.state.RaftState;
import org.jgroups.raft.Options;
import org.jgroups.raft.Settable;
import org.jgroups.raft.StateMachine;
import org.jgroups.raft.util.CommitTable;
import org.jgroups.raft.util.LogCache;
import org.jgroups.raft.util.RequestTable;
import org.jgroups.stack.Protocol;
import org.jgroups.util.AverageMinMax;
import org.jgroups.util.ByteArrayDataInputStream;
import org.jgroups.util.ByteArrayDataOutputStream;
import org.jgroups.util.DefaultThreadFactory;
import org.jgroups.util.ExtendedUUID;
import org.jgroups.util.MessageBatch;
import org.jgroups.util.Runner;
import org.jgroups.util.Util;

import java.io.File;
import java.net.InetAddress;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Function;
import java.util.function.ObjLongConsumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Implementation of the RAFT consensus protocol in JGroups.
 * <p>
 * [1] https://github.com/ongardie/dissertation
 * <p>
 * The implementation uses a queue to which the following types of requests are added: down-requests
 * (invocations of {@link #setAsync(byte[], int, int)}) and up-requests (requests or responses received in
 * {@link #up(Message)} or {@link #up(MessageBatch)}).
 * <p>
 * Leaders handle down-requests (resulting in sending AppendEntriesRequests) and up-requests (responses).
 * Followers handle only up-requests (AppendEntriesRequests) and send responses. Note that the periodic sending of
 * AppendEntriesRequests (if needed) is also done by the queue handling thread.
 * <p>
* The use of the queue makes the RAFT protocol effectively single-threaded; ie. only 1 thread ever changes * state, so synchronization can be removed altogether. The only exception to this is invocation of * {@link #changeRole(Role)}, called by {@link ELECTION}: this still needs to be changed (probably by adding it as an * event to the queue, too). * @author Bela Ban * @since 0.1 */ @MBean(description="Implementation of the RAFT consensus protocol") public class RAFT extends Protocol implements Settable, DynamicMembership { public static final byte[] raft_id_key = Util.stringToBytes("raft-id"); protected static final short RAFT_ID = 521; protected static final short APPEND_ENTRIES_REQ = 2000; protected static final short APPEND_ENTRIES_RSP = 2001; protected static final short APPEND_RESULT = 2002; protected static final short INSTALL_SNAPSHOT_REQ = 2003; protected static final short LOG_ENTRIES = 2004; public static final Function print_function=uuid -> { byte[] val=uuid.get(raft_id_key); return val != null? Util.bytesToString(val) : uuid.print(); }; static { ClassConfigurator.addProtocol(RAFT_ID, RAFT.class); ClassConfigurator.add(APPEND_ENTRIES_REQ, AppendEntriesRequest.class); ClassConfigurator.add(APPEND_ENTRIES_RSP, AppendEntriesResponse.class); ClassConfigurator.add(APPEND_RESULT, AppendResult.class); ClassConfigurator.add(INSTALL_SNAPSHOT_REQ, InstallSnapshotRequest.class); ClassConfigurator.add(LOG_ENTRIES, LogEntries.class); } @Property(description="The identifier of this node. Needs to be unique and an element of members. 
Must not be null", writable=false) protected String raft_id; protected final PersistentState internal_state=new PersistentState(); protected final RaftState raft_state = new RaftState(this, this::leaderUpdated); @ManagedAttribute(description="Majority needed to achieve consensus; computed from members)") protected int majority=-1; @Property(description="If true, we can change 'members' at runtime") protected boolean dynamic_view_changes=true; @Property(description="The fully qualified name of the class implementing Log") protected String log_class="org.jgroups.protocols.raft.LevelDBLog"; @Property(description="Arguments to the log impl, e.g. k1=v1,k2=v2. These will be passed to init()") protected String log_args; @Property(description="The directory in which the log and snapshots are stored. Defaults to the temp dir") protected String log_dir=Util.checkForMac()? File.separator + "tmp" : System.getProperty("java.io.tmpdir", File.separator + "tmp"); @Property(description="The prefix of the log and snapshot. If null, the logical name of the channel is used as prefix") protected String log_prefix; @ManagedAttribute(description="The name of the log") protected String log_name; @Property(description="Interval (ms) at which AppendEntries messages are resent to members with missing log entries", type=AttributeType.TIME) protected long resend_interval=1000; @Property(description="Send commit message to followers immediately after leader commits (majority has consensus). 
" + "Caution : it may generate more traffic than expected") protected boolean send_commits_immediately; @Property(description="Max number of bytes a log can have until a snapshot is created",type=AttributeType.BYTES) protected int max_log_size=1_000_000; protected int _max_log_cache_size=1024; protected boolean _log_use_fsync; @ManagedAttribute(description="The current size of the log in bytes",type=AttributeType.BYTES) protected long curr_log_size; // keeps counts of the bytes added to the log @ManagedAttribute(description="Number of successful AppendEntriesRequests") protected int num_successful_append_requests; @ManagedAttribute(description="Number of snapshot messages received (by a follower)") protected int num_snapshot_received; @ManagedAttribute(description="Average AppendEntries batch size") protected AverageMinMax avg_append_entries_batch_size=new AverageMinMax(); @ManagedAttribute(description="Number of failed AppendEntriesRequests because the entry wasn't found in the log") protected int num_failed_append_requests_not_found; @ManagedAttribute(description="Number of failed AppendEntriesRequests because the prev entry's term didn't match") protected int num_failed_append_requests_wrong_term; protected StateMachine state_machine; protected boolean state_machine_loaded; protected Log log_impl; protected RequestTable request_table; protected CommitTable commit_table; protected final List role_change_listeners=new ArrayList<>(); // Set to true during an addServer()/removeServer() op until the change has been committed // protected final AtomicBoolean members_being_changed = new AtomicBoolean(false); /** The current role (follower, candidate or leader). 
Every node starts out as a follower */ protected volatile RaftImpl impl=new Follower(this); protected volatile View view; @ManagedAttribute(description="Index of the highest log entry appended to the log",type=AttributeType.SCALAR) protected long last_appended; @ManagedAttribute(description="Index of the last committed log entry",type=AttributeType.SCALAR) protected long commit_index; @ManagedAttribute(description="The number of snapshots performed") protected int num_snapshots; @ManagedAttribute(description="The number of times AppendEntriesRequests were resent") protected int num_resends; @Property(description="Max size in items the processing queue can have",type=AttributeType.SCALAR) protected int processing_queue_max_size=9182; /** All requests are added to this queue; a single thread processes this queue - hence no synchronization issues */ protected BlockingQueue processing_queue; protected final List remove_queue=new ArrayList<>(); protected Runner runner; // the single thread processing the request queue protected boolean synchronous; // used by the synchronous execution framework (only for testing) // used to add/remove servers one-by-one protected CompletableFuture add_server_future=CompletableFuture.completedFuture(null); /* ============================== EXPERIMENTAL - most of these metrics will be removed again ================== */ @ManagedAttribute(description="Size of remove-queue") public int removeQueueSize() { return remove_queue.size(); } @ManagedAttribute(description="Size of processing queue") public int processingQueueSize() { return processing_queue.size(); } @ManagedAttribute final LongAdder drained_total=new LongAdder(); @ManagedAttribute final AverageMinMax drained_avg=new AverageMinMax(); @ManagedAttribute final LongAdder drained_down=new LongAdder(), drained_up=new LongAdder(); @ManagedAttribute public String drainRatio() { double down=(double)drained_down.sum() / drained_total.sum(); double up=(double)drained_up.sum() / 
drained_total.sum(); return String.format("down=%.2f up=%.2f", down, up); } /* ============================================================================================================ */ public String raftId() {return raft_id;} public RAFT raftId(String id) {if(id != null) this.raft_id=id; return this;} public RaftImpl impl() {return impl;} public int majority() {return majority;} public String logClass() {return log_class;} public RAFT logClass(String clazz) {log_class=clazz; return this;} public String logArgs() {return log_args;} public RAFT logArgs(String args) {log_args=args; return this;} public String logPrefix() {return log_prefix;} public RAFT logPrefix(String name) {log_prefix=name; return this;} public String logName() {return log_name;} public long resendInterval() {return resend_interval;} public RAFT resendInterval(long val) {resend_interval=val; return this;} public boolean sendCommitsImmediately() {return send_commits_immediately;} public RAFT sendCommitsImmediately(boolean v) {send_commits_immediately=v; return this;} public int maxLogSize() {return max_log_size;} public RAFT maxLogSize(int val) {max_log_size=val; return this;} public long currentLogSize() {return curr_log_size;} @ManagedAttribute(description="Number of pending requests") public int requestTableSize() {return request_table != null? request_table.size() : 0;} public int numSnapshots() {return num_snapshots;} @ManagedAttribute(description="The current leader (can be null if there is currently no leader) ") public Address leader() {return raft_state.leader();} public RAFT leader(Address new_leader) {this.raft_state.setLeader(new_leader); return this;} public boolean isLeader() {return Objects.equals(leader(), local_addr);} public RAFT stateMachine(StateMachine sm) {this.state_machine=sm; return this;} public StateMachine stateMachine() {return state_machine;} public CommitTable commitTable() {return commit_table;} @ManagedAttribute(description="The current term. 
Incremented on leader change, or when a higher term is seen") public long currentTerm() {return raft_state.currentTerm();} @ManagedAttribute(description="The member this member voted for in the current term") public Address votedFor() {return raft_state.votedFor();} public long lastAppended() {return last_appended;} public long commitIndex() {return commit_index;} public Log log() {return log_impl;} public RAFT log(Log new_log) {this.log_impl=new_log; return this;} public RAFT addRoleListener(RoleChange c) {this.role_change_listeners.add(c); return this;} public RAFT remRoleListener(RoleChange c) {this.role_change_listeners.remove(c); return this;} public RAFT stateMachineLoaded(boolean b) {this.state_machine_loaded=b; return this;} public boolean synchronous() {return synchronous;} public RAFT synchronous(boolean b) {synchronous=b; return this;} public RAFT logDir(String logDir) { this.log_dir = logDir; return this; } public void resetStats() { super.resetStats(); num_snapshots=num_resends=num_successful_append_requests=num_failed_append_requests_not_found =num_failed_append_requests_wrong_term=num_snapshot_received=0; if(log_impl instanceof LogCache) ((LogCache)log_impl).resetStats(); drained_total.reset(); drained_avg.clear(); drained_down.reset(); drained_up.reset(); avg_append_entries_batch_size.clear(); } @Property(description="Max size of the log cache (0 disables the log cache)",type=AttributeType.BYTES) public int maxLogCacheSize() { return _max_log_cache_size; } @Property public RAFT maxLogCacheSize(int size) { _max_log_cache_size=size; if(log_impl == null) // initial configuration return this; if(log_impl instanceof LogCache) ((LogCache)log_impl).maxSize(size); else { if(size <= 0) disableLogCache(); else enableLogCache(); } return this; } @Property(description="If true, a change is guaranteed to be written to disk when the call returns") public RAFT logUseFsync(boolean b) {_log_use_fsync=b; if(log_impl != null) log_impl.useFsync(b); return this;} 
/**
 * Returns whether the log syncs changes to disk. Falls back to the configured value while no log exists yet,
 * mirroring the null check done by the setter.
 */
@Property
public boolean logUseFsync() {
    return log_impl != null? log_impl.useFsync() : _log_use_fsync;
}

/** Number of trims performed by the log cache, or 0 if the log is not wrapped in a {@link LogCache}. */
@ManagedAttribute(description="Number of times the log cache has been trimmed",type=AttributeType.SCALAR)
public int logCacheNumTrims() {
    return log_impl instanceof LogCache? ((LogCache)log_impl).numTrims() : 0;
}

/** Number of accesses to the log cache, or 0 if the log is not wrapped in a {@link LogCache}. */
@ManagedAttribute(description="Number of times the cache has been accessed",type=AttributeType.SCALAR)
public int LogCacheNumAccesses() {
    return log_impl instanceof LogCache? ((LogCache)log_impl).numAccesses() : 0;
}

/** Hit ratio of the log cache, or 0 if the log is not wrapped in a {@link LogCache}. */
@ManagedAttribute(description="Hit ratio of the cache")
public double logCacheHitRatio() {
    return log_impl instanceof LogCache? ((LogCache)log_impl).hitRatio() : 0;
}

/** Parses a comma-delimited list of member names and installs it via {@link #members(Collection)}. */
@Property(description="List of members (logical names); majority is computed from it")
public void setMembers(String list) {
    members(Util.parseCommaDelimitedStrings(list));
}

/** Replaces the member list and recomputes the majority. */
public RAFT members(Collection<String> list) {
    internal_state.setMembers(list);
    computeMajority();
    return this;
}

/** Returns the member list held by the internal state. */
@ManagedAttribute(description = "The current list of members")
public List<String> members() {
    return internal_state.getMembers();
}

/**
 * Sets current_term if new_term is bigger
 * @param new_term The new term
 * @return -1 if new_term is smaller, 0 if equal and 1 if new_term is bigger
 */
public int currentTerm(final long new_term) {
    return raft_state.tryAdvanceTerm(new_term);
}

/** Records the member voted for in the Raft state. */
public RAFT votedFor(Address mbr) {
    raft_state.setVotedFor(mbr);
    return this;
}

/** Simple class name of the current role implementation. */
@ManagedAttribute(description="The current role")
public String role() {return impl.getClass().getSimpleName();}

/** String form of the commit table, or "n/a" if there is none. */
@ManagedOperation(description="Dumps the commit table")
public String dumpCommitTable() {return commit_table != null? "\n" + commit_table : "n/a";}

/** Number of entries as reported by the log. */
@ManagedAttribute(description="Number of log entries in the log")
public long logSize() {return log_impl.size();}

/** Describes the log implementation; for a {@link LogCache} also the cache fill level and the wrapped log. */
@ManagedAttribute(description="Describes the log")
public String logDescription() {
    if(log_impl instanceof LogCache) {
        LogCache lc=(LogCache)log_impl;
        return String.format("%s (%d/%d) -> %s", lc.getClass().getSimpleName(), lc.cacheSize(), lc.maxSize(),
                             lc.log().getClass().getSimpleName());
    }
    return log_impl.getClass().getSimpleName();
}

/** This is a managed operation because it should invoked sparingly (costly) */
@ManagedOperation(description="Number of bytes in the log")
public long logSizeInBytes() {
    return log_impl.sizeInBytes();
}

/** Dumps index, term and command length of the entries in [max(1, last_appended - last_n) .. last_appended]. */
@ManagedOperation(description="Dumps the last N log entries")
public String dumpLog(long last_n) {
    final StringBuilder sb=new StringBuilder();
    long to=last_appended, from=Math.max(1, to-last_n);
    log_impl.forEach((entry,index) -> sb.append("index=").append(index).append(", term=").append(entry.term()).append(" (")
                       .append(entry.command().length).append(" bytes)\n"),
                     from, to);
    return sb.toString();
}

/** Dumps all entries by delegating to {@link #dumpLog(long)} with last_appended - 1. */
@ManagedOperation(description="Dumps all log entries")
public String dumpLog() {return dumpLog(last_appended - 1);}

/** Wraps the log in a {@link LogCache} unless it already is one; requires a positive max_log_cache_size. */
@ManagedOperation(description="Enabled the log cache")
public void enableLogCache() {
    if(!(log_impl instanceof LogCache)) {
        if(_max_log_cache_size <= 0)
            log.error("cannot enable log cache as max_log_cache_size is 0");
        else
            log_impl=new LogCache(log_impl, _max_log_cache_size);
    }
}

/** Replaces a {@link LogCache} with the log it wraps and clears the cache. */
@ManagedOperation(description="Disables the log cache")
public void disableLogCache() {
    if(log_impl instanceof LogCache) {
        LogCache lc=(LogCache)log_impl;
        log_impl=lc.log();
        lc.clear();
    }
}

/** Clears the log cache if one is in use. */
@ManagedOperation(description="Clears the log cache")
public RAFT clearLogCache() {
    if(log_impl instanceof LogCache)
        ((LogCache)log_impl).clear();
    return this;
}

/** Trims the log cache if one is in use. */
@ManagedOperation(description="Trims the log cache to max_log_cache_size")
public RAFT trimLogCache() {
    if(log_impl instanceof LogCache)
        ((LogCache)log_impl).trim();
    return this;
}

/** Passes every log entry (and its index) to func. */
public void logEntries(ObjLongConsumer<LogEntry> func) {
    log_impl.forEach(func);
}

/** Advances the term for an election and returns the value produced by the Raft state. */
public long createNewTerm() {
    return raft_state.advanceTermForElection();
}

/**
 * Walks the protocol stack from start (downwards if down is true, else upwards) and returns the first protocol
 * that is an instance of clazz.
 * @return The matching protocol, or null if clazz is null or no protocol matches
 */
@SuppressWarnings("unchecked")
public static <T> T findProtocol(Class<T> clazz, final Protocol start, boolean down) {
    Protocol prot=start;
    while(prot != null && clazz != null) {
        if(clazz.isAssignableFrom(prot.getClass()))
            return (T)prot;
        prot=down? prot.getDownProtocol() : prot.getUpProtocol();
    }
    return null;
}

/** Submits an addServer membership change; see {@link #changeMembers(String, InternalCommand.Type)}. */
@ManagedOperation(description="Adds a new server to members. Prevents duplicates")
public CompletableFuture<byte[]> addServer(String name) throws Exception {
    return changeMembers(name, InternalCommand.Type.addServer);
}

/** Submits a removeServer membership change; see {@link #changeMembers(String, InternalCommand.Type)}. */
@ManagedOperation(description="Removes a new server from members")
public CompletableFuture<byte[]> removeServer(String name) throws Exception {
    return changeMembers(name, InternalCommand.Type.removeServer);
}

/** Creates a snapshot and truncates the log. See https://github.com/belaban/jgroups-raft/issues/7 for details */
@ManagedOperation(description="Creates a new snapshot and truncates the log")
public void snapshot() throws Exception {
    snapshotAsync().get();
}

/** Enqueues a snapshot request; the returned future fails if the processing queue is full. */
public CompletableFuture<Void> snapshotAsync() {
    CompletableFuture<Void> f = new CompletableFuture<>();
    offer(new SnapshotRequest(f));
    return f;
}

/** Loads the log entries from [first .. commit_index] into the state machine */
@ManagedOperation(description="Reads the snapshot (if present) and loads log entries from [first .. commit_index] " +
  "into the state machine")
public void initStateMachineFromLog() throws Exception {
    if(state_machine == null || state_machine_loaded)
        return;
    int snapshot_offset=0; // 0 when no snapshot is present, 1 otherwise
    ByteBuffer sn=log_impl.getSnapshot();
    if(sn != null) {
        ByteArrayDataInputStream in=new ByteArrayDataInputStream(sn);
        internal_state.readFrom(in);
        state_machine.readContentFrom(in);
        snapshot_offset=1;
        log.debug("%s: initialized state machine from snapshot (%d bytes)", local_addr, sn.position());
    }

    long from=Math.max(1, log_impl.firstAppended()+snapshot_offset), to=commit_index, count=0;
    for(long i=from; i <= to; i++) {
        LogEntry log_entry=log_impl.get(i);
        if(log_entry == null) {
            log.error("%s: log entry for index %d not found in log", local_addr, i);
            break;
        }
        if(log_entry.command != null) {
            if(log_entry.internal)
                executeInternalCommand(null, log_entry.command, log_entry.offset, log_entry.length);
            else {
                state_machine.apply(log_entry.command, log_entry.offset, log_entry.length, true);
                count++;
            }
        }
    }
    state_machine_loaded=true;
    if(count > 0)
        log.debug("%s: applied %d entries from the log (%d - %d) to the state machine", local_addr, count, from, to);
}

/**
 * Removes duplicate members, computes the majority, defaults raft_id to the local host name, registers an
 * address generator that stores the raft_id in the generated ExtendedUUID, and creates the processing queue
 * and its runner.
 */
@Override
public void init() throws Exception {
    super.init();
    // we can only add/remove 1 member at a time (section 4.1 of [1])
    Set<String> tmp=new HashSet<>(internal_state.getMembers());
    if(tmp.size() != internal_state.getMembers().size()) {
        log.error("members (%s) contains duplicates; removing them and setting members to %s",
                  internal_state.getMembers(), tmp);
        internal_state.setMembers(new ArrayList<>(tmp));
    }
    computeMajority();
    if(raft_id == null)
        raft_id=InetAddress.getLocalHost().getHostName();

    // Set an AddressGenerator in channel which generates ExtendedUUIDs and adds the raft_id to the hashmap
    JChannel ch=stack != null? stack.getChannel() : null;
    if(ch != null) {
        ch.addAddressGenerator(() -> {
            ExtendedUUID.setPrintFunction(print_function);
            return ExtendedUUID.randomUUID(ch.getName()).put(raft_id_key, Util.stringToBytes(raft_id));
        });
    }
    processing_queue=new ArrayBlockingQueue<>(processing_queue_max_size);
    runner=new Runner(new DefaultThreadFactory("runner", true, true), "runner", this::processQueue, null);
}

/**
 * Creates and initializes the log (unless one was injected), reloads the persisted state, replays the log into
 * the state machine, optionally wraps the log in a cache and starts the queue runner.
 * @throws IllegalStateException If log_class is undefined, the local address is not an ExtendedUUID, or raft_id
 *                               is not listed in members
 */
@Override
public void start() throws Exception {
    super.start();

    if(log_impl == null) {
        if(log_class == null)
            throw new IllegalStateException("log_class has to be defined");
        Class<?> clazz=Util.loadClass(log_class, getClass());
        log_impl=(Log)clazz.getDeclaredConstructor().newInstance();
        Map<String,String> args;
        if(log_args != null && !log_args.isEmpty())
            args=parseCommaDelimitedProps(log_args);
        else
            args=new HashMap<>();

        if(log_prefix == null)
            log_prefix=raft_id;
        log_name=createLogName(log_prefix, "log");
        log_impl.init(log_name, args);
    }

    if(!(local_addr instanceof ExtendedUUID)) {
        // instanceof is also false for null: don't dereference local_addr while building the message
        throw new IllegalStateException("local address must be an ExtendedUUID but is a "
                                          + (local_addr == null? "null" : local_addr.getClass().getSimpleName()));
    }

    last_appended=log_impl.lastAppended();
    commit_index=log_impl.commitIndex();
    raft_state.reload();
    log.trace("%s: set last_appended=%d, commit_index=%d, current_state=%s", local_addr, last_appended, commit_index, raft_state);
    initStateMachineFromLog();
    if(!internal_state.getMembers().contains(raft_id))
        throw new IllegalStateException(String.format("raft-id %s is not listed in members %s", raft_id, internal_state.getMembers()));
    curr_log_size=logSizeInBytes();
    log_impl.useFsync(_log_use_fsync);
    if(_max_log_cache_size > 0) // the log cache is enabled
        log_impl=new LogCache(log_impl, _max_log_cache_size);
    runner.start();
}

/** Completes the pending membership-change future, stops the runner, destroys the role and closes the log. */
@Override
public void stop() {
    super.stop();
    add_server_future.complete(null);
    runner.stop();
    RaftImpl ri=impl;
    if(ri != null) // no role may have been installed; the log must still be closed below
        ri.destroy();
    Util.close(log_impl);
}

/** Handles view changes, then passes the event down. */
public Object down(Event evt) {
    if(evt.getType() == Event.VIEW_CHANGE)
        handleView(evt.getArg());
    return down_prot.down(evt);
}

/** Handles view changes, then passes the event up. */
public Object up(Event evt) {
    if(evt.getType() == Event.VIEW_CHANGE)
        handleView(evt.getArg());
    return up_prot.up(evt);
}

/**
 * Consumes messages carrying a {@link RaftHeader} (handled inline when synchronous, otherwise queued); all
 * other messages are passed up.
 */
public Object up(Message msg) {
    RaftHeader hdr=msg.getHeader(id);
    if(hdr != null) {
        if(synchronous)
            handleUpRequest(msg, hdr);
        else
            add(new UpRequest(msg, hdr));
        return null;
    }
    return up_prot.up(msg);
}

/** Removes and handles all messages carrying a {@link RaftHeader}; passes the rest of the batch up. */
public void up(MessageBatch batch) {
    for(Iterator<Message> it=batch.iterator(); it.hasNext();) {
        Message msg=it.next();
        RaftHeader hdr=msg.getHeader(id);
        if(hdr != null) {
            it.remove();
            if(synchronous)
                handleUpRequest(msg, hdr);
            else
                add(new UpRequest(msg, hdr));
        }
    }
    if(!batch.isEmpty())
        up_prot.up(batch);
}

/** Invokes {@link #sendAppendEntriesMessage(Address, CommitTable.Entry)} for every commit table entry. */
@ManagedOperation(description="Sends all pending AppendEntriesRequests")
public void flushCommitTable() {
    if(commit_table != null)
        commit_table.forEach(this::sendAppendEntriesMessage);
}

/**
 * Triggers a flush of the entries to the given member.
 *
 * @param member The not-null member address to send the entries.
 * @throws IllegalStateException Thrown in case the current node is not the leader.
 * @throws NullPointerException Thrown in case {@code member} is null.
 */
public void flushCommitTable(Address member) {
    if(!isLeader())
        throw new IllegalStateException("Currently not the leader, should be " + leader());
    CommitTable.Entry e=commit_table.get(Objects.requireNonNull(member));
    if(e != null)
        sendAppendEntriesMessage(member, e);
}

/**
 * Called by a building block to apply a change to all state machines in a cluster. This starts the consensus
 * protocol to get a majority to commit this change.
 * <p>
 * This call is non-blocking and returns a future as soon as the AppendEntries message has been sent.
 * <p>
 * Only applicable on the leader
 * @param buf The command
 * @param offset The offset into the buffer
 * @param length The length of the buffer
 * @return A CompletableFuture. Can be used to wait for the result (sync). A blocking caller could call
 *         set(), then call future.get() to block for the result.
 */
public CompletableFuture<byte[]> setAsync(byte[] buf, int offset, int length, Options options) {
    return setAsync(buf, offset, length, false, options);
}

/**
 * Same as {@link #setAsync(byte[], int, int, Options)}, additionally marking the entry as internal.
 * @throws IllegalStateException If this member is not the current leader
 * @throws IllegalArgumentException If buf is null
 */
public CompletableFuture<byte[]> setAsync(byte[] buf, int offset, int length, boolean internal, Options options) {
    Address leader = leader();
    if(leader == null || (local_addr != null && !leader.equals(local_addr)))
        throw notCurrentLeader();
    if(buf == null)
        throw new IllegalArgumentException("buffer must not be null");
    CompletableFuture<byte[]> retval=new CompletableFuture<>();
    RequestTable<String> reqtab=request_table;
    if(reqtab == null) {
        retval.completeExceptionally(new IllegalStateException("request table was null on " + impl.getClass().getSimpleName()));
        return retval;
    }
    if(synchronous) // set only for testing purposes
        handleDownRequest(retval, buf, offset, length, internal, options);
    else {
        offer(new DownRequest(retval, buf, offset, length, internal, options)); // will call handleDownRequest()
    }
    return retval; // 4. Return CompletableFuture
}

public String toString() {
    return String.format("%s %s: commit=%d last-appended=%d curr-state=%s",
                         RAFT.class.getSimpleName(), local_addr, commit_index, last_appended, raft_state);
}

/** Adds r to the processing queue, blocking while the queue is full; fails r if interrupted. */
protected void add(Request r) {
    try {
        processing_queue.put(r);
    }
    catch(InterruptedException ex) {
        Thread.currentThread().interrupt(); // keep the interrupt status visible to the calling thread
        log.error("%s: failed adding %s to processing queue: %s", local_addr, r, ex);
        r.failed(ex);
    }
}

/** Adds r to the processing queue without blocking; fails r if the queue is full. */
protected void offer(Request r) {
    if(!processing_queue.offer(r)) {
        r.failed(new IllegalStateException("processing queue is full"));
    }
}

/** This method is always called by a single thread only, and does therefore not need to be reentrant */
protected void handleDownRequest(CompletableFuture<byte[]> f, byte[] buf, int offset, int length,
                                 boolean internal, Options opts) {
    Address leader = leader();
    if(leader == null || !Objects.equals(leader,local_addr))
        throw notCurrentLeader();

    RequestTable<String> reqtab=request_table;

    // 1. Append to the log
    long prev_index=last_appended;
    long curr_index=++last_appended;
    long current_term=currentTerm();
    LogEntry entry=log_impl.get(prev_index);
    long prev_term=entry != null? entry.term : 0;

    LogEntries entries=new LogEntries().add(new LogEntry(current_term, buf, offset, length, internal));
    last_appended=log_impl.append(curr_index, entries);
    num_successful_append_requests+=entries.size();

    // 2. Add the request to the client table, so we can return results to clients when done
    reqtab.create(curr_index, raft_id, f, this::majority, opts);

    // 3. Multicast an AppendEntries message (exclude self)
    Message msg=new ObjectMessage(null, entries)
      .putHeader(id, new AppendEntriesRequest(this.local_addr, current_term, prev_index, prev_term,
                                              current_term, commit_index))
      .setFlag(Message.TransientFlag.DONT_LOOPBACK); // don't receive my own request
    down_prot.down(msg);

    snapshotIfNeeded(length);

    // see if we can already commit some entries
    long highest_committed=prev_index+1;
    while(reqtab.isCommitted(highest_committed))
        highest_committed++;
    if(highest_committed > prev_index+1)
        commitLogTo(highest_committed, true);
}

/**
 * Dispatches a received RAFT message to the current role implementation, based on the header type. Messages
 * with a term smaller than the current term are dropped.
 */
public void handleUpRequest(Message msg, RaftHeader hdr) {
    // if hdr.term < current_term -> drop message
    // if hdr.term > current_term -> set current_term and become Follower, accept message
    // if hdr.term == current_term -> accept message
    int rc=currentTerm(hdr.curr_term);
    if(rc < 0)
        return;

    // same term (rc == 0)
    RaftImpl ri=impl;
    if(ri == null)
        return;

    if(hdr instanceof AppendEntriesRequest) {
        long current_term = currentTerm();
        AppendEntriesRequest r=(AppendEntriesRequest)hdr;
        ObjectMessage om=(ObjectMessage)msg;
        log.trace("%s: from %s, %s header %s", local_addr, msg.src(), om, r);
        AppendResult res=ri.handleAppendEntriesRequest(om.getObject(), msg.src(),
                                                       r.prev_log_index, r.prev_log_term, r.entry_term, r.leader_commit);
        res.commitIndex(commit_index);
        Message rsp=new EmptyMessage(msg.src()).putHeader(id, new AppendEntriesResponse(current_term, res));
        down_prot.down(rsp);
    }
    else if(hdr instanceof AppendEntriesResponse) {
        AppendEntriesResponse rsp=(AppendEntriesResponse)hdr;
        log.trace("%s: from %s res %s", local_addr, msg.src(), rsp);
        ri.handleAppendEntriesResponse(msg.src(),rsp.curr_term, rsp.result);
    }
    else if(hdr instanceof InstallSnapshotRequest) {
        InstallSnapshotRequest req=(InstallSnapshotRequest)hdr;
        ri.handleInstallSnapshotRequest(msg, req.leader, req.last_included_index, req.last_included_term);
    }
    else
        log.warn("%s: invalid header %s",local_addr,hdr.getClass().getCanonicalName());
}

/**
 * Body of the runner: waits up to resend_interval ms for a request. On timeout, resends via the commit table;
 * otherwise drains the queue repeatedly and processes each drained batch, updating the drain statistics.
 */
protected void processQueue() {
    Request first_req;
    try {
        first_req=processing_queue.poll(resend_interval, TimeUnit.MILLISECONDS);
        if(first_req == null) { // poll() timed out
            if(commit_table != null)
                commit_table.forEach(this::sendAppendEntriesMessage);
            return;
        }
        for(;;) {
            remove_queue.clear();
            if(first_req != null) {
                remove_queue.add(first_req);
                first_req=null;
            }
            processing_queue.drainTo(remove_queue);
            int num=remove_queue.size();
            if(num > 0) {
                drained_total.add(num);
                drained_avg.add(num);
                final AtomicInteger down_r=new AtomicInteger(), up_r=new AtomicInteger();
                remove_queue.forEach(r -> {
                    if(r instanceof DownRequest)
                        down_r.incrementAndGet();
                    else if(r instanceof UpRequest)
                        up_r.incrementAndGet();
                });
                drained_down.add(down_r.get());
                drained_up.add(up_r.get());
            }
            if(remove_queue.isEmpty())
                return;
            else
                process(remove_queue);
        }
    }
    catch(InterruptedException ignored) {
        // deliberately ignored (as before); NOTE(review): presumably the runner checks its own state - confirm
    }
}

/**
 * Processes a batch of requests: up requests are handled one by one, snapshot requests take a snapshot, and
 * all down requests are collected into a single AppendEntries message which is multicast and appended to the
 * local log.
 */
protected void process(List<Request> q) {
    RequestTable<String> reqtab=request_table;
    LogEntries entries=new LogEntries();
    long index=last_appended+1;
    int length=0;
    long current_term = currentTerm();
    Address leader = leader();

    for(Request r: q) {
        try {
            if(r instanceof UpRequest) {
                UpRequest up=(UpRequest)r;
                handleUpRequest(up.msg, up.hdr);
            }
            else if(r instanceof DownRequest) {
                DownRequest dr=(DownRequest)r;
                // Complete the request exceptionally.
                // The request could either be lost in the reqtab reference or fail with an NPE below.
                // It would only complete in case a timeout is associated.
                if(!isLeader()) {
                    dr.f.completeExceptionally(notCurrentLeader());
                    continue;
                }
                entries.add(new LogEntry(current_term, dr.buf, dr.offset, dr.length, dr.internal));

                // Add the request to the client table, so we can return results to clients when done
                reqtab.create(index++, raft_id, dr.f, this::majority, dr.options);
                length+=dr.length;
            }
            else if(r instanceof SnapshotRequest) {
                SnapshotRequest sr = (SnapshotRequest) r;
                try {
                    takeSnapshot();
                    sr.f.complete(null);
                }
                catch(Exception e) {
                    sr.f.completeExceptionally(e);
                    throw e; // logged by the enclosing catch
                }
            }
        }
        catch(Throwable ex) {
            log.error("%s: failed handling request %s: %s", local_addr, r, ex);
        }
    }

    if(entries.size() == 0)
        return;

    // handle down requests
    if(leader == null || !Objects.equals(leader,local_addr))
        throw notCurrentLeader();

    // Append to the log
    long prev_index=last_appended;
    long curr_index=last_appended+1;
    LogEntry entry=log_impl.get(prev_index);
    long prev_term=entry != null? entry.term : 0;

    // Multicast an AppendEntries message (exclude self)
    Message msg=new ObjectMessage(null, entries)
      .putHeader(id, new AppendEntriesRequest(this.local_addr, current_term, prev_index, prev_term,
                                              current_term, commit_index))
      .setFlag(Message.TransientFlag.DONT_LOOPBACK); // don't receive my own request
    down_prot.down(msg);

    // Appends entries to my own log
    last_appended=log_impl.append(curr_index, entries);
    int batch_size=entries.size();
    num_successful_append_requests+=batch_size;
    avg_append_entries_batch_size.add(batch_size);

    // see if we can already commit some entries
    long highest_committed=prev_index+1;
    while(reqtab.isCommitted(highest_committed))
        highest_committed++;
    if(highest_committed > prev_index+1)
        commitLogTo(highest_committed, true);

    snapshotIfNeeded(length);
}

/** Creates (but does not throw) the exception used when an operation requires this member to be the leader. */
IllegalStateException notCurrentLeader() {
    return new IllegalStateException("I'm not the leader (local_addr=" + local_addr + ", leader=" + leader() + ")");
}

/** Populate with non-committed entries (from log) (https://github.com/belaban/jgroups-raft/issues/31) */
protected void createRequestTable() {
    request_table=new RequestTable<>();
    for(long i=this.commit_index+1; i <= this.last_appended; i++)
        request_table.create(i, raft_id, null, this::majority);
}

/** Creates the commit table for all view members except self, with next-index set to last_appended + 1. */
protected void createCommitTable() {
    List<Address> jg_mbrs=view != null? view.getMembers() : new ArrayList<>();
    List<Address> mbrs=new ArrayList<>(jg_mbrs);
    mbrs.remove(local_addr);
    commit_table=new CommitTable(mbrs, last_appended +1);
}

/** Adds name to the members (if not yet present) and recomputes the majority; no-op for null. */
protected void _addServer(String name) {
    if(name == null)
        return;
    List<String> current=internal_state.getMembers();
    if(!current.contains(name)) {
        current.add(name);
        internal_state.setMembers(current);
        computeMajority();
    }
}

/** Removes name from the members (if present) and recomputes the majority; no-op for null. */
protected void _removeServer(String name) {
    if(name == null)
        return;
    List<String> current=internal_state.getMembers();
    if(current.remove(name)) {
        internal_state.setMembers(current);
        computeMajority();
    }
}

/**
 * Runs (on the leader) as part of the queue handling loop: checks if all members (except the leader) in the commit
 * table have received all messages and resends AppendEntries messages to members who haven't.
 * <p>
 * For each member, a next-index and a match-index are maintained: next-index is the index of the next message to
 * send to that member (initialized to last-appended + 1) and match-index is the index of the highest message
 * known to have been received by the member.
 * <p>
 * Messages are resent to a given member as long as that member's match-index is smaller than its next-index. When
 * match_index == next_index, message resending for that member is stopped. When a new message is sent,
 * next-index is incremented (on reception of the AppendResult) and resending starts again.
 */
protected void sendAppendEntriesMessage(Address member, CommitTable.Entry e) {
    if(e.nextIndex() < log().firstAppended()) {
        try {
            sendSnapshotTo(member); // will reset snapshot_in_progress
        }
        catch(Exception ex) {
            // include the cause; previously the exception was dropped from the log message
            log.error("%s: failed sending snapshot to %s: next_index=%d, first_applied=%d: %s",
                      local_addr, member, e.nextIndex(), log().firstAppended(), ex);
        }
        return;
    }

    if(this.last_appended >= e.nextIndex()) {
        long to=e.sendSingleMessage()? e.nextIndex() : last_appended;
        long from=Math.max(e.nextIndex(),1);
        if(log.isTraceEnabled())
            log.trace("%s: resending [%d..%d] to %s", local_addr, from, to, member);
        resend(member, from, to);
        return;
    }

    if(this.last_appended > e.matchIndex()) {
        long index=this.last_appended;
        if(index > 0) {
            log.trace("%s: resending %d to %s", local_addr, index, member);
            resend(member, index);
        }
        return;
    }

    if(this.commit_index > e.commitIndex()) { // send an empty AppendEntries message as commit message
        long current_term = currentTerm();
        Message msg=new ObjectMessage(member, null)
          .putHeader(id, new AppendEntriesRequest(this.local_addr, current_term, 0, 0, current_term, this.commit_index));
        down_prot.down(msg);
        return;
    }

    if(this.commit_index < this.last_appended) // fixes https://github.com/belaban/jgroups-raft/issues/30
        resend(member, this.commit_index+1, this.last_appended);
}

/**
 * Submits an internal command that adds or removes a server. Requests are chained on add_server_future so that
 * only one membership change is in flight at a time.
 * @throws Exception If dynamic view changes are disabled
 * @throws IllegalStateException If this member is not the current leader
 */
protected CompletableFuture<byte[]> changeMembers(String name, InternalCommand.Type type) throws Exception {
    if(!dynamic_view_changes)
        throw new Exception("dynamic view changes are not allowed; set dynamic_view_changes to true to enable it");

    Address leader = leader();
    if(leader == null || !Objects.equals(leader, local_addr))
        throw notCurrentLeader();

    InternalCommand cmd=new InternalCommand(type, name);
    byte[] buf=Util.streamableToByteBuffer(cmd);

    // only add/remove one server at a time (https://github.com/belaban/jgroups-raft/issues/175)
    return add_server_future=add_server_future
      // Use handle, so we can execute even if the previous execution failed.
      .handle((ignore, t) -> setAsync(buf, 0, buf.length, true, null))
      // Chain the new setAsync invocation.
      .thenCompose(Function.identity());
}

/** Resends the single entry at index to target; logs an error if the entry is not in the log. */
protected void resend(Address target, long index) {
    LogEntry entry=log_impl.get(index);
    if(entry == null) {
        log.error("%s: resending of %d failed; entry not found", local_addr, index);
        return;
    }
    LogEntry prev=log_impl.get(index-1);
    long prev_term=prev != null? prev.term : 0;
    LogEntries entries=new LogEntries().add(entry);
    Message msg=new ObjectMessage(target, entries)
      .putHeader(id, new AppendEntriesRequest(this.local_addr, currentTerm(), index - 1, prev_term,
                                              entry.term, commit_index));
    down_prot.down(msg);
    num_resends++;
}

/** Resends all entries in range [from .. to] to target */
protected void resend(Address target, long from, long to) {
    LogEntries entries=new LogEntries();
    long entry_term=0; // term of first entry to resend
    for(long i=from; i <= to; i++) {
        LogEntry e=log_impl.get(i);
        if(e == null) {
            log.error("%s: resending of %d failed; entry not found", local_addr, i);
            break;
        }
        if(entry_term <= 0)
            entry_term=e.term();
        entries.add(e);
    }

    LogEntry prev=log_impl.get(from-1);
    long prev_term=prev != null? prev.term : 0;
    Message msg=new ObjectMessage(target, entries)
      .putHeader(id, new AppendEntriesRequest(this.local_addr, currentTerm(), from - 1, prev_term,
                                              entry_term, commit_index));
    down_prot.down(msg);
    num_resends++;
}

/** Takes a snapshot and sends it, together with the index and term of the last committed entry, to dest. */
protected void sendSnapshotTo(Address dest) throws Exception {
    LogEntry last_committed_entry=log_impl.get(commitIndex());
    long last_index=commit_index, last_term=last_committed_entry.term;
    takeSnapshot();
    ByteBuffer data=log_impl.getSnapshot();
    log.debug("%s: sending snapshot (%s) to %s", local_addr, Util.printBytes(data.position()), dest);
    Message msg=new BytesMessage(dest, data)
      .putHeader(id, new InstallSnapshotRequest(currentTerm(), leader(), last_index, last_term));
    down_prot.down(msg);
}

/**
 * Tries to move commit_index up to index_inclusive, apply the entries in [commit_index+1 .. index_inclusive]
 * to the state machine and notify the clients for each entry. There is no need to check if an entry is committed
 * in RequestTable, as this was done before calling this method.
 * @param index_inclusive The index to which to move commit_index
 * @param serialize_response When true, the response of applying a change to the state machine needs to be serialized
 *                           into a byte[] array, otherwise null can be returned (reducing serialization cost)
 */
protected RAFT commitLogTo(long index_inclusive, boolean serialize_response) {
    long to=Math.min(last_appended, index_inclusive);
    long last_successful_apply=applyCommits(to, serialize_response);
    commit_index=Math.max(commit_index, last_successful_apply);
    log_impl.commitIndex(commit_index);
    return this;
}

/** Appends to the log and returns true if added or false if not (e.g. because the entry already existed) */
protected boolean append(long index, LogEntries entries) {
    if(index <= last_appended)
        return false;
    last_appended=log_impl.append(index, entries);
    snapshotIfNeeded((int)entries.totalSize());
    return true;
}

/** Deletes all entries from index onwards and re-reads last_appended and commit_index from the log. */
protected void deleteAllLogEntriesStartingFrom(long index) {
    log_impl.deleteAllEntriesStartingFrom(index);
    last_appended=log_impl.lastAppended();
    commit_index=log_impl.commitIndex();
}

/** Adds bytes_added to the tracked log size and takes a snapshot once max_log_size is reached. */
protected void snapshotIfNeeded(int bytes_added) {
    curr_log_size+=bytes_added;
    if(curr_log_size >= max_log_size) {
        try {
            this.log.debug("%s: current log size is %d, exceeding max_log_size of %d: creating snapshot",
                           local_addr, curr_log_size, max_log_size);
            takeSnapshot();
        }
        catch(Exception ex) {
            log.error("%s: failed snapshotting log: %s", local_addr, ex);
        }
    }
}

/**
 * Writes the internal state and the state machine contents into a snapshot, stores it in the log and truncates
 * the log at the commit index.
 * @throws IllegalStateException If no state machine is set
 */
protected void takeSnapshot() throws Exception {
    if(state_machine == null)
        throw new IllegalStateException("state machine is null");

    ByteArrayDataOutputStream out=new ByteArrayDataOutputStream(128, true);
    internal_state.writeTo(out);
    state_machine.writeContentTo(out);
    ByteBuffer buf=ByteBuffer.wrap(out.buffer(), 0, out.position());
    log_impl.setSnapshot(buf);
    log_impl.truncate(commitIndex());
    num_snapshots++;
    // curr_log_size=logSizeInBytes();
    // this is faster than calling logSizeInBytes(), but may not be accurate: if commit-index is way
    // behind last-appended, then this may perform the next truncation later than it should
    curr_log_size=0;
}

/**
 * Applies log entries [commit_index+1 .. to_inclusive] to the state machine and notifies clients in RequestTable.
 * @param to_inclusive The end index (inclusive) of the log entries to apply
 * @param serialize_response Whether or not {@link StateMachine#apply(byte[], int, int, boolean)} needs to return a serialized
 *                           response
 * @return The last index of the range of log entries that was successfuly applied (normally this is to_inclusive)
 */
protected long applyCommits(long to_inclusive, boolean serialize_response) {
    long last_successful_apply=commit_index;
    for(long i=commit_index+1; i <= to_inclusive; i++) {
        try {
            applyCommit(i, serialize_response);
            last_successful_apply=i;
        }
        catch(Throwable t) {
            log.error("%s: failed moving commit_index to %d: %s", local_addr, to_inclusive, t);
            return last_successful_apply;
        }
    }
    return last_successful_apply;
}

/** Applies the commit at index */
protected void applyCommit(long index, boolean serialize_response) throws Exception {
    // Apply the modifications to the state machine
    LogEntry log_entry=log_impl.get(index);
    if(log_entry == null)
        throw new IllegalStateException(local_addr + ": log entry for index " + index + " not found in log");
    byte[] rsp=null;
    RequestTable.Entry entry=request_table != null? request_table.remove(index) : null;
    if(log_entry.internal) {
        try {
            InternalCommand cmd=Util.streamableFromByteBuffer(InternalCommand.class, log_entry.command,
                                                              log_entry.offset, log_entry.length);
            cmd.execute(this);
        }
        catch(Throwable t) {
            notify(entry, t);
        }
    }
    else {
        Options opts=entry != null? entry.options() : null;
        if(opts != null && opts.ignoreReturnValue())
            serialize_response=false;
        try {
            rsp=state_machine.apply(log_entry.command, log_entry.offset, log_entry.length, serialize_response);
        }
        catch(Throwable t) {
            notify(entry, t);
        }
    }
    // NOTE(review): also reached after a failure was reported above; presumably the second notification is a
    // no-op on an already completed entry - confirm against RequestTable.Entry
    notify(entry, rsp);
}

/** Installs the new view, adjusts the commit table and, if the view grew, checks for duplicate raft-ids. */
public void handleView(View view) {
    boolean check_view=this.view != null && this.view.size() < view.size();
    this.view=view;
    if(commit_table != null) {
        List<Address> mbrs=new ArrayList<>(view.getMembers());
        mbrs.remove(local_addr);
        commit_table.adjust(mbrs, last_appended + 1);
    }

    // if we're the leader, check if the view contains no duplicate raft-ids
    if(check_view && duplicatesInView(view))
        log.error("view contains duplicate raft-ids: %s", view);
}

/** Sets the new leader, passing 0 as the term. */
public RAFT setLeaderAndTerm(Address new_leader) {
    return setLeaderAndTerm(new_leader, 0);
}

/** Sets the new leader and term */
public RAFT setLeaderAndTerm(Address new_leader, long new_term) {
    raft_state.tryAdvanceTermAndLeader(new_term, new_leader);
    return this;
}

/** Switches to the Leader role if new_leader is the local address, otherwise to the Follower role. */
private void leaderUpdated(Address new_leader) {
    if(Objects.equals(local_addr, new_leader)) {
        if(!isLeader())
            log.debug("%s: becoming Leader", local_addr);
        changeRole(Role.Leader); // no-op if already a leader
    }
    else
        changeRole(Role.Follower); // no-op if already a follower
}

/** Null-safe notification of a request table entry with a response. */
protected static void notify(RequestTable.Entry e, byte[] rsp) {
    if(e != null)
        e.notify(rsp);
}

/** Null-safe notification of a request table entry with a failure. */
protected static void notify(RequestTable.Entry e, Throwable t) {
    if(e != null)
        e.notify(t);
}

/**
 * Installs a new role implementation if its class differs from the current one: destroys the old
 * implementation, initializes the new one and notifies the role change listeners.
 */
protected RAFT changeRole(Role new_role) {
    RaftImpl new_impl=new_role == Role.Leader? new Leader(this) : new Follower(this);
    RaftImpl old_impl=impl;
    if(old_impl == null || !old_impl.getClass().equals(new_impl.getClass())) {
        if(old_impl != null)
            old_impl.destroy();
        new_impl.init();
        impl=new_impl;
        log.trace("%s: changed role from %s -> %s", local_addr, old_impl == null? "null" :
          old_impl.getClass().getSimpleName(), new_impl.getClass().getSimpleName());
        notifyRoleChangeListeners(new_role);
    }
    return this;
}

/** If cmd is not null, execute it. Else parse buf into InternalCommand then call cmd.execute() */
protected void executeInternalCommand(InternalCommand cmd, byte[] buf, int offset, int length) {
    if(cmd == null) {
        try {
            cmd=Util.streamableFromByteBuffer(InternalCommand.class, buf, offset, length);
        }
        catch(Exception ex) {
            log.error("%s: failed unmarshalling internal command: %s", local_addr, ex);
            return;
        }
    }

    try {
        cmd.execute(this);
    }
    catch(Exception ex) {
        log.error("%s: failed executing internal command %s: %s", local_addr, cmd, ex);
    }
}

/**
 * Builds the log name: relative names are prefixed with log_dir, and the suffix (with a leading dot) is
 * appended unless name already ends with it.
 */
protected String createLogName(String name, String suffix) {
    if(!suffix.startsWith("."))
        suffix="." + suffix;
    boolean needs_suffix=!name.endsWith(suffix);
    String retval=name;
    if(!new File(name).isAbsolute()) {
        retval=log_dir + File.separator + name;
    }
    return needs_suffix? retval + suffix : retval;
}

/** Invokes all role change listeners; failures of individual listeners are ignored. */
protected void notifyRoleChangeListeners(Role role) {
    for(RoleChange ch: role_change_listeners) {
        try {
            ch.roleChanged(role);
        }
        catch(Throwable ignored) {} // a failing listener must not prevent the others from being notified
    }
}

/** Checks if a given view contains duplicate raft-ids. Uses key raft-id in ExtendedUUID to compare */
protected boolean duplicatesInView(View view) {
    Set<String> mbrs=new HashSet<>();
    for(Address addr : view) {
        if(!(addr instanceof ExtendedUUID))
            log.warn("address %s is not an ExtendedUUID but a %s", addr, addr.getClass().getSimpleName());
        else {
            ExtendedUUID uuid=(ExtendedUUID)addr;
            byte[] val=uuid.get(raft_id_key);
            String m=val != null? Util.bytesToString(val) : null;
            if(m == null)
                log.error("address %s doesn't have a raft-id", addr);
            else if(!mbrs.add(m))
                return true;
        }
    }
    return false;
}

/** Matches one {@code key=value} pair, optionally followed by a comma; compiled once instead of per call. */
private static final Pattern COMMA_DELIMITED_PROP=Pattern.compile("\\s*([^=\\s]+)\\s*=\\s*([^=\\s,]+)\\s*,?");

/**
 * Parses a string of the form {@code k1=v1,k2=v2} into a map.
 * @return The parsed properties, or null if s is null
 */
protected static Map<String,String> parseCommaDelimitedProps(String s) {
    if(s == null)
        return null;
    Map<String,String> props=new HashMap<>();
    Matcher matcher=COMMA_DELIMITED_PROP.matcher(s);
    while(matcher.find()) {
        props.put(matcher.group(1), matcher.group(2));
    }
    return props;
}

/** Callback invoked by {@link #notifyRoleChangeListeners(Role)} when the role changes. */
public interface RoleChange {
    void roleChanged(Role role);
}

/** Sets majority to (number of members / 2) + 1. */
protected void computeMajority() {
    majority=(internal_state.getMembers().size() / 2) + 1;
}

/** Base class of all requests placed on the processing queue. */
protected static class Request {
    /** Called when the request could not be enqueued; does nothing by default. */
    protected void failed(Throwable t) {
    }
}

/** Received by up(Message) or up(MessageBatch) */
protected static class UpRequest extends Request {
    private final Message    msg;
    private final RaftHeader hdr;

    public UpRequest(Message msg, RaftHeader hdr) {
        this.msg=msg;
        this.hdr=hdr;
    }

    public String toString() {
        return String.format("%s %s", UpRequest.class.getSimpleName(), hdr);
    }
}

/** Generated by {@link RAFT#setAsync(byte[], int, int, Options)} */
protected static class DownRequest extends Request {
    final CompletableFuture<byte[]> f;
    final byte[]                    buf;
    final int                       offset, length;
    final boolean                   internal;
    final Options                   options;

    public DownRequest(CompletableFuture<byte[]> f, byte[] buf, int offset, int length, boolean internal, Options opts) {
        this.f=f;
        this.buf=buf;
        this.offset=offset;
        this.length=length;
        this.internal=internal;
        this.options=opts;
    }

    /** Completes the caller's future exceptionally. */
    @Override
    protected final void failed(Throwable t) {
        f.completeExceptionally(t);
    }

    public String toString() {
        return String.format("%s %d bytes", DownRequest.class.getSimpleName(), length);
    }
}

/** Generated by {@link RAFT#snapshotAsync()} */
protected static class SnapshotRequest extends Request {
    final CompletableFuture<Void> f;

    public SnapshotRequest(CompletableFuture<Void> f) {
        this.f = f;
    }

    /** Completes the caller's future exceptionally. */
    @Override
    protected final void failed(Throwable t) {
        f.completeExceptionally(t);
    }

    @Override
    public String toString() {
        return SnapshotRequest.class.getSimpleName();
    }
}
} // end of class RAFT




© 2015 - 2025 Weber Informatics LLC | Privacy Policy