org.jgroups.protocols.SEQUENCER Maven / Gradle / Ivy
Go to download
This artifact provides a single jar that contains all classes required to use remote EJB and JMS, including
all dependencies. It is intended for use by those not using Maven; Maven users should just import the EJB and
JMS BOMs instead (shaded JARs cause lots of problems with Maven, as it is very easy to inadvertently end up
with different versions of classes on the class path).
The newest version!
package org.jgroups.protocols;
import org.jgroups.*;
import org.jgroups.annotations.MBean;
import org.jgroups.annotations.ManagedAttribute;
import org.jgroups.annotations.ManagedOperation;
import org.jgroups.annotations.Property;
import org.jgroups.stack.Protocol;
import org.jgroups.util.*;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Supplier;
/**
* Implementation of total order protocol using a sequencer.
* Consult SEQUENCER.txt for details
* @author Bela Ban
*/
@MBean(description="Implementation of total order protocol using a sequencer")
public class SEQUENCER extends Protocol {
protected volatile Address coord;
protected volatile View view;
@ManagedAttribute
protected volatile boolean is_coord;
protected final AtomicLong seqno=new AtomicLong(0);
/** Maintains messages forwarded to the coord which which no ack has been received yet.
* Needs to be sorted so we resend them in the right order
*/
protected final NavigableMap forward_table=new ConcurrentSkipListMap<>();
protected final Lock send_lock=new ReentrantLock();
protected final Condition send_cond=send_lock.newCondition();
/** When ack_mode is set, we need to wait for an ack for each forwarded message until we can send the next one */
@ManagedAttribute(description="is ack-mode enabled or not")
protected volatile boolean ack_mode=true;
/** Set when we block all sending threads to resend all messages from forward_table */
protected volatile boolean flushing;
protected volatile boolean running=true;
/** Keeps track of the threads sending messages */
protected final AtomicInteger in_flight_sends=new AtomicInteger(0);
// Maintains received seqnos, so we can weed out dupes
protected final ConcurrentMap> delivery_table=Util.createConcurrentMap();
protected volatile Flusher flusher;
/** Used for each resent message to wait until the message has been received */
protected final Promise ack_promise=new Promise<>();
protected MessageFactory msg_factory;
@Property(description="Size of the set to store received seqnos (for duplicate checking)")
protected int delivery_table_max_size=2000;
@Property(description="Number of acks needed before going from ack-mode to normal mode. " +
"0 disables this, which means that ack-mode is always on")
protected int threshold=10;
@Property(description="If true, all messages in the forward-table are sent to the new coord, else thye're " +
"dropped (https://issues.redhat.com/browse/JGRP-2268)")
protected boolean flush_forward_table=true;
@ManagedAttribute protected int num_acks;
@ManagedAttribute protected long forwarded_msgs;
@ManagedAttribute protected long bcast_msgs;
@ManagedAttribute protected long received_forwards;
@ManagedAttribute protected long received_bcasts;
@ManagedAttribute protected long delivered_bcasts;
@ManagedAttribute
public boolean isCoordinator() {return is_coord;}
public Address getCoordinator() {return coord;}
@ManagedAttribute(description="Number of messages in the forward-table")
public int getForwardTableSize() {return forward_table.size();}
public SEQUENCER setThreshold(int new_threshold) {this.threshold=new_threshold; return this;}
public SEQUENCER setDeliveryTableMaxSize(int size) {delivery_table_max_size=size; return this;}
@ManagedOperation
public void resetStats() {
forwarded_msgs=bcast_msgs=received_forwards=received_bcasts=delivered_bcasts=0L;
}
public void init() throws Exception {
super.init();
msg_factory=getTransport().getMessageFactory();
}
public void start() throws Exception {
super.start();
running=true;
ack_mode=true;
}
public void stop() {
running=false;
unblockAll();
stopFlusher();
super.stop();
}
public Object down(Event evt) {
switch(evt.getType()) {
case Event.VIEW_CHANGE:
handleViewChange(evt.getArg());
break;
case Event.TMP_VIEW:
handleTmpView(evt.getArg());
break;
}
return down_prot.down(evt);
}
public Object down(Message msg) {
if(msg.getDest() != null || msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER) || msg.isFlagSet(Message.Flag.OOB))
return down_prot.down(msg);
if(msg.getSrc() == null)
msg.setSrc(local_addr);
if(flushing)
block();
// A seqno is not used to establish ordering, but only to weed out duplicates; next_seqno doesn't need
// to increase monotonically, but only to be unique (https://issues.redhat.com/browse/JGRP-1461) !
long next_seqno=seqno.incrementAndGet();
in_flight_sends.incrementAndGet();
try {
SequencerHeader hdr=new SequencerHeader(is_coord? SequencerHeader.BCAST : SequencerHeader.WRAPPED_BCAST, next_seqno);
msg.putHeader(this.id, hdr);
if(log.isTraceEnabled())
log.trace("[" + local_addr + "]: forwarding " + local_addr + "::" + seqno + " to coord " + coord);
// We always forward messages to the coordinator, even if we're the coordinator. Having the coord
// send its messages directly led to starvation of messages from other members. MPerf perf went up
// from 20MB/sec/node to 50MB/sec/node with this change !
forwardToCoord(next_seqno, msg);
}
catch(Exception ex) {
log.error(Util.getMessage("FailedSendingMessage"), ex);
}
finally {
in_flight_sends.decrementAndGet();
}
return null; // don't pass down
}
public Object up(Event evt) {
switch(evt.getType()) {
case Event.VIEW_CHANGE:
Object retval=up_prot.up(evt);
handleViewChange(evt.getArg());
return retval;
case Event.TMP_VIEW:
handleTmpView(evt.getArg());
break;
}
return up_prot.up(evt);
}
public Object up(Message msg) {
SequencerHeader hdr;
if(msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER) || msg.isFlagSet(Message.Flag.OOB))
return up_prot.up(msg);
hdr=msg.getHeader(this.id);
if(hdr == null)
return up_prot.up(msg); // pass up
switch(hdr.type) {
case SequencerHeader.FORWARD:
case SequencerHeader.FLUSH:
if(!is_coord) {
if(log.isErrorEnabled())
log.error(local_addr + ": non-coord; dropping FORWARD request from " + msg.getSrc());
return null;
}
Address sender=msg.getSrc();
if(view != null && !view.containsMember(sender)) {
log.error("%s: dropping FORWARD request from non-member %s; view=%s", local_addr, sender, view);
return null;
}
broadcast(msg, true, msg.getSrc(), hdr.seqno, hdr.type == SequencerHeader.FLUSH); // do copy the message
received_forwards++;
break;
case SequencerHeader.BCAST:
deliver(msg, hdr);
received_bcasts++;
break;
case SequencerHeader.WRAPPED_BCAST:
unwrapAndDeliver(msg, hdr.flush_ack); // unwrap the original message (in the payload) and deliver it
received_bcasts++;
break;
}
return null;
}
public void up(MessageBatch batch) {
Iterator it=batch.iterator();
while(it.hasNext()) {
Message msg=it.next();
if(msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER) || msg.isFlagSet(Message.Flag.OOB) || msg.getHeader(id) == null)
continue;
it.remove();
// simplistic implementation
try {
up(msg);
}
catch(Throwable t) {
log.error(Util.getMessage("FailedPassingUpMessage"), t);
}
}
if(!batch.isEmpty())
up_prot.up(batch);
}
/* --------------------------------- Private Methods ----------------------------------- */
protected void handleViewChange(View v) {
List mbrs=v.getMembers();
if(mbrs.isEmpty()) return;
if(view == null || view.compareTo(v) < 0)
view=v;
else
return;
delivery_table.keySet().retainAll(mbrs);
Address existing_coord=coord, new_coord=mbrs.get(0);
boolean coord_changed=!Objects.equals(existing_coord, new_coord);
if(coord_changed && new_coord != null) {
stopFlusher();
startFlusher(new_coord); // needs to be done in the background, to prevent blocking if down() would block
}
}
protected void flush(final Address new_coord) throws InterruptedException {
// wait until all threads currently sending messages have returned (new threads after flushing=true) will block
// flushing is set to true in startFlusher()
while(flushing && running) {
if(in_flight_sends.get() == 0)
break;
Thread.sleep(100);
}
send_lock.lockInterruptibly();
try {
if(log.isTraceEnabled())
log.trace(local_addr + ": coord changed from " + coord + " to " + new_coord);
coord=new_coord;
is_coord=Objects.equals(local_addr, coord);
if(flush_forward_table)
flushMessagesInForwardTable();
}
finally {
if(log.isTraceEnabled())
log.trace(local_addr + ": flushing completed");
flushing=false;
ack_mode=true; // go to ack-mode after flushing
num_acks=0;
send_cond.signalAll();
send_lock.unlock();
}
}
// If we're becoming coordinator, we need to handle TMP_VIEW as
// an immediate change of view. See JGRP-1452.
private void handleTmpView(View v) {
Address new_coord=v.getCoord();
if(new_coord != null && !new_coord.equals(coord) && local_addr != null && local_addr.equals(new_coord))
handleViewChange(v);
}
/**
* Sends all messages currently in forward_table to the new coordinator (changing the dest field).
* This needs to be done, so the underlying reliable unicast protocol (e.g. UNICAST) adds these messages
* to its retransmission mechanism
* Note that we need to resend the messages in order of their seqnos! We also need to prevent other message
* from being inserted until we're done, that's why there's synchronization.
* Access to the forward_table doesn't need to be synchronized as there won't be any insertions during flushing
* (all down-threads are blocked)
*/
protected void flushMessagesInForwardTable() {
if(is_coord) {
for(Map.Entry entry: forward_table.entrySet()) {
Long key=entry.getKey();
Message msg=entry.getValue();
ByteArray buf;
try {
buf=Util.messageToBuffer(msg);
}
catch(Exception e) {
log.error(Util.getMessage("FlushingBroadcastingFailed"), e);
continue;
}
SequencerHeader hdr=new SequencerHeader(SequencerHeader.WRAPPED_BCAST, key);
Message forward_msg=new BytesMessage(null, buf).putHeader(this.id, hdr);
if(log.isTraceEnabled())
log.trace(local_addr + ": flushing (broadcasting) " + local_addr + "::" + key);
down_prot.down(forward_msg);
}
return;
}
// for forwarded messages, we need to receive the forwarded message from the coordinator, to prevent this case:
// - V1={A,B,C}
// - A crashes
// - C installs V2={B,C}
// - C forwards messages 3 and 4 to B (the new coord)
// - B drops 3 because its view is still V1
// - B installs V2
// - B receives message 4 and broadcasts it
// ==> C's message 4 is delivered *before* message 3 !
// ==> By resending 3 until it is received, then resending 4 until it is received, we make sure this won't happen
// (see https://issues.redhat.com/browse/JGRP-1449)
while(flushing && running && !forward_table.isEmpty()) {
Map.Entry entry=forward_table.firstEntry();
final Long key=entry.getKey();
Message msg=entry.getValue();
ByteArray buf;
try {
buf=Util.messageToBuffer(msg);
}
catch(Exception e) {
log.error(Util.getMessage("FlushingBroadcastingFailed"), e);
continue;
}
while(flushing && running && !forward_table.isEmpty()) {
SequencerHeader hdr=new SequencerHeader(SequencerHeader.FLUSH, key);
Message forward_msg=new BytesMessage(coord, buf).putHeader(this.id, hdr).setFlag(Message.Flag.DONT_BUNDLE);
if(log.isTraceEnabled())
log.trace(local_addr + ": flushing (forwarding) " + local_addr + "::" + key + " to coord " + coord);
ack_promise.reset();
down_prot.down(forward_msg);
Long ack=ack_promise.getResult(500);
if((Objects.equals(ack, key)) || !forward_table.containsKey(key))
break;
}
}
}
protected void forwardToCoord(long seqno, Message msg) {
if(is_coord) {
forward(msg, seqno, false);
return;
}
if(!running || flushing) {
forward_table.put(seqno, msg);
return;
}
if(!ack_mode) {
forward_table.put(seqno, msg);
forward(msg, seqno, false);
return;
}
send_lock.lock();
try {
forward_table.put(seqno, msg);
while(running && !flushing) {
ack_promise.reset();
forward(msg, seqno, true);
if(!ack_mode || !running || flushing)
break;
Long ack=ack_promise.getResult(500);
if((Objects.equals(ack, seqno)) || !forward_table.containsKey(seqno))
break;
}
}
finally {
send_lock.unlock();
}
}
protected void forward(final Message msg, long seqno, boolean flush) {
Address target=coord;
if(target == null)
return;
byte type=flush? SequencerHeader.FLUSH : SequencerHeader.FORWARD;
try {
SequencerHeader hdr=new SequencerHeader(type, seqno);
Message forward_msg=new BytesMessage(target, Util.messageToBuffer(msg)).putHeader(this.id, hdr);
down_prot.down(forward_msg);
forwarded_msgs++;
}
catch(Exception ex) {
log.error(Util.getMessage("FailedForwardingMessageTo") + msg.getDest(), ex);
}
}
protected void broadcast(final Message msg, boolean copy, Address original_sender, long seqno, boolean resend) {
Message bcast_msg=null;
if(!copy) {
bcast_msg=msg; // no need to add a header, message already has one
}
else {
SequencerHeader new_hdr=new SequencerHeader(SequencerHeader.WRAPPED_BCAST, seqno);
bcast_msg=new BytesMessage(null, msg.getArray(), msg.getOffset(), msg.getLength()).putHeader(this.id, new_hdr);
if(resend) {
new_hdr.flush_ack=true;
bcast_msg.setFlag(Message.Flag.DONT_BUNDLE);
}
}
if(log.isTraceEnabled())
log.trace(local_addr + ": broadcasting " + original_sender + "::" + seqno);
down_prot.down(bcast_msg);
bcast_msgs++;
}
/**
* Unmarshal the original message (in the payload) and then pass it up (unless already delivered)
*/
protected void unwrapAndDeliver(final Message msg, boolean flush_ack) {
try {
// Message msg_to_deliver=Util.streamableFromBuffer(BytesMessage::new, msg.getArray(), msg.getOffset(), msg.getLength());
Message msg_to_deliver=Util.messageFromBuffer(msg.getArray(), msg.getOffset(), msg.getLength(), msg_factory);
SequencerHeader hdr=msg_to_deliver.getHeader(this.id);
if(flush_ack)
hdr.flush_ack=true;
deliver(msg_to_deliver, hdr);
}
catch(Exception ex) {
log.error(Util.getMessage("FailureUnmarshallingBuffer"), ex);
}
}
protected void deliver(Message msg, SequencerHeader hdr) {
Address sender=msg.getSrc();
if(sender == null) {
log.error("%s: sender is null, cannot deliver ::%d", local_addr, hdr.getSeqno());
return;
}
long msg_seqno=hdr.getSeqno();
if(sender.equals(local_addr)) {
forward_table.remove(msg_seqno);
if(hdr.flush_ack) {
ack_promise.setResult(msg_seqno);
if(ack_mode && !flushing && threshold > 0 && ++num_acks >= threshold) {
ack_mode=false;
num_acks=0;
}
}
}
if(!canDeliver(sender, msg_seqno)) {
log.warn("%s: dropped duplicate message %s::%d", local_addr, sender, msg_seqno);
return;
}
if(log.isTraceEnabled())
log.trace(local_addr + ": delivering " + sender + "::" + msg_seqno);
up_prot.up(msg);
delivered_bcasts++;
}
/**
* Checks if seqno has already been received from sender. This weeds out duplicates.
* Note that this method is never called concurrently for the same sender, as the sender in NAKACK will always be
* the coordinator.
*/
protected boolean canDeliver(Address sender, long seqno) {
BoundedHashMap seqno_set=delivery_table.get(sender);
if(seqno_set == null) {
seqno_set=new BoundedHashMap<>(delivery_table_max_size);
BoundedHashMap existing=delivery_table.put(sender,seqno_set);
if(existing != null)
seqno_set=existing;
}
return seqno_set.add(seqno, seqno);
}
protected void block() {
send_lock.lock();
try {
while(flushing && running) {
try {
send_cond.await();
}
catch(InterruptedException e) {
}
}
}
finally {
send_lock.unlock();
}
}
protected void unblockAll() {
flushing=false;
send_lock.lock();
try {
send_cond.signalAll();
ack_promise.setResult(null);
}
finally {
send_lock.unlock();
}
}
protected synchronized void startFlusher(final Address new_coord) {
if(flusher == null || !flusher.isAlive()) {
if(log.isTraceEnabled())
log.trace(local_addr + ": flushing started");
// causes subsequent message sends (broadcasts and forwards) to block (https://issues.redhat.com/browse/JGRP-1495)
flushing=true;
flusher=new Flusher(new_coord);
flusher.setName("Flusher");
flusher.start();
}
}
protected void stopFlusher() {
flushing=false;
Thread tmp=flusher;
while(tmp != null && tmp.isAlive()) {
tmp.interrupt();
ack_promise.setResult(null);
try {
tmp.join();
}
catch(InterruptedException e) {
}
}
}
/* ----------------------------- End of Private Methods -------------------------------- */
protected class Flusher extends Thread {
protected final Address new_coord;
public Flusher(Address new_coord) {
this.new_coord=new_coord;
}
public void run() {
try {
flush(new_coord);
}
catch (InterruptedException e) {
}
}
}
public static class SequencerHeader extends Header {
protected static final byte FORWARD = 1;
protected static final byte FLUSH = 2;
protected static final byte BCAST = 3;
protected static final byte WRAPPED_BCAST = 4;
protected byte type=-1;
protected long seqno=-1;
protected boolean flush_ack;
public SequencerHeader() {
}
public SequencerHeader(byte type) {
this.type=type;
}
public SequencerHeader(byte type, long seqno) {
this(type);
this.seqno=seqno;
}
public short getMagicId() {return 61;}
public long getSeqno() {
return seqno;
}
public Supplier extends Header> create() {return SequencerHeader::new;}
public String toString() {
StringBuilder sb=new StringBuilder(64);
sb.append(printType());
if(seqno >= 0)
sb.append(" seqno=" + seqno);
if(flush_ack)
sb.append(" (flush_ack)");
return sb.toString();
}
protected final String printType() {
switch(type) {
case FORWARD: return "FORWARD";
case FLUSH: return "FLUSH";
case BCAST: return "BCAST";
case WRAPPED_BCAST: return "WRAPPED_BCAST";
default: return "n/a";
}
}
@Override
public void writeTo(DataOutput out) throws IOException {
out.writeByte(type);
Bits.writeLongCompressed(seqno, out);
out.writeBoolean(flush_ack);
}
@Override
public void readFrom(DataInput in) throws IOException {
type=in.readByte();
seqno=Bits.readLongCompressed(in);
flush_ack=in.readBoolean();
}
@Override
public int serializedSize() {
return Global.BYTE_SIZE + Bits.size(seqno) + Global.BYTE_SIZE; // type + seqno + flush_ack
}
}
}