package org.yamcs.replication;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import javax.net.ssl.SSLException;
import org.yamcs.AbstractYamcsService;
import org.yamcs.ConfigurationException;
import org.yamcs.InitException;
import org.yamcs.Spec;
import org.yamcs.YConfiguration;
import org.yamcs.YamcsException;
import org.yamcs.YamcsServer;
import org.yamcs.YamcsServerInstance;
import org.yamcs.Spec.OptionType;
import org.yamcs.replication.Message.TransactionMessage;
import org.yamcs.replication.protobuf.ColumnInfo;
import org.yamcs.replication.protobuf.Request;
import org.yamcs.replication.protobuf.Response;
import org.yamcs.replication.protobuf.StreamInfo;
import org.yamcs.replication.protobuf.TimeMessage;
import org.yamcs.time.SimulationTimeService;
import org.yamcs.time.TimeService;
import org.yamcs.utils.DecodingException;
import org.yamcs.yarch.ColumnDefinition;
import org.yamcs.yarch.ColumnSerializer;
import org.yamcs.yarch.ColumnSerializerFactory;
import org.yamcs.yarch.DataType;
import org.yamcs.yarch.Stream;
import org.yamcs.yarch.Tuple;
import org.yamcs.yarch.TupleDefinition;
import org.yamcs.yarch.YarchDatabase;
import org.yamcs.yarch.YarchDatabaseInstance;
import com.google.protobuf.TextFormat;
import io.netty.buffer.ByteBuf;
import io.netty.buffer.ByteBufUtil;
import io.netty.buffer.Unpooled;
import io.netty.channel.ChannelHandler;
import io.netty.channel.ChannelHandlerContext;
import io.netty.channel.ChannelInboundHandlerAdapter;
import io.netty.handler.ssl.SslContext;
import io.netty.handler.ssl.SslContextBuilder;
import io.netty.util.concurrent.ScheduledFuture;
/**
 * Yamcs service implementing the receiving side of stream replication: it decodes the messages sent by a
 * master and routes data messages to per-stream writers.
 * <p>
 * With {@code tcpRole} CLIENT the service is configured with the master host and port; otherwise it looks up
 * the global {@link ReplicationServer}. The id of the last received transaction is kept in {@code lastTxId},
 * which is loaded from a text file during {@code init}.
 */
public class ReplicationSlave extends AbstractYamcsService {
// CLIENT: this service is configured with the master address; otherwise the ReplicationServer is looked up
private TcpRole tcpRole;
// address of the master, only read from the configuration when tcpRole is CLIENT
int port;
String host;
ReplicationClient tcpClient;
// delay between reconnection attempts, in milliseconds
long reconnectionInterval;
// name of the instance requested from the master
String masterInstance;
// id of the last transaction received; -1 if the last-tx file was empty at init
long lastTxId;
// handler of the currently open connection when a ReplicationServer accepts it on our behalf, null if none
SlaveChannelHandler slaveChannelHandler;
// remote (master) stream name -> local stream name
// (typed as String -> String: the values are assigned to String variables when stream info is received)
Map<String, String> streamNames = new HashMap<>();
// file used to persist lastTxId, and its path (the path is only used in log/error messages)
RandomAccessFile lastTxFile;
Path txtfilePath;
// id of the local instance; data messages carrying this instance id are skipped
int localInstanceId;
// TLS context, stays null unless enableTls is set in CLIENT role
SslContext sslCtx = null;
// maximum size of a serialized tuple, passed on to the ReplicationClient
int maxTupleSize;
// a connection on which no message has been received for this long is considered timed out
long timeoutMillis;
// non-null only when updateSimTime is enabled and the instance uses a SimulationTimeService
SimulationTimeService simTimeService = null;
/**
 * Reads the service configuration and loads the id of the last received transaction from the last-tx file.
 *
 * @param yamcsInstance
 *            name of the local Yamcs instance
 * @param serviceName
 *            name of this service; used to build the default name of the last-tx file
 * @param config
 *            service configuration (see {@code getSpec()} for the options)
 * @throws InitException
 *             if TLS cannot be initialized, or the last-tx file cannot be read or does not start with a number
 */
// NOTE(review): the closing braces of this method are not present in this listing; the comments below describe
// the statements in the order they appear.
public void init(String yamcsInstance, String serviceName, YConfiguration config) throws InitException {
super.init(yamcsInstance, serviceName, config);
YamcsServerInstance ysi =YamcsServer.getServer().getInstance(yamcsInstance);
// remember our own instance id; doChannelRead compares it with the instance id of incoming data
this.localInstanceId = ysi.getInstanceId();
boolean updateSimTime = config.getBoolean("updateSimTime");
if (updateSimTime) {
// the simulation time can only be updated if the instance time service is a SimulationTimeService
TimeService srv = ysi.getTimeService();
if (srv instanceof SimulationTimeService) {
simTimeService = (SimulationTimeService) srv;
} else {
throw new ConfigurationException(
"Cannot use updateSimTime unless the simulated time service is configured");
// each entry of "streams" is either "name" (same name on both sides) or "remoteName -> localName"
List streams = config.getList("streams");
for (String s : streams) {
// split around "->", ignoring whitespace on either side of the arrow
String[] a = s.split("\\s*\\-\\>\\s*");
if(a.length == 1) {
streamNames.put(a[0], a[0]);
} else if (a.length == 2) {
streamNames.put(a[0], a[1]);
} else {
throw new ConfigurationException("Invalid stream spec '" + s + "'");
// CLIENT is the default role
tcpRole = config.getEnum("tcpRole", TcpRole.class, TcpRole.CLIENT);
if (tcpRole == TcpRole.CLIENT) {
host = config.getString("masterHost");
port = config.getInt("masterPort");
// configured in seconds (default 30), stored in milliseconds
reconnectionInterval = 1000 * config.getLong("reconnectionIntervalSec", 30);
boolean enableTls = config.getBoolean("enableTls", false);
if (enableTls) {
try {
sslCtx = SslContextBuilder.forClient().build();
} catch (SSLException e) {
throw new InitException("Failed to initialize the TLS: " + e.toString());
} else {
// not in CLIENT role: the global ReplicationServer is required (getReplicationServer throws if absent)
ReplicationServer server = getReplicationServer();
masterInstance = config.getString("masterInstance", yamcsInstance);// by default we ask the master for the instance with the same name as the local one
// the last-tx file lives in <dataDir>/<yamcsInstance>/replication/
String dataDir = YarchDatabase.getDataDir();
Path replicationDir = Paths.get(dataDir).resolve(yamcsInstance).resolve("replication");
String lastTxFilename = config.getString("lastTxFile", serviceName + "-lastid.txt");
this.maxTupleSize = config.getInt("maxTupleSize");
// configured in (fractional) seconds, stored in milliseconds
this.timeoutMillis = (long) (config.getDouble("timeoutSec") * 1000);
txtfilePath = replicationDir.resolve(lastTxFilename);
try {
lastTxFile = new RandomAccessFile(txtfilePath.toFile(), "rw");
// the first line of the file holds the id of the last transaction received
String line = lastTxFile.readLine();
if (line != null) {
lastTxId = Long.parseLong(line);
} else {
// empty file: nothing received yet; sendRequest() then sends no start transaction id
lastTxId = -1;
} catch (IOException e) {
throw new InitException(e);
} catch (NumberFormatException e) {
throw new InitException("Cannot parse number from " + txtfilePath + ": " + e);
/**
 * Describes the configuration options accepted by this service.
 *
 * @return the specification of the options read in {@code init}
 */
public Spec getSpec() {
Spec spec = new Spec();
// the only option marked as required: list of "name" or "remoteName -> localName" strings
spec.addOption("streams", OptionType.LIST).withElementType(OptionType.STRING).withRequired(true);
spec.addOption("tcpRole", OptionType.STRING);
spec.addOption("masterHost", OptionType.STRING);
spec.addOption("masterPort", OptionType.INTEGER);
spec.addOption("reconnectionIntervalSec", OptionType.INTEGER);
spec.addOption("enableTls", OptionType.BOOLEAN);
spec.addOption("masterInstance", OptionType.STRING);
spec.addOption("lastTxFile", OptionType.STRING);
spec.addOption("maxTupleSize", OptionType.INTEGER).withDefault(131072)
.withDescription("Maximum size of the serialized tuple");
// NOTE(review): the call chain of this option is incomplete in this listing (the description string is
// not attached to a method call and the statement is not terminated). init() reads "timeoutSec" without a
// fallback, so a default is presumably declared here - confirm the value.
spec.addOption("timeoutSec", OptionType.FLOAT)
"Timeout in seconds. If no message is received in this time, the connection will be closed")
spec.addOption("updateSimTime", OptionType.BOOLEAN).withDefault(false)
.withDescription("If true, update the simulation time with the time received from the master");
return spec;
/**
 * Starts the service. In CLIENT role a {@link ReplicationClient} is created for the configured master; it is
 * given a factory producing a new {@link SlaveChannelHandler} bound to this slave.
 */
// NOTE(review): no statement starting the client or reporting the service as started is visible here.
protected void doStart() {
if (tcpRole == TcpRole.CLIENT) {
tcpClient = new ReplicationClient(yamcsInstance, host, port, sslCtx, reconnectionInterval, maxTupleSize,
() -> new SlaveChannelHandler(this));
/**
 * Stops the service.
 */
// NOTE(review): the body of this method is not visible in this listing.
protected void doStop() {
/**
 * Logs the given error and moves the service to the failed state.
 *
 * @param errMsg
 *            description of the failure; it is logged and used as the message of the failure cause
 */
private void failService(String errMsg) {
log.warn("Replication failed: {}", errMsg);
log.warn("Shutting down the service");
notifyFailed(new Exception(errMsg));
/**
 * Releases the resources held by the slave: the TCP client, the current channel handler and the last-tx file.
 */
// NOTE(review): several statements of this method (the bodies of the first "if" and of both "try" blocks)
// are not visible in this listing, so only the visible structure is documented.
private void shutdown() {
log.debug("Shutting down the replication slave");
if (tcpClient != null) {
if (tcpRole == TcpRole.SERVER) {
try {
} catch (InitException e) {
// shouldn't happen since we are already started
throw new RuntimeException(e);
if (slaveChannelHandler != null) {
// forget the handler so that newChannelHandler() accepts a new connection
slaveChannelHandler = null;
try {
} catch (IOException e) {
log.error("Failed to close the last TX id file");
private void updateLastTxFile() {
try {
lastTxFile.writeBytes(Long.toString(lastTxId) + "\n");
} catch (IOException e) {
log.warn("Failed to update the last tx file " + txtfilePath, e);
/**
 * Looks up the global {@link ReplicationServer} service.
 *
 * @return the first ReplicationServer found among the global services
 * @throws InitException
 *             if no ReplicationServer global service is defined
 */
private ReplicationServer getReplicationServer() throws InitException {
List servers = YamcsServer.getServer().getGlobalServices(ReplicationServer.class);
if (servers.isEmpty()) {
throw new InitException(
"ReplicationSlave is defined with the role Server; that requires the ReplicationServer global service (yamcs.yaml) to be defined");
} else if (servers.size() > 1) {
// more than one server is tolerated; the first one is used
// NOTE(review): the argument list of this log call is cut off in this listing
log.warn("There are {} ReplicationServer services defined. Registering to the first one.",
return servers.get(0);
/**
 * Renders the configured stream mappings as strings: {@code name} when the remote and the local stream have
 * the same name, {@code remote->local} otherwise.
 *
 * @return one string per configured stream mapping
 */
// NOTE(review): the end of the stream pipeline (closing of the lambda and the terminal operation) is not
// visible in this listing.
public List getStreamNames() {
return streamNames.entrySet().stream().map(e -> {
if (e.getKey().equals(e.getValue())) {
return e.getKey();
} else {
return e.getKey() + "->" + e.getValue();
/**
 * @return true if this slave is configured with the CLIENT TCP role
 */
public boolean isTcpClient() {
return tcpRole == TcpRole.CLIENT;
/**
 * @return the TCP client created in {@code doStart}; null if it has not been created (it is only created in
 *         CLIENT role)
 */
public ReplicationClient getTcpClient() {
return tcpClient;
/**
 * @return the configured {@code masterHost}; only read from the configuration in CLIENT role
 */
public String getMasterHost() {
return host;
/**
 * @return the configured {@code masterPort}; only read from the configuration in CLIENT role
 */
public int getMasterPort() {
return port;
/**
 * @return the name of the instance requested from the master
 */
public String getMasterInstance() {
return masterInstance;
/**
 * @return the id of the last transaction received, or -1 if the last-tx file was empty at init and nothing
 *         has been recorded since
 */
public long getTxId() {
return lastTxId;
/**
 * Called when the tcpRole = Server and a new client connects to {@link ReplicationServer}
 *
 * @throws YamcsException
 *             if there is already a connection open to this slave
 */
// Creates the handler for a new connection and remembers it in slaveChannelHandler. Only one connection at
// a time is accepted: the field is reset to null when the connection becomes inactive.
public ChannelHandler newChannelHandler() throws YamcsException {
if (slaveChannelHandler != null) {
throw new YamcsException("There is already a connection open to this slave");
slaveChannelHandler = new SlaveChannelHandler(this);
return slaveChannelHandler;
/**
 * Applies a time message received from the master to the simulation time service. Does nothing unless
 * {@code simTimeService} was set in {@code init} (option updateSimTime).
 *
 * @param timeMsg
 *            time message carrying the local time and the mission time, and optionally a speed
 */
private void processTimeMessage(TimeMessage timeMsg) {
if (simTimeService != null) {
simTimeService.setSimElapsedTime(timeMsg.getLocalTime(), timeMsg.getMissionTime());
// NOTE(review): the handling of the optional speed is not visible in this listing
if (timeMsg.hasSpeed()) {
/**
 * Netty inbound handler processing the messages received on the replication connection.
 * <p>
 * It is a non-static inner class and works directly on the state of the enclosing {@link ReplicationSlave}
 * (for example {@code lastTxId} and {@code streamNames}).
 */
public class SlaveChannelHandler extends ChannelInboundHandlerAdapter {
ReplicationSlave replSlave;
private ChannelHandlerContext channelHandlerContext;
Map streamWriters = new HashMap<>();
long lastMsgReceivedTime;
private ScheduledFuture> timeoutFuture;
/**
 * Creates a handler for the given slave. The time of the last received message starts at the creation time.
 *
 * @param slave
 *            the slave this handler belongs to
 */
public SlaveChannelHandler(ReplicationSlave slave) {
this.replSlave = slave;
this.lastMsgReceivedTime = System.currentTimeMillis();
/**
 * Entry point for inbound data: casts the received object to a {@link ByteBuf} and delegates to
 * {@code doChannelRead}.
 */
// NOTE(review): the body of the "finally" block is not visible in this listing.
public void channelRead(ChannelHandlerContext ctx, Object o) {
ByteBuf nettybuf = (ByteBuf) o;
try {
doChannelRead(ctx, nettybuf);
} finally {
/**
 * Decodes one message and dispatches it on its type: DATA is handed to the writer of the stream it belongs
 * to, STREAM_INFO registers a writer for a configured stream, RESPONSE is checked for an error result, TIME
 * carries a time message; any other type fails the service.
 *
 * @param ctx
 *            context of the channel the message was received on
 * @param nettybuf
 *            buffer holding exactly one encoded message (presumably framed upstream in the pipeline - TODO
 *            confirm)
 */
// NOTE(review): closing braces and several statements (e.g. what happens when the service is not RUNNING or
// after a decoding error) are not visible in this listing; the comments describe the visible statements only.
private void doChannelRead(ChannelHandlerContext ctx, ByteBuf nettybuf) {
ByteBuffer buf = nettybuf.nioBuffer();
if (state() != State.RUNNING) {
Message msg;
try {
msg = Message.decode(buf);
} catch (DecodingException e) {
// the offending bytes are logged as a hex dump
log.warn("TX{} Failed to decode message {}; closing connection", lastTxId,
ByteBufUtil.hexDump(nettybuf), e);
// any successfully decoded message counts as activity for checkTimeout()
lastMsgReceivedTime = System.currentTimeMillis();
if (msg.type == Message.DATA) {
TransactionMessage tmsg = (TransactionMessage) msg;
if (tmsg.txId <= lastTxId) {
// transaction ids are expected to increase
log.warn("Received data from the past txId={}, lastTxId={}", tmsg.txId, lastTxId);
} else {
// the payload starts with the id of the stream the data belongs to
int streamId = tmsg.buf.getInt();
if (tmsg.instanceId == localInstanceId) {
log.trace("Skipping data originating from myself (serverId: {})", tmsg.instanceId);
ByteBufToStream bbs = streamWriters.get(streamId);
if (bbs == null) {
// no stream info was registered for this id
log.trace("Skipping data for unknown stream {}", streamId);
if (log.isTraceEnabled()) {
log.trace("TX{} received data for stream {}, length {}", tmsg.txId, bbs.stream.getName(),
bbs.processData(tmsg.txId, tmsg.buf);
} else if (msg.type == Message.STREAM_INFO) {
TransactionMessage tmsg = (TransactionMessage) msg;
if (tmsg.txId > lastTxId) { // we expect to receive previous stream info transactions
StreamInfo streamInfo = (StreamInfo) msg.protoMsg;
// both the name and the id are needed to register a writer
if (!streamInfo.hasName() || !streamInfo.hasId()) {
failService("TX" + tmsg.txId + ": received invalid stream info: " + streamInfo);
log.debug("TX{}: received stream info {}", tmsg.txId, TextFormat.shortDebugString(streamInfo));
String remoteStreamName = streamInfo.getName();
if (!streamNames.containsKey(remoteStreamName)) {
log.debug("TX{}: Ignoring stream {} because it is not in the list configured", tmsg.txId,
// translate the master stream name to the local one and look the stream up in the local database
String localStreamName = streamNames.get(remoteStreamName);
YarchDatabaseInstance ydb = YarchDatabase.getInstance(yamcsInstance);
Stream stream = ydb.getStream(localStreamName);
if (stream == null) {
log.warn("TX{}: Received data for stream {} which does not exist", tmsg.txId, localStreamName);
// subsequent DATA messages carrying this stream id are handled by the new writer
streamWriters.put(streamInfo.getId(), new ByteBufToStream(stream, streamInfo));
} else if (msg.type == Message.RESPONSE) {// this is sent by a master when we are slave.
Response resp = (Response) msg.protoMsg;
// a non-zero result is treated as an error
if (resp.getResult() != 0) {
failService("Received negative response: " + resp.getErrorMsg());
} else {
log.info("Received response {}", resp);
} else if (msg.type == Message.TIME) {
TimeMessage timeMsg = (TimeMessage) msg.protoMsg;
} else {
failService("Unexpected message type " + msg.type + " received from the master");
/**
 * Warns if the given transaction does not directly follow the last one received, and records its id as the
 * new {@code lastTxId}.
 *
 * @param tmsg
 *            the transaction message just received
 */
// NOTE(review): the closing brace of the "if" is not visible, so it cannot be told from this listing whether
// lastTxId is updated for every message or only when a gap is detected - confirm.
private void checkMissing(TransactionMessage tmsg) {
if (tmsg.txId != lastTxId + 1) {
log.warn("Transactions {} to {} are missing", lastTxId + 1, tmsg.txId - 1);
lastTxId = tmsg.txId;
// called when tcpRole=Client and the connection is open
// NOTE(review): the body of this method is not visible in this listing.
public void channelActive(ChannelHandlerContext ctx) throws Exception {
// called when tcpRole=Server and this handler is added to the pipeline by the ReplicationServer
// NOTE(review): braces and possibly statements are missing from this listing, so the relation between the
// role check and the assignment of the channel context below cannot be established here.
public void handlerAdded(ChannelHandlerContext ctx) throws Exception {
if (tcpRole == TcpRole.CLIENT) {
this.channelHandlerContext = ctx;
/**
 * Builds the replication request for the master instance and schedules the periodic timeout check on the
 * channel executor.
 */
// NOTE(review): the statement that writes the encoded request to the channel is not visible in this listing.
private void sendRequest() {
Request.Builder reqb = Request.newBuilder().setRequestSeq(1).setYamcsInstance(masterInstance);
// resume right after the last transaction received; no start id is set when nothing was received yet (-1)
if (lastTxId >= 0) {
reqb.setStartTxId(lastTxId + 1);
Request req = reqb.build();
log.debug("Connection {} opened, sending request {}", channelHandlerContext.channel().remoteAddress(),
ByteBuf buf = Unpooled.wrappedBuffer(Message.get(req).encode());
// run checkTimeout() every timeoutMillis, the first time after timeoutMillis
timeoutFuture = channelHandlerContext.executor().scheduleAtFixedRate(this::checkTimeout, timeoutMillis,
timeoutMillis, TimeUnit.MILLISECONDS);
/**
 * Periodic check scheduled by {@code sendRequest}: logs a warning when no message has been received for more
 * than {@code timeoutMillis}.
 */
// NOTE(review): the log message announces that the connection is closed, but the closing statement itself is
// not visible in this listing.
void checkTimeout() {
long now = System.currentTimeMillis();
if (now - lastMsgReceivedTime > timeoutMillis) {
// the elapsed time is reported in whole seconds
log.warn("No message received in the last {} seconds. Closing the connection",
(now - lastMsgReceivedTime) / 1000);
/**
 * Handles the scheduled timeout check, if one has been scheduled.
 */
// NOTE(review): the action performed on the future (presumably its cancellation, going by the method name)
// is not visible in this listing.
void cancelTimeoutFuture() {
// work on a local copy of the field
ScheduledFuture> sf = timeoutFuture;
if (sf != null) {
/**
 * Remembers the channel context so that it can later be used to send the request and to schedule the timeout
 * check.
 */
public void channelRegistered(ChannelHandlerContext ctx) throws Exception {
this.channelHandlerContext = ctx;
/**
 * Shuts down this handler.
 */
// NOTE(review): the body of this method is not visible in this listing.
public void shutdown() {
/**
 * Logs an exception raised in the channel pipeline at warning level.
 */
public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) {
log.warn("Caught exception", cause);
/**
 * Called when the connection is closed: clears the slave's reference to the current handler so that
 * {@code newChannelHandler()} accepts a new connection.
 */
public void channelInactive(ChannelHandlerContext ctx) throws Exception {
log.debug("Connection {} closed", ctx.channel().remoteAddress());
slaveChannelHandler = null;
/**
 * Writer associated with one replicated stream: it is built from the stream info received from the master and
 * receives the payload of the data messages for that stream through {@code processData}.
 */
class ByteBufToStream {
TupleDefinition completeTuple;
ColumnSerializer>[] serializers;
Stream stream;
/**
 * Creates the writer for the given local stream, preparing one column serializer per column listed in the
 * stream info.
 *
 * @param stream
 *            the local stream
 * @param streamInfo
 *            description of the stream columns (id, name and type) as sent by the master
 */
// NOTE(review): closing braces are missing from this listing, and no statement adding the columns to
// completeTuple is visible.
public ByteBufToStream(Stream stream, StreamInfo streamInfo) {
this.stream = stream;
completeTuple = new TupleDefinition();
serializers = new ColumnSerializer>[streamInfo.getColumnsCount()];
for (int i = 0; i < serializers.length; i++) {
ColumnInfo cinfo = streamInfo.getColumns(i);
// the column ids are expected to match their position in the list; a mismatch is only logged
if (cinfo.getId() != i) {
log.warn("Corrupted metadata? c[{}].getId = {} (should be {})", i, cinfo.getId(), i);
// rebuild the column definition from the name and type name sent by the master
String cname = cinfo.getName();
String ctype = cinfo.getType();
DataType type = DataType.byName(ctype);
ColumnDefinition cd = new ColumnDefinition(cname, type);
serializers[i] = ColumnSerializerFactory.getColumnSerializerForReplication(cd);
public void processData(long txId, ByteBuffer niobuf) {
TupleDefinition tdef = new TupleDefinition();
