package se.wollan.tolr
import kotlinx.coroutines.CancellationException
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.channels.SendChannel
import kotlinx.coroutines.delay
import kotlinx.coroutines.launch
import org.slf4j.Logger
import se.wollan.datascope.DataScope
import se.wollan.time.HLCTimestamp
import java.security.SecureRandom
import java.util.*
import kotlin.random.asKotlinRandom
import kotlin.time.Duration
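/**
 * Keeps the local log in sync with all configured remote nodes: runs the outgoing
 * replication loops and answers incoming replication batches from other nodes.
 */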
internal interface LogReplicator {
suspend fun startReplicating()
suspend fun triggerReplication()
suspend fun triggerReplicationFor(remote: RemoteHostname): Boolean
suspend fun handleIncomingBatch(batch: ReplicationBatch): ReplicationBatch
}
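/**
 * How a failed replication round is handled:
 * [IMMEDIATE] retries right away (a second failure falls through to [SCHEDULED]),
 * [SCHEDULED] triggers one more attempt after the configured retryDelayOnFailure plus jitter,
 * [NO_RETRY] gives up and waits for the next heartbeat trigger.
 */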
private enum class RetryPolicy { IMMEDIATE, SCHEDULED, NO_RETRY }
private val DEFAULT_RETRY_POLICY = RetryPolicy.IMMEDIATE
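/**
 * A trigger posted to a per-remote channel. [responseChannel] is null for fire-and-forget
 * triggers; when present, the replication loop signals completion on it (or closes it with
 * the failure) once the round for that remote has finished.
 */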
private data class TriggerRequest(
    val responseChannel: SendChannel<Unit>?,
val retryPolicy: RetryPolicy = DEFAULT_RETRY_POLICY
) {
companion object {
val noResponse = TriggerRequest(responseChannel = null)
}
}
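/**
 * Replicates via one CONFLATED trigger channel per configured remote: a dedicated coroutine
 * per remote consumes triggers and runs one replication round at a time, and a heartbeat loop
 * re-triggers every remote at a jittered interval. Failures escalate through [RetryPolicy]:
 * an immediate retry, then a scheduled one, then waiting for the next heartbeat.
 */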
internal class LogReplicatorImpl(
private val coroutineScope: CoroutineScope,
private val recordRepo: LogRecordRepo,
private val serverServerAPI: ServerServerAPI,
private val logger: Logger,
private val dataScope: DataScope,
private val configurationProvider: ConfigurationProvider,
) : LogReplicator {
    // never read this directly - always go through the [triggers] function
@Volatile
    private var _triggers: Map<RemoteHostname, Channel<TriggerRequest>>? = null
private val random = SecureRandom().asKotlinRandom()
override suspend fun triggerReplication() {
for (requestChannel in triggers().values.shuffled())
requestChannel.trySend(TriggerRequest.noResponse)
}
override suspend fun triggerReplicationFor(remote: RemoteHostname): Boolean =
triggerReplicationFor(remote, withPolicy = DEFAULT_RETRY_POLICY)
private suspend fun triggerReplicationFor(remote: RemoteHostname, withPolicy: RetryPolicy): Boolean {
val channel = triggers()[remote] ?: return false
channel.trySend(TriggerRequest(null, withPolicy))
return true
}
override suspend fun startReplicating() {
startReplicationLoops()
startHeartbeatReplicationTrigger()
}
private suspend fun startReplicationLoops() {
for ((remoteHostname, channel) in triggers()) {
coroutineScope.launch {
for ((responseChannel, retryPolicy) in channel) {
try {
startReplicationWithRetry(remoteHostname, retryPolicy)
responseChannel?.trySend(Unit)
} catch (e: Exception) {
responseChannel?.close(e)
}
}
}
}
}
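    /**
     * Awaits one best-effort replication round per remote at startup, then keeps triggering all
     * remotes at the configured heartbeat interval. The ±10% jitter from [nextJitterFactor]
     * spreads the rounds out so nodes don't fall into lock-step with each other.
     */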
private fun startHeartbeatReplicationTrigger() = coroutineScope.launch {
awaitSingleReplicationForEachNodeBestEffort()
while (true) {
val interval = heartbeatTriggerInterval() * nextJitterFactor()
delay(interval)
logger.debug("heartbeat trigger after $interval interval")
triggerReplication()
}
}
private suspend fun awaitSingleReplicationForEachNodeBestEffort() {
        for (requestChannel in triggers().values.shuffled(random)) {
            val responseChannel = Channel<Unit>(Channel.RENDEZVOUS)
requestChannel.send(TriggerRequest(responseChannel))
responseChannel.receive()
}
}
private suspend fun startReplicationWithRetry(remote: RemoteHostname, retryPolicy: RetryPolicy) {
logger.info("start replication with $remote")
try {
startReplicationWith(remote)
logger.info("successfully completed replication with $remote")
} catch (_: CancellationException) {
logger.warn("replication with $remote was cancelled, no retry")
} catch (e: Exception) {
when (retryPolicy) {
RetryPolicy.IMMEDIATE -> {
// let's do one fast retry, because let's face it - most likely it's a shaky network and will work next time.. right?
logger.warn("replication with $remote failed due to '${e.message}', retrying at once", e)
startReplicationWithRetry(remote, RetryPolicy.SCHEDULED)
}
RetryPolicy.SCHEDULED -> {
                    // ..okay I was wrong, fine, let's schedule a retry then
val d = retryDelayOnFailure() * nextJitterFactor()
logger.warn("replication with $remote failed due to '${e.message}', will retry in $d", e)
coroutineScope.launch {
delay(d)
triggerReplicationFor(remote, withPolicy = RetryPolicy.NO_RETRY)
}
}
RetryPolicy.NO_RETRY -> {
logger.warn("replication with $remote failed due to '${e.message}', awaiting next heartbeat.", e)
}
}
}
}
private suspend fun startReplicationWith(remote: RemoteHostname) {
val lastLocalTimestamps = recordRepo.listLatestPerNodeTimestamps()
val (initialRespondBatch, target) =
serverServerAPI.replicateLogInitial(remote, ReplicationBatch(lastLocalTimestamps, emptyList()))
processRespondReplicationBatch(remote, initialRespondBatch, target)
}
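    /**
     * One round-trip of the replication exchange: store the records the remote sent, diff the
     * per-node timestamp vectors to see what either side is still missing, and if anything is,
     * send the remote the records it lacks and recurse on its response. The recursion ends once
     * both sides report the same latest timestamps.
     */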
private suspend fun processRespondReplicationBatch(
remote: RemoteHostname,
respondBatch: ReplicationBatch,
target: RemoteHostname,
) {
val (latestLocalTimestamps, missingRemoteRecords) = dataScope.write {
recordRepo.insertIfMissing(respondBatch.records)
val latestLocalTimestamps = recordRepo.listLatestPerNodeTimestamps()
val hasLocallyMissingRecords = anyLocallyMissingRecords(
latestLocally = latestLocalTimestamps,
latestRemotely = respondBatch.latestTimestamps
)
val latestOfMissingRemoteTSs = getLatestRemoteTimestampsOfRemotelyMissingRecords(
latestLocally = latestLocalTimestamps,
latestRemotely = respondBatch.latestTimestamps
)
if (!hasLocallyMissingRecords && latestOfMissingRemoteTSs.isEmpty())
return@write null // replication completed successfully
val missingRemoteRecords = recordRepo.listLaterThanNodeTimestamps(latestOfMissingRemoteTSs)
latestLocalTimestamps to missingRemoteRecords
} ?: return
val (nextRespondBatch, nextTarget) = serverServerAPI.replicateLog(
remote = remote,
batch = ReplicationBatch(latestLocalTimestamps, missingRemoteRecords),
target = target
)
check(target == nextTarget) { "target cannot change mid-replication! (from $target to $nextTarget)" }
processRespondReplicationBatch(remote, nextRespondBatch, target)
}
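    /**
     * Server side of the exchange: store the incoming records, respond with the records the caller
     * is missing, and if anything new was inserted locally, schedule a post-commit re-trigger so
     * the new data also propagates to the other remotes.
     */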
@Suppress("PARAMETER_NAME_CHANGED_ON_OVERRIDE")
override suspend fun handleIncomingBatch(incomingBatch: ReplicationBatch): ReplicationBatch = dataScope.write {
val locallyInserted = recordRepo.insertIfMissing(incomingBatch.records)
val latestLocalTimestamps = recordRepo.listLatestPerNodeTimestamps()
val latestOfMissingRemoteTSs = getLatestRemoteTimestampsOfRemotelyMissingRecords(
latestLocally = latestLocalTimestamps, latestRemotely = incomingBatch.latestTimestamps
)
val missingRemoteRecords = recordRepo.listLaterThanNodeTimestamps(latestOfMissingRemoteTSs)
// TODO: optimization: only trigger replication for public client replications
if (locallyInserted > 0)
dataScope.addPostCommitHook(::triggerReplication)
ReplicationBatch(latestLocalTimestamps, missingRemoteRecords)
}
private fun nextJitterFactor(): Double = synchronized(random) {
random.nextDouble(from = 0.9, until = 1.1)
}
private suspend fun heartbeatTriggerInterval(): Duration =
configurationProvider.getConfiguration().heartbeatTriggerInterval
private suspend fun retryDelayOnFailure(): Duration =
configurationProvider.getConfiguration().retryDelayOnFailure
    /** lazy init doesn't need to be thread-safe since the first call happens during initialization of TOLR */
    private suspend fun triggers(): Map<RemoteHostname, Channel<TriggerRequest>> {
_triggers?.let { return it }
val triggers = configurationProvider.getConfiguration().remoteHostnamesSafe
            .associateWith<RemoteHostname, Channel<TriggerRequest>> { Channel(Channel.CONFLATED) }
_triggers = triggers
return triggers
}
}
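/**
 * True if the remote reports a per-node timestamp that is newer than, or unknown to, the local
 * log, i.e. the remote holds records this node is still missing.
 */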
internal fun <NodeKey> anyLocallyMissingRecords(
    latestLocally: Map<NodeKey, HLCTimestamp>,
    latestRemotely: Map<NodeKey, HLCTimestamp>,
): Boolean = latestRemotely.any { remoteTS ->
val localTS = latestLocally[remoteTS.key] ?: return@any true
remoteTS.value > localTS
}
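/**
 * For each node where the local log is ahead of the remote, returns the remote's latest known
 * timestamp for that node (or [HLCTimestamp.initial] if the remote has none); everything later
 * than these timestamps is what the remote is missing and should be sent to it.
 */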
internal fun <NodeKey> getLatestRemoteTimestampsOfRemotelyMissingRecords(
    latestLocally: Map<NodeKey, HLCTimestamp>,
    latestRemotely: Map<NodeKey, HLCTimestamp>,
): Map<NodeKey, HLCTimestamp> {
    // first extract every local ts that is later than its remote equivalent (or has no remote equivalent at all)
val laterLocalTimestamps = latestLocally.filter { localTS ->
val remoteTS = latestRemotely[localTS.key] ?: return@filter true
localTS.value > remoteTS
}
    // then map each of those nodes to the remote's latest ts (or the initial ts), so the newer records can be read from the local db and sent over
return laterLocalTimestamps.mapValues { l ->
latestRemotely[l.key] ?: HLCTimestamp.initial
}
}
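// Illustrative sketch, not part of the replicator itself: demonstrates why the per-remote
// trigger channels above are CONFLATED. Bursts of trySend calls made while a replication
// round is already running collapse into a single pending trigger instead of queueing up,
// so triggerReplication can be called after every commit without flooding the loops.
// The names below (conflatedTriggerSketch, the println stand-ins) are made up for the sketch.
@Suppress("unused")
private fun CoroutineScope.conflatedTriggerSketch() = launch {
    val trigger = Channel<Unit>(Channel.CONFLATED)
    val worker = launch {
        for (t in trigger) {
            println("replication round started") // runs once per coalesced burst, not once per trySend
            delay(100)                           // stand-in for an actual replication round
        }
    }
    repeat(10) { trigger.trySend(Unit) } // ten rapid triggers...
    delay(300)                           // ...yield a single round in the worker above
    trigger.close()
    worker.join()
}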