Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.cluster.coordination;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionListenerResponseHandler;
import org.elasticsearch.action.support.ChannelActionListener;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStatePublicationEvent;
import org.elasticsearch.cluster.Diff;
import org.elasticsearch.cluster.IncompatibleClusterStateVersionException;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.bytes.ReleasableBytesReference;
import org.elasticsearch.common.compress.Compressor;
import org.elasticsearch.common.compress.CompressorFactory;
import org.elasticsearch.common.io.Streams;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.InputStreamStreamInput;
import org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.common.io.stream.PositionTrackingOutputStreamStreamOutput;
import org.elasticsearch.common.io.stream.ReleasableBytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.LazyInitializable;
import org.elasticsearch.core.AbstractRefCounted;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.internal.io.IOUtils;
import org.elasticsearch.discovery.zen.PublishClusterStateAction;
import org.elasticsearch.discovery.zen.PublishClusterStateStats;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.BytesTransportRequest;
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportResponse;
import org.elasticsearch.transport.TransportService;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiConsumer;
import java.util.function.Function;
public class PublicationTransportHandler {
private static final Logger logger = LogManager.getLogger(PublicationTransportHandler.class);
public static final String PUBLISH_STATE_ACTION_NAME = "internal:cluster/coordination/publish_state";
public static final String COMMIT_STATE_ACTION_NAME = "internal:cluster/coordination/commit_state";
private final BigArrays bigArrays;
private final TransportService transportService;
private final NamedWriteableRegistry namedWriteableRegistry;
private final Function handlePublishRequest;
private final AtomicReference lastSeenClusterState = new AtomicReference<>();
// the master needs the original non-serialized state as the cluster state contains some volatile information that we
// don't want to be replicated because it's not usable on another node (e.g. UnassignedInfo.unassignedTimeNanos) or
// because it's mostly just debugging info that would unnecessarily blow up CS updates (I think there was one in
// snapshot code).
// TODO: look into these and check how to get rid of them
private final AtomicReference currentPublishRequestToSelf = new AtomicReference<>();
private final AtomicLong fullClusterStateReceivedCount = new AtomicLong();
private final AtomicLong incompatibleClusterStateDiffReceivedCount = new AtomicLong();
private final AtomicLong compatibleClusterStateDiffReceivedCount = new AtomicLong();
// -> no need to put a timeout on the options here, because we want the response to eventually be received
// and not log an error if it arrives after the timeout
private static final TransportRequestOptions STATE_REQUEST_OPTIONS = TransportRequestOptions.of(
null,
TransportRequestOptions.Type.STATE
);
private final SerializationStatsTracker serializationStatsTracker = new SerializationStatsTracker();
public PublicationTransportHandler(
BigArrays bigArrays,
TransportService transportService,
NamedWriteableRegistry namedWriteableRegistry,
Function handlePublishRequest,
BiConsumer> handleApplyCommit
) {
this.bigArrays = bigArrays;
this.transportService = transportService;
this.namedWriteableRegistry = namedWriteableRegistry;
this.handlePublishRequest = handlePublishRequest;
transportService.registerRequestHandler(
PUBLISH_STATE_ACTION_NAME,
ThreadPool.Names.GENERIC,
false,
false,
BytesTransportRequest::new,
(request, channel, task) -> channel.sendResponse(handleIncomingPublishRequest(request))
);
transportService.registerRequestHandler(
PublishClusterStateAction.SEND_ACTION_NAME,
ThreadPool.Names.GENERIC,
false,
false,
BytesTransportRequest::new,
(request, channel, task) -> {
handleIncomingPublishRequest(request);
channel.sendResponse(TransportResponse.Empty.INSTANCE);
}
);
transportService.registerRequestHandler(
COMMIT_STATE_ACTION_NAME,
ThreadPool.Names.GENERIC,
false,
false,
ApplyCommitRequest::new,
(request, channel, task) -> handleApplyCommit.accept(
request,
new ChannelActionListener<>(channel, COMMIT_STATE_ACTION_NAME, request).map(r -> TransportResponse.Empty.INSTANCE)
)
);
transportService.registerRequestHandler(
PublishClusterStateAction.COMMIT_ACTION_NAME,
ThreadPool.Names.GENERIC,
false,
false,
PublishClusterStateAction.CommitClusterStateRequest::new,
(request, channel, task) -> {
final Optional matchingClusterState = Optional.ofNullable(lastSeenClusterState.get())
.filter(cs -> cs.stateUUID().equals(request.stateUUID));
if (matchingClusterState.isPresent() == false) {
throw new IllegalStateException("can't resolve cluster state with uuid" + " [" + request.stateUUID + "] to commit");
}
final ApplyCommitRequest applyCommitRequest = new ApplyCommitRequest(
matchingClusterState.get().getNodes().getMasterNode(),
matchingClusterState.get().term(),
matchingClusterState.get().version()
);
handleApplyCommit.accept(
applyCommitRequest,
new ChannelActionListener<>(channel, COMMIT_STATE_ACTION_NAME, request).map(r -> TransportResponse.Empty.INSTANCE)
);
}
);
}
public PublishClusterStateStats stats() {
return new PublishClusterStateStats(
fullClusterStateReceivedCount.get(),
incompatibleClusterStateDiffReceivedCount.get(),
compatibleClusterStateDiffReceivedCount.get(),
serializationStatsTracker.getSerializationStats()
);
}
private PublishWithJoinResponse handleIncomingPublishRequest(BytesTransportRequest request) throws IOException {
final Compressor compressor = CompressorFactory.compressor(request.bytes());
StreamInput in = request.bytes().streamInput();
try {
if (compressor != null) {
in = new InputStreamStreamInput(compressor.threadLocalInputStream(in));
}
in = new NamedWriteableAwareStreamInput(in, namedWriteableRegistry);
in.setVersion(request.version());
// If true we received full cluster state - otherwise diffs
if (in.readBoolean()) {
final ClusterState incomingState;
// Close early to release resources used by the de-compression as early as possible
try (StreamInput input = in) {
incomingState = ClusterState.readFrom(input, transportService.getLocalNode());
} catch (Exception e) {
logger.warn("unexpected error while deserializing an incoming cluster state", e);
throw e;
}
fullClusterStateReceivedCount.incrementAndGet();
logger.debug("received full cluster state version [{}] with size [{}]", incomingState.version(), request.bytes().length());
final PublishWithJoinResponse response = acceptState(incomingState);
lastSeenClusterState.set(incomingState);
return response;
} else {
final ClusterState lastSeen = lastSeenClusterState.get();
if (lastSeen == null) {
logger.debug("received diff for but don't have any local cluster state - requesting full state");
incompatibleClusterStateDiffReceivedCount.incrementAndGet();
throw new IncompatibleClusterStateVersionException("have no local cluster state");
} else {
ClusterState incomingState;
try {
final Diff diff;
// Close stream early to release resources used by the de-compression as early as possible
try (StreamInput input = in) {
diff = ClusterState.readDiffFrom(input, lastSeen.nodes().getLocalNode());
}
incomingState = diff.apply(lastSeen); // might throw IncompatibleClusterStateVersionException
} catch (IncompatibleClusterStateVersionException e) {
incompatibleClusterStateDiffReceivedCount.incrementAndGet();
throw e;
} catch (Exception e) {
logger.warn("unexpected error while deserializing an incoming cluster state", e);
throw e;
}
compatibleClusterStateDiffReceivedCount.incrementAndGet();
logger.debug(
"received diff cluster state version [{}] with uuid [{}], diff size [{}]",
incomingState.version(),
incomingState.stateUUID(),
request.bytes().length()
);
final PublishWithJoinResponse response = acceptState(incomingState);
lastSeenClusterState.compareAndSet(lastSeen, incomingState);
return response;
}
}
} finally {
IOUtils.close(in);
}
}
private PublishWithJoinResponse acceptState(ClusterState incomingState) {
// if the state is coming from the current node, use original request instead (see currentPublishRequestToSelf for explanation)
if (transportService.getLocalNode().equals(incomingState.nodes().getMasterNode())) {
final PublishRequest publishRequest = currentPublishRequestToSelf.get();
if (publishRequest == null || publishRequest.getAcceptedState().stateUUID().equals(incomingState.stateUUID()) == false) {
throw new IllegalStateException("publication to self failed for " + publishRequest);
} else {
return handlePublishRequest.apply(publishRequest);
}
}
return handlePublishRequest.apply(new PublishRequest(incomingState));
}
public PublicationContext newPublicationContext(ClusterStatePublicationEvent clusterStatePublicationEvent) {
final PublicationContext publicationContext = new PublicationContext(clusterStatePublicationEvent);
boolean success = false;
try {
// Build the serializations we expect to need now, early in the process, so that an error during serialization fails the
// publication straight away. This isn't watertight since we send diffs on a best-effort basis and may fall back to sending a
// full state (and therefore serializing it) if the diff-based publication fails.
publicationContext.buildDiffAndSerializeStates();
success = true;
return publicationContext;
} finally {
if (success == false) {
publicationContext.decRef();
}
}
}
private ReleasableBytesReference serializeFullClusterState(ClusterState clusterState, DiscoveryNode node) {
final Version nodeVersion = node.getVersion();
final BytesStreamOutput bytesStream = new ReleasableBytesStreamOutput(bigArrays);
boolean success = false;
try {
final long uncompressedBytes;
try (
StreamOutput stream = new PositionTrackingOutputStreamStreamOutput(
CompressorFactory.COMPRESSOR.threadLocalOutputStream(Streams.flushOnCloseStream(bytesStream))
)
) {
stream.setVersion(nodeVersion);
stream.writeBoolean(true);
clusterState.writeTo(stream);
uncompressedBytes = stream.position();
} catch (IOException e) {
throw new ElasticsearchException("failed to serialize cluster state for publishing to node {}", e, node);
}
final ReleasableBytesReference result = new ReleasableBytesReference(bytesStream.bytes(), bytesStream::close);
serializationStatsTracker.serializedFullState(uncompressedBytes, result.length());
logger.trace(
"serialized full cluster state version [{}] for node version [{}] with size [{}]",
clusterState.version(),
nodeVersion,
result.length()
);
success = true;
return result;
} finally {
if (success == false) {
bytesStream.close();
}
}
}
private ReleasableBytesReference serializeDiffClusterState(long clusterStateVersion, Diff diff, DiscoveryNode node) {
final Version nodeVersion = node.getVersion();
final BytesStreamOutput bytesStream = new ReleasableBytesStreamOutput(bigArrays);
boolean success = false;
try {
final long uncompressedBytes;
try (
StreamOutput stream = new PositionTrackingOutputStreamStreamOutput(
CompressorFactory.COMPRESSOR.threadLocalOutputStream(Streams.flushOnCloseStream(bytesStream))
)
) {
stream.setVersion(nodeVersion);
stream.writeBoolean(false);
diff.writeTo(stream);
uncompressedBytes = stream.position();
} catch (IOException e) {
throw new ElasticsearchException("failed to serialize cluster state diff for publishing to node {}", e, node);
}
final ReleasableBytesReference result = new ReleasableBytesReference(bytesStream.bytes(), bytesStream::close);
serializationStatsTracker.serializedDiff(uncompressedBytes, result.length());
logger.trace(
"serialized cluster state diff for version [{}] for node version [{}] with size [{}]",
clusterStateVersion,
nodeVersion,
result.length()
);
success = true;
return result;
} finally {
if (success == false) {
bytesStream.close();
}
}
}
/**
* Publishing a cluster state typically involves sending the same cluster state (or diff) to every node, so the work of diffing,
* serializing, and compressing the state can be done once and the results shared across publish requests. The
* {@code PublicationContext} implements this sharing. It's ref-counted: the initial reference is released by the coordinator when
* a state (or diff) has been sent to every node, every transmitted diff also holds a reference in case it needs to retry with a full
* state.
*/
public class PublicationContext extends AbstractRefCounted {
private final DiscoveryNodes discoveryNodes;
private final ClusterState newState;
private final ClusterState previousState;
private final boolean sendFullVersion;
// All the values of these maps have one ref for the context (while it's open) and one for each in-flight message.
private final Map serializedStates = new ConcurrentHashMap<>();
private final Map serializedDiffs = new HashMap<>();
PublicationContext(ClusterStatePublicationEvent clusterStatePublicationEvent) {
discoveryNodes = clusterStatePublicationEvent.getNewState().nodes();
newState = clusterStatePublicationEvent.getNewState();
previousState = clusterStatePublicationEvent.getOldState();
sendFullVersion = previousState.getBlocks().disableStatePersistence();
}
void buildDiffAndSerializeStates() {
assert refCount() > 0;
final LazyInitializable, RuntimeException> diffSupplier = new LazyInitializable<>(
() -> newState.diff(previousState)
);
for (DiscoveryNode node : discoveryNodes) {
if (sendFullVersion || previousState.nodes().nodeExists(node) == false) {
serializedStates.computeIfAbsent(node.getVersion(), v -> serializeFullClusterState(newState, node));
} else {
serializedDiffs.computeIfAbsent(
node.getVersion(),
v -> serializeDiffClusterState(newState.version(), diffSupplier.getOrCompute(), node)
);
}
}
}
public void sendPublishRequest(
DiscoveryNode destination,
PublishRequest publishRequest,
ActionListener listener
) {
assert refCount() > 0;
assert publishRequest.getAcceptedState() == newState : "state got switched on us";
assert transportService.getThreadPool().getThreadContext().isSystemContext();
final ActionListener responseActionListener;
if (destination.equals(discoveryNodes.getLocalNode())) {
// if publishing to self, use original request instead (see currentPublishRequestToSelf for explanation)
final PublishRequest previousRequest = currentPublishRequestToSelf.getAndSet(publishRequest);
// we might override an in-flight publication to self in case where we failed as master and became master again,
// and the new publication started before the previous one completed (which fails anyhow because of higher current term)
assert previousRequest == null || previousRequest.getAcceptedState().term() < publishRequest.getAcceptedState().term();
responseActionListener = new ActionListener() {
@Override
public void onResponse(PublishWithJoinResponse publishWithJoinResponse) {
currentPublishRequestToSelf.compareAndSet(publishRequest, null); // only clean-up our mess
listener.onResponse(publishWithJoinResponse);
}
@Override
public void onFailure(Exception e) {
currentPublishRequestToSelf.compareAndSet(publishRequest, null); // only clean-up our mess
listener.onFailure(e);
}
};
} else {
responseActionListener = listener;
}
if (sendFullVersion || previousState.nodes().nodeExists(destination) == false) {
logger.trace("sending full cluster state version [{}] to [{}]", newState.version(), destination);
sendFullClusterState(destination, responseActionListener);
} else {
logger.trace("sending cluster state diff for version [{}] to [{}]", newState.version(), destination);
sendClusterStateDiff(destination, responseActionListener);
}
}
public void sendApplyCommit(
DiscoveryNode destination,
ApplyCommitRequest applyCommitRequest,
ActionListener listener
) {
assert transportService.getThreadPool().getThreadContext().isSystemContext();
if (Coordinator.isZen1Node(destination)) {
transportService.sendRequest(
destination,
PublishClusterStateAction.COMMIT_ACTION_NAME,
new PublishClusterStateAction.CommitClusterStateRequest(newState.stateUUID()),
STATE_REQUEST_OPTIONS,
new ActionListenerResponseHandler<>(listener, in -> TransportResponse.Empty.INSTANCE, ThreadPool.Names.GENERIC)
);
} else {
transportService.sendRequest(
destination,
COMMIT_STATE_ACTION_NAME,
applyCommitRequest,
STATE_REQUEST_OPTIONS,
new ActionListenerResponseHandler<>(listener, in -> TransportResponse.Empty.INSTANCE, ThreadPool.Names.GENERIC)
);
}
}
private void sendFullClusterState(DiscoveryNode destination, ActionListener listener) {
assert refCount() > 0;
ReleasableBytesReference bytes = serializedStates.get(destination.getVersion());
if (bytes == null) {
try {
bytes = serializedStates.computeIfAbsent(
destination.getVersion(),
v -> serializeFullClusterState(newState, destination)
);
} catch (Exception e) {
logger.warn(
() -> new ParameterizedMessage("failed to serialize cluster state before publishing it to node {}", destination),
e
);
listener.onFailure(e);
return;
}
}
sendClusterState(destination, bytes, listener);
}
private void sendClusterStateDiff(DiscoveryNode destination, ActionListener listener) {
final ReleasableBytesReference bytes = serializedDiffs.get(destination.getVersion());
assert bytes != null
: "failed to find serialized diff for node " + destination + " of version [" + destination.getVersion() + "]";
// acquire a ref to the context just in case we need to try again with the full cluster state
if (tryIncRef() == false) {
assert false;
listener.onFailure(new IllegalStateException("publication context released before transmission"));
return;
}
sendClusterState(destination, bytes, ActionListener.runAfter(listener.delegateResponse((delegate, e) -> {
if (e instanceof TransportException) {
final TransportException transportException = (TransportException) e;
if (transportException.unwrapCause() instanceof IncompatibleClusterStateVersionException) {
logger.debug(
() -> new ParameterizedMessage(
"resending full cluster state to node {} reason {}",
destination,
transportException.getDetailedMessage()
)
);
sendFullClusterState(destination, delegate);
return;
}
}
logger.debug(new ParameterizedMessage("failed to send cluster state to {}", destination), e);
delegate.onFailure(e);
}), this::decRef));
}
private void sendClusterState(
DiscoveryNode destination,
ReleasableBytesReference bytes,
ActionListener listener
) {
assert refCount() > 0;
if (bytes.tryIncRef() == false) {
assert false;
listener.onFailure(new IllegalStateException("serialized cluster state released before transmission"));
return;
}
try {
if (Coordinator.isZen1Node(destination)) {
transportService.sendRequest(
destination,
PublishClusterStateAction.SEND_ACTION_NAME,
new BytesTransportRequest(bytes, destination.getVersion()),
STATE_REQUEST_OPTIONS,
new ActionListenerResponseHandler<>(
ActionListener.runAfter(listener, bytes::decRef),
PublishWithJoinResponse::new,
ThreadPool.Names.GENERIC
).wrap(
empty -> new PublishWithJoinResponse(
new PublishResponse(newState.term(), newState.version()),
Optional.of(
new Join(
destination,
transportService.getLocalNode(),
newState.term(),
newState.term(),
newState.version()
)
)
),
in -> TransportResponse.Empty.INSTANCE
)
);
} else {
transportService.sendRequest(
destination,
PUBLISH_STATE_ACTION_NAME,
new BytesTransportRequest(bytes, destination.getVersion()),
STATE_REQUEST_OPTIONS,
new ActionListenerResponseHandler<>(
ActionListener.runAfter(listener, bytes::decRef),
PublishWithJoinResponse::new,
ThreadPool.Names.GENERIC
)
);
}
} catch (Exception e) {
assert false : e;
logger.warn(() -> new ParameterizedMessage("error sending cluster state to {}", destination), e);
listener.onFailure(e);
}
}
@Override
protected void closeInternal() {
serializedDiffs.values().forEach(Releasables::closeExpectNoException);
serializedStates.values().forEach(Releasables::closeExpectNoException);
}
}
private static class SerializationStatsTracker {
private long fullStateCount;
private long totalUncompressedFullStateBytes;
private long totalCompressedFullStateBytes;
private long diffCount;
private long totalUncompressedDiffBytes;
private long totalCompressedDiffBytes;
public synchronized void serializedFullState(long uncompressedBytes, int compressedBytes) {
fullStateCount += 1;
totalUncompressedFullStateBytes += uncompressedBytes;
totalCompressedFullStateBytes += compressedBytes;
}
public synchronized void serializedDiff(long uncompressedBytes, int compressedBytes) {
diffCount += 1;
totalUncompressedDiffBytes += uncompressedBytes;
totalCompressedDiffBytes += compressedBytes;
}
public synchronized ClusterStateSerializationStats getSerializationStats() {
return new ClusterStateSerializationStats(
fullStateCount,
totalUncompressedFullStateBytes,
totalCompressedFullStateBytes,
diffCount,
totalUncompressedDiffBytes,
totalCompressedDiffBytes
);
}
}
}