All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.cassandra.streaming.StreamSession Maven / Gradle / Ivy

Go to download

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

There is a newer version: 5.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.streaming;

import java.io.EOFException;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import com.google.common.annotations.VisibleForTesting;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.netty.channel.Channel;
import io.netty.channel.ChannelId;
import org.apache.cassandra.concurrent.ScheduledExecutors;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.db.lifecycle.TransactionAlreadyCompletedException;
import org.apache.cassandra.dht.OwnedRanges;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.gms.EndpointState;
import org.apache.cassandra.gms.IEndpointStateChangeSubscriber;
import org.apache.cassandra.locator.InetAddressAndPort;
import org.apache.cassandra.locator.RangesAtEndpoint;
import org.apache.cassandra.locator.Replica;
import org.apache.cassandra.metrics.StreamingMetrics;
import org.apache.cassandra.net.OutboundConnectionSettings;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.streaming.async.NettyStreamingMessageSender;
import org.apache.cassandra.streaming.messages.CompleteMessage;
import org.apache.cassandra.streaming.messages.IncomingStreamMessage;
import org.apache.cassandra.streaming.messages.OutgoingStreamMessage;
import org.apache.cassandra.streaming.messages.PrepareAckMessage;
import org.apache.cassandra.streaming.messages.PrepareSynAckMessage;
import org.apache.cassandra.streaming.messages.PrepareSynMessage;
import org.apache.cassandra.streaming.messages.ReceivedMessage;
import org.apache.cassandra.streaming.messages.SessionFailedMessage;
import org.apache.cassandra.streaming.messages.StreamInitMessage;
import org.apache.cassandra.streaming.messages.StreamMessage;
import org.apache.cassandra.streaming.messages.StreamMessageHeader;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.JVMStabilityInspector;
import org.apache.cassandra.utils.NoSpamLogger;

import static com.google.common.collect.Iterables.all;
import static org.apache.cassandra.net.MessagingService.current_version;

/**
 * Handles the streaming a one or more streams to and from a specific remote node.
 *

* Both this node and the remote one will create a similar symmetrical {@link StreamSession}. A streaming * session has the following life-cycle: *

 * 1. Session Initialization
 *
 *   (a) A node (the initiator in the following) create a new {@link StreamSession},
 *       initialize it {@link #init(StreamResultFuture)}, and then start it ({@link #start()}).
 *       Starting a session causes a {@link StreamInitMessage} to be sent.
 *   (b) Upon reception of that {@link StreamInitMessage}, the follower creates its own {@link StreamSession},
 *       and initializes it if it still does not exist.
 *   (c) After the initiator sends the {@link StreamInitMessage}, it invokes
 *       {@link StreamSession#onInitializationComplete()} to start the streaming prepare phase.
 *
 * 2. Streaming preparation phase
 *
 *   (a) A {@link PrepareSynMessage} is sent that includes a) what files/sections this node will stream to the follower
 *       (stored locally in a {@link StreamTransferTask}, one for each table) and b) what the follower needs to
 *       stream back (stored locally in a {@link StreamReceiveTask}, one for each table).
 *   (b) Upon reception of the {@link PrepareSynMessage}, the follower records which files/sections it will receive
 *       and send back a {@link PrepareSynAckMessage}, which contains a summary of the files/sections that will be sent to
 *       the initiator.
 *   (c) When the initiator receives the {@link PrepareSynAckMessage}, it records which files/sections it will
 *       receive, and then goes to it's Streaming phase (see next section). If the intiator is to receive files,
 *       it sends a {@link PrepareAckMessage} to the follower to indicate that it can start streaming to the initiator.
 *   (d) (Optional) If the follower receives a {@link PrepareAckMessage}, it enters it's Streaming phase.
 *
 * 3. Streaming phase
 *
 *   (a) The streaming phase is started at each node by calling {@link StreamSession#startStreamingFiles(boolean)}.
 *       This will send, sequentially on each outbound streaming connection (see {@link NettyStreamingMessageSender}),
 *       an {@link OutgoingStreamMessage} for each stream in each of the {@link StreamTransferTask}.
 *       Each {@link OutgoingStreamMessage} consists of a {@link StreamMessageHeader} that contains metadata about
 *       the stream, followed by the stream content itself. Once all the files for a {@link StreamTransferTask} are sent,
 *       the task is marked complete {@link StreamTransferTask#complete(int)}.
 *   (b) On the receiving side, the incoming data is written to disk, and once the stream is fully received,
 *       it will be marked as complete ({@link StreamReceiveTask#received(IncomingStream)}). When all streams
 *       for the {@link StreamReceiveTask} have been received, the data is added to the CFS (and 2ndary indexes/MV are built),
 *        and the task is marked complete ({@link #taskCompleted(StreamReceiveTask)}).
 *   (b) If during the streaming of a particular stream an error occurs on the receiving end of a stream
 *       (it may be either the initiator or the follower), the node will send a {@link SessionFailedMessage}
 *       to the sender and close the stream session.
 *   (c) When all transfer and receive tasks for a session are complete, the session moves to the Completion phase
 *       ({@link #maybeCompleted()}).
 *
 * 4. Completion phase
 *
 *   (a) When the initiator finishes streaming, it enters the {@link StreamSession.State#WAIT_COMPLETE} state, and waits
 *       for the follower to send a {@link CompleteMessage} once it finishes streaming too. Once the {@link CompleteMessage}
 *       is received, initiator sets its own state to {@link StreamSession.State#COMPLETE} and closes all channels attached
 *       to this session.
 *
 * 
* * In brief, the message passing looks like this (I for initiator, F for follwer): *
 * (session init)
 * I: StreamInitMessage
 * (session prepare)
 * I: PrepareSynMessage
 * F: PrepareSynAckMessage
 * I: PrepareAckMessage
 * (stream - this can happen in both directions)
 * I: OutgoingStreamMessage
 * F: ReceivedMessage
 * (completion)
 * F: CompleteMessage
 *
* * All messages which derive from {@link StreamMessage} are sent by the standard internode messaging * (via {@link org.apache.cassandra.net.MessagingService}, while the actual files themselves are sent by a special * "streaming" connection type. See {@link NettyStreamingMessageSender} for details. Because of the asynchronous */ public class StreamSession implements IEndpointStateChangeSubscriber { private static final Logger logger = LoggerFactory.getLogger(StreamSession.class); // for test purpose to record received message and state transition public volatile static MessageStateSink sink = MessageStateSink.NONE; private final StreamOperation streamOperation; /** * Streaming endpoint. * * Each {@code StreamSession} is identified by this InetAddressAndPort which is broadcast address of the node streaming. */ public final InetAddressAndPort peer; private final OutboundConnectionSettings template; private final int index; // should not be null when session is started private StreamResultFuture streamResult; // stream requests to send to the peer protected final Set requests = Sets.newConcurrentHashSet(); // streaming tasks are created and managed per ColumnFamily ID @VisibleForTesting protected final ConcurrentHashMap transfers = new ConcurrentHashMap<>(); // data receivers, filled after receiving prepare message private final Map receivers = new ConcurrentHashMap<>(); private final StreamingMetrics metrics; final Map>> transferredRangesPerKeyspace = new HashMap<>(); private final boolean isFollower; private final StreamingMessageSender messageSender; // contains both inbound and outbound channels private final ConcurrentMap channels = new ConcurrentHashMap<>(); // "maybeCompleted()" should be executed at most once. Because it can be executed asynchronously by IO // threads(serialization/deserialization) and stream messaging processing thread, causing connection closed before // receiving peer's CompleteMessage. private boolean maybeCompleted = false; private Future closeFuture; private final UUID pendingRepair; private final PreviewKind previewKind; /** * State Transition: * *
 *  +------------------+-----> FAILED | ABORTED <---------------+
 *  |                  |              ^                         |
 *  |                  |              |       initiator         |
 *  INITIALIZED --> PREPARING --> STREAMING ------------> WAIT_COMPLETE ----> COMPLETED
 *  |                  |              |                         ^                 ^
 *  |                  |              |       follower          |                 |
 *  |                  |              +-------------------------)-----------------+
 *  |                  |                                        |                 |
 *  |                  |         if preview                     |                 |
 *  |                  +----------------------------------------+                 |
 *  |               nothing to request or to transfer                             |
 *  +-----------------------------------------------------------------------------+
 *                  nothing to request or to transfer
 *
 *  
*/ public enum State { INITIALIZED(false), PREPARING(false), STREAMING(false), WAIT_COMPLETE(false), COMPLETE(true), FAILED(true), ABORTED(true); private final boolean finalState; State(boolean finalState) { this.finalState = finalState; } /** * @return true if current state is final, either COMPLETE, FAILED, or ABORTED. */ public boolean isFinalState() { return finalState; } } private volatile State state = State.INITIALIZED; /** * Create new streaming session with the peer. */ public StreamSession(StreamOperation streamOperation, InetAddressAndPort peer, StreamConnectionFactory factory, boolean isFollower, int index, UUID pendingRepair, PreviewKind previewKind) { this.streamOperation = streamOperation; this.peer = peer; this.template = new OutboundConnectionSettings(peer); this.isFollower = isFollower; this.index = index; this.messageSender = new NettyStreamingMessageSender(this, template, factory, current_version, previewKind.isPreview()); this.metrics = StreamingMetrics.get(peer); this.pendingRepair = pendingRepair; this.previewKind = previewKind; logger.debug("Creating stream session to {} as {}", template, isFollower ? "follower" : "initiator"); } public boolean isFollower() { return isFollower; } public UUID planId() { return streamResult == null ? null : streamResult.planId; } public int sessionIndex() { return index; } public StreamOperation streamOperation() { return streamResult == null ? null : streamResult.streamOperation; } public StreamOperation getStreamOperation() { return streamOperation; } public UUID getPendingRepair() { return pendingRepair; } public boolean isPreview() { return previewKind.isPreview(); } public PreviewKind getPreviewKind() { return previewKind; } public StreamReceiver getAggregator(TableId tableId) { assert receivers.containsKey(tableId) : "Missing tableId " + tableId; return receivers.get(tableId).getReceiver(); } /** * Bind this session to report to specific {@link StreamResultFuture} and * perform pre-streaming initialization. * * @param streamResult result to report to */ public void init(StreamResultFuture streamResult) { this.streamResult = streamResult; StreamHook.instance.reportStreamFuture(this, streamResult); } /** * Attach a channel to this session upon receiving the first inbound message. * * @param channel The channel to attach. * @param isControlChannel If the channel is the one to send control messages to. * @return False if the channel was already attached, true otherwise. */ public synchronized boolean attachInbound(Channel channel, boolean isControlChannel) { failIfFinished(); if (messageSender instanceof NettyStreamingMessageSender) { NettyStreamingMessageSender nettyStreamingMessageSender = (NettyStreamingMessageSender) messageSender; if (!nettyStreamingMessageSender.hasControlChannel() && isControlChannel) nettyStreamingMessageSender.injectControlMessageChannel(channel); channel.closeFuture().addListener(ignored -> onChannelClose(channel)); return channels.putIfAbsent(channel.id(), channel) == null; } else { throw new IllegalStateException("Tried to attach non-Netty message sender"); } } /** * Attach a channel to this session upon sending the first outbound message. * * @param channel The channel to attach. * @return False if the channel was already attached, true otherwise. */ public synchronized boolean attachOutbound(Channel channel) { failIfFinished(); channel.closeFuture().addListener(ignored -> onChannelClose(channel)); return channels.putIfAbsent(channel.id(), channel) == null; } /** * On channel closing, if no channels are left just close the message sender; this must be closed last to ensure * keep alive messages are sent until the very end of the streaming session. */ private void onChannelClose(Channel channel) { if (channels.remove(channel.id()) != null && channels.isEmpty()) messageSender.close(); } /** * invoked by the node that begins the stream session (it may be sending files, receiving files, or both) */ public void start() { if (requests.isEmpty() && transfers.isEmpty()) { logger.info("[Stream #{}] Session does not have any tasks.", planId()); closeSession(State.COMPLETE); return; } try { logger.info("[Stream #{}] Starting streaming to {}{}", planId(), peer, template.connectTo == null ? "" : " through " + template.connectTo); getMessageSender().initialize(); onInitializationComplete(); } catch (Exception e) { JVMStabilityInspector.inspectThrowable(e); onError(e); } } /** * Request data fetch task to this session. * * Here, we have to encode both _local_ range transientness (encoded in Replica itself, in RangesAtEndpoint) * and _remote_ (source) range transientmess, which is encoded by splitting ranges into full and transient. * * @param keyspace Requesting keyspace * @param fullRanges Ranges to retrieve data that will return full data from the source * @param transientRanges Ranges to retrieve data that will return transient data from the source * @param columnFamilies ColumnFamily names. Can be empty if requesting all CF under the keyspace. */ public void addStreamRequest(String keyspace, RangesAtEndpoint fullRanges, RangesAtEndpoint transientRanges, Collection columnFamilies) { //It should either be a dummy address for repair or if it's a bootstrap/move/rebuild it should be this node assert all(fullRanges, Replica::isSelf) || RangesAtEndpoint.isDummyList(fullRanges) : fullRanges.toString(); assert all(transientRanges, Replica::isSelf) || RangesAtEndpoint.isDummyList(transientRanges) : transientRanges.toString(); requests.add(new StreamRequest(keyspace, fullRanges, transientRanges, columnFamilies)); } /** * Set up transfer for specific keyspace/ranges/CFs * * @param keyspace Transfer keyspace * @param replicas Transfer ranges * @param columnFamilies Transfer ColumnFamilies * @param flushTables flush tables? */ synchronized void addTransferRanges(String keyspace, RangesAtEndpoint replicas, Collection columnFamilies, boolean flushTables) { failIfFinished(); Collection stores = getColumnFamilyStores(keyspace, columnFamilies); if (flushTables) flushSSTables(stores); //Was it safe to remove this normalize, sorting seems not to matter, merging? Maybe we should have? //Do we need to unwrap here also or is that just making it worse? //Range and if it's transient RangesAtEndpoint unwrappedRanges = replicas.unwrap(); List streams = getOutgoingStreamsForRanges(unwrappedRanges, stores, pendingRepair, previewKind); addTransferStreams(streams); Set> toBeUpdated = transferredRangesPerKeyspace.get(keyspace); if (toBeUpdated == null) { toBeUpdated = new HashSet<>(); } toBeUpdated.addAll(replicas.ranges()); transferredRangesPerKeyspace.put(keyspace, toBeUpdated); } private void failIfFinished() { if (state().isFinalState()) throw new RuntimeException(String.format("Stream %s is finished with state %s", planId(), state().name())); } private Collection getColumnFamilyStores(String keyspace, Collection columnFamilies) { Collection stores = new HashSet<>(); // if columnfamilies are not specified, we add all cf under the keyspace if (columnFamilies.isEmpty()) { stores.addAll(Keyspace.open(keyspace).getColumnFamilyStores()); } else { for (String cf : columnFamilies) stores.add(Keyspace.open(keyspace).getColumnFamilyStore(cf)); } return stores; } @VisibleForTesting public List getOutgoingStreamsForRanges(RangesAtEndpoint replicas, Collection stores, UUID pendingRepair, PreviewKind previewKind) { List streams = new ArrayList<>(); try { for (ColumnFamilyStore cfs: stores) { streams.addAll(cfs.getStreamManager().createOutgoingStreams(this, replicas, pendingRepair, previewKind)); } } catch (Throwable t) { streams.forEach(OutgoingStream::finish); throw t; } return streams; } synchronized void addTransferStreams(Collection streams) { failIfFinished(); for (OutgoingStream stream: streams) { TableId tableId = stream.getTableId(); StreamTransferTask task = transfers.get(tableId); if (task == null) { //guarantee atomicity StreamTransferTask newTask = new StreamTransferTask(this, tableId); task = transfers.putIfAbsent(tableId, newTask); if (task == null) task = newTask; } task.addTransferStream(stream); } } private synchronized Future closeSession(State finalState) { // it's session is already closed if (closeFuture != null) return closeFuture; state(finalState); List> futures = new ArrayList<>(); // ensure aborting the tasks do not happen on the network IO thread (read: netty event loop) // as we don't want any blocking disk IO to stop the network thread if (finalState == State.FAILED || finalState == State.ABORTED) futures.add(ScheduledExecutors.nonPeriodicTasks.submit(this::abortTasks)); // Channels should only be closed by the initiator; but, if this session closed // due to failure, channels should be always closed regardless, even if this is not the initator. if (!isFollower || state != State.COMPLETE) { logger.debug("[Stream #{}] Will close attached channels {}", planId(), channels); channels.values().forEach(channel -> futures.add(channel.close())); } sink.onClose(peer); streamResult.handleSessionComplete(this); closeFuture = FBUtilities.allOf(futures); return closeFuture; } private void abortTasks() { try { receivers.values().forEach(StreamReceiveTask::abort); transfers.values().forEach(StreamTransferTask::abort); } catch (Exception e) { logger.warn("[Stream #{}] failed to abort some streaming tasks", planId(), e); } } /** * Set current state to {@code newState}. * * @param newState new state to set */ public void state(State newState) { if (logger.isTraceEnabled()) logger.trace("[Stream #{}] Changing session state from {} to {}", planId(), state, newState); sink.recordState(peer, newState); state = newState; } /** * @return current state */ public State state() { return state; } public StreamingMessageSender getMessageSender() { return messageSender; } /** * Return if this session completed successfully. * * @return true if session completed successfully. */ public boolean isSuccess() { return state == State.COMPLETE; } /** * Return if this session was failed or aborted * * @return true if session was failed or aborted */ public boolean isFailedOrAborted() { return state == State.FAILED || state == State.ABORTED; } public synchronized void messageReceived(StreamMessage message) { if (message.type != StreamMessage.Type.KEEP_ALIVE) failIfFinished(); sink.recordMessage(peer, message.type); switch (message.type) { case STREAM_INIT: // at follower, nop break; case PREPARE_SYN: // at follower PrepareSynMessage msg = (PrepareSynMessage) message; prepare(msg.requests, msg.summaries); break; case PREPARE_SYNACK: // at initiator prepareSynAck((PrepareSynAckMessage) message); break; case PREPARE_ACK: // at follower prepareAck((PrepareAckMessage) message); break; case STREAM: receive((IncomingStreamMessage) message); break; case RECEIVED: ReceivedMessage received = (ReceivedMessage) message; received(received.tableId, received.sequenceNumber); break; case COMPLETE: // at initiator complete(); break; case KEEP_ALIVE: // NOP - we only send/receive the KEEP_ALIVE to force the TCP connection to remain open break; case SESSION_FAILED: sessionFailed(); break; default: throw new AssertionError("unhandled StreamMessage type: " + message.getClass().getName()); } } /** * Call back when connection initialization is complete to start the prepare phase. */ public void onInitializationComplete() { // send prepare message state(State.PREPARING); PrepareSynMessage prepare = new PrepareSynMessage(); prepare.requests.addAll(requests); for (StreamTransferTask task : transfers.values()) prepare.summaries.add(task.getSummary()); getMessageSender().sendMessage(prepare); } /** * Signal an error to this stream session: if it's an EOF exception, it tries to understand if the socket was closed * after completion or because the peer was down, otherwise sends a {@link SessionFailedMessage} and closes * the session as {@link State#FAILED}. */ public synchronized Future onError(Throwable e) { boolean isEofException = e instanceof EOFException; if (isEofException) { if (state.finalState) { logger.debug("[Stream #{}] Socket closed after session completed with state {}", planId(), state); return null; } else { logger.error("[Stream #{}] Socket closed before session completion, peer {} is probably down.", planId(), peer.getHostAddressAndPort(), e); return closeSession(State.FAILED); } } else if (e instanceof TransactionAlreadyCompletedException && isFailedOrAborted()) { // StreamDeserializer threads may actively be writing SSTables when the stream // is failed or canceled, which aborts the lifecycle transaction and throws an exception // when any new SSTable is added. Since the stream has already failed, suppress // extra streaming log failure messages. logger.debug("Stream lifecycle transaction already completed after stream failure (ignore)", e); return null; } logError(e); if (getMessageSender().connected()) { state(State.FAILED); // make sure subsequent error handling sees the session in a final state getMessageSender().sendMessage(new SessionFailedMessage()); } return closeSession(State.FAILED); } private void logError(Throwable e) { if (e instanceof SocketTimeoutException) { logger.error("[Stream #{}] Did not receive response from peer {}{} for {} secs. Is peer down? " + "If not, maybe try increasing streaming_keep_alive_period_in_secs.", planId(), peer.getHostAddressAndPort(), template.connectTo == null ? "" : " through " + template.connectTo.getHostAddressAndPort(), 2 * DatabaseDescriptor.getStreamingKeepAlivePeriod(), e); } else { logger.error("[Stream #{}] Streaming error occurred on session with peer {}{}", planId(), peer.getHostAddressAndPort(), template.connectTo == null ? "" : " through " + template.connectTo.getHostAddressAndPort(), e); } } /** * Prepare this session for sending/receiving files. */ public void prepare(Collection requests, Collection summaries) { // prepare tasks state(State.PREPARING); ScheduledExecutors.nonPeriodicTasks.execute(() -> { try { prepareAsync(requests, summaries); } catch (Exception e) { onError(e); } }); } /** * Finish preparing the session. This method is blocking (memtables are flushed in {@link #addTransferRanges}), * so the logic should not execute on the main IO thread (read: netty event loop). */ @VisibleForTesting public void prepareAsync(Collection requests, Collection summaries) { processStreamRequests(requests); for (StreamSummary summary : summaries) prepareReceiving(summary); PrepareSynAckMessage prepareSynAck = new PrepareSynAckMessage(); if (!peer.equals(FBUtilities.getBroadcastAddressAndPort())) for (StreamTransferTask task : transfers.values()) prepareSynAck.summaries.add(task.getSummary()); getMessageSender().sendMessage(prepareSynAck); streamResult.handleSessionPrepared(this); if (isPreview()) completePreview(); else maybeCompleted(); } private void prepareSynAck(PrepareSynAckMessage msg) { if (!msg.summaries.isEmpty()) { for (StreamSummary summary : msg.summaries) prepareReceiving(summary); // only send the (final) ACK if we are expecting the peer to send this node (the initiator) some files if (!isPreview()) getMessageSender().sendMessage(new PrepareAckMessage()); } if (isPreview()) completePreview(); else startStreamingFiles(true); } private void prepareAck(PrepareAckMessage msg) { if (isPreview()) throw new RuntimeException(String.format("[Stream #%s] Cannot receive PrepareAckMessage for preview session", planId())); startStreamingFiles(true); } private void processStreamRequests(Collection requests) { List rejectedRequests = new ArrayList<>(); // group requests by keyspace Multimap requestsByKeyspace = ArrayListMultimap.create(); requests.forEach(r -> requestsByKeyspace.put(r.keyspace, r)); requestsByKeyspace.asMap().forEach((ks, reqs) -> { OwnedRanges ownedRanges = StorageService.instance.getNormalizedLocalRanges(ks); reqs.forEach(req -> { RangesAtEndpoint allRangesAtEndpoint = RangesAtEndpoint.concat(req.full, req.transientReplicas); if (ownedRanges.validateRangeRequest(allRangesAtEndpoint.ranges(), "Stream #" + planId(), "stream request", peer)) addTransferRanges(req.keyspace, allRangesAtEndpoint, req.columnFamilies, true); // always flush on stream request else rejectedRequests.add(req); }); }); if (!rejectedRequests.isEmpty()) throw new StreamRequestOutOfTokenRangeException(rejectedRequests); } /** * Call back after sending StreamMessageHeader. * * @param message sent stream message */ public void streamSent(OutgoingStreamMessage message) { long headerSize = message.stream.getEstimatedSize(); StreamingMetrics.totalOutgoingBytes.inc(headerSize); metrics.outgoingBytes.inc(headerSize); if(StreamOperation.REPAIR == getStreamOperation()) { StreamingMetrics.totalOutgoingRepairBytes.inc(headerSize); StreamingMetrics.totalOutgoingRepairSSTables.inc(message.stream.getNumFiles()); } // schedule timeout for receiving ACK StreamTransferTask task = transfers.get(message.header.tableId); if (task != null) { task.scheduleTimeout(message.header.sequenceNumber, 12, TimeUnit.HOURS); } } /** * Call back after receiving a stream. * * @param message received stream */ public void receive(IncomingStreamMessage message) { if (isPreview()) { throw new RuntimeException(String.format("[Stream #%s] Cannot receive files for preview session", planId())); } long headerSize = message.stream.getSize(); StreamingMetrics.totalIncomingBytes.inc(headerSize); metrics.incomingBytes.inc(headerSize); // send back file received message getMessageSender().sendMessage(new ReceivedMessage(message.header.tableId, message.header.sequenceNumber)); StreamHook.instance.reportIncomingStream(message.header.tableId, message.stream, this, message.header.sequenceNumber); long receivedStartNanos = System.nanoTime(); try { receivers.get(message.header.tableId).received(message.stream); } finally { long latencyNanos = System.nanoTime() - receivedStartNanos; metrics.incomingProcessTime.update(latencyNanos, TimeUnit.NANOSECONDS); long latencyMs = TimeUnit.NANOSECONDS.toMillis(latencyNanos); int timeout = DatabaseDescriptor.getInternodeStreamingTcpUserTimeoutInMS(); if (timeout > 0 && latencyMs > timeout) NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, 1, TimeUnit.MINUTES, "The time taken ({} ms) for processing the incoming stream message ({})" + " exceeded internode streaming TCP user timeout ({} ms).\n" + "The streaming connection might be closed due to tcp user timeout.\n" + "Try to increase the internode_streaming_tcp_user_timeout_in_ms" + " or set it to 0 to use system defaults.", latencyMs, message, timeout); } } public void progress(String filename, ProgressInfo.Direction direction, long bytes, long total) { ProgressInfo progress = new ProgressInfo(peer, index, filename, direction, bytes, total); streamResult.handleProgress(progress); } public void received(TableId tableId, int sequenceNumber) { transfers.get(tableId).complete(sequenceNumber); } /** * Check if session is completed on receiving {@code StreamMessage.Type.COMPLETE} message. */ public synchronized void complete() { logger.debug("[Stream #{}] handling Complete message, state = {}", planId(), state); if (!isFollower) { if (state == State.WAIT_COMPLETE) closeSession(State.COMPLETE); else state(State.WAIT_COMPLETE); } else { // pre-4.0 nodes should not be connected via streaming, see {@link MessagingService#accept_streaming} throw new IllegalStateException(String.format("[Stream #%s] Complete message can be only received by the initiator!", planId())); } } /** * Synchronize both {@link #complete()} and {@link #maybeCompleted()} to avoid racing */ private synchronized boolean maybeCompleted() { if (!(receivers.isEmpty() && transfers.isEmpty())) return false; // if already executed once, skip it if (maybeCompleted) return true; maybeCompleted = true; if (!isFollower) { if (state == State.WAIT_COMPLETE) closeSession(State.COMPLETE); else state(State.WAIT_COMPLETE); } else { getMessageSender().sendMessage(new CompleteMessage()); closeSession(State.COMPLETE); } return true; } /** * Call back on receiving {@code StreamMessage.Type.SESSION_FAILED} message. */ public synchronized void sessionFailed() { logger.error("[Stream #{}] Remote peer {} failed stream session.", planId(), peer.toString()); closeSession(State.FAILED); } /** * @return Current snapshot of this session info. */ public SessionInfo getSessionInfo() { List receivingSummaries = Lists.newArrayList(); for (StreamTask receiver : receivers.values()) receivingSummaries.add(receiver.getSummary()); List transferSummaries = Lists.newArrayList(); for (StreamTask transfer : transfers.values()) transferSummaries.add(transfer.getSummary()); // TODO: the connectTo treatment here is peculiar, and needs thinking about - since the connection factory can change it return new SessionInfo(peer, index, template.connectTo == null ? peer : template.connectTo, receivingSummaries, transferSummaries, state); } public synchronized void taskCompleted(StreamReceiveTask completedTask) { receivers.remove(completedTask.tableId); maybeCompleted(); } public synchronized void taskCompleted(StreamTransferTask completedTask) { transfers.remove(completedTask.tableId); maybeCompleted(); } public void onRemove(InetAddressAndPort endpoint) { logger.error("[Stream #{}] Session failed because remote peer {} has left.", planId(), peer.toString()); closeSession(State.FAILED); } public void onRestart(InetAddressAndPort endpoint, EndpointState epState) { logger.error("[Stream #{}] Session failed because remote peer {} was restarted.", planId(), peer.toString()); closeSession(State.FAILED); } private void completePreview() { try { state(State.WAIT_COMPLETE); closeSession(State.COMPLETE); } finally { // aborting the tasks here needs to be the last thing we do so that we accurately report // expected streaming, but don't leak any resources held by the task for (StreamTask task : Iterables.concat(receivers.values(), transfers.values())) task.abort(); } } /** * Flushes matching column families from the given keyspace, or all columnFamilies * if the cf list is empty. */ private void flushSSTables(Iterable stores) { List> flushes = new ArrayList<>(); for (ColumnFamilyStore cfs : stores) flushes.add(cfs.forceFlush()); FBUtilities.waitOnFutures(flushes); } @VisibleForTesting public synchronized void prepareReceiving(StreamSummary summary) { failIfFinished(); if (summary.files > 0) receivers.put(summary.tableId, new StreamReceiveTask(this, summary.tableId, summary.files, summary.totalSize)); } private void startStreamingFiles(boolean notifyPrepared) { if (notifyPrepared) streamResult.handleSessionPrepared(this); state(State.STREAMING); for (StreamTransferTask task : transfers.values()) { Collection messages = task.getFileMessages(); if (!messages.isEmpty()) { for (OutgoingStreamMessage ofm : messages) { // pass the session planId/index to the OFM (which is only set at init(), after the transfers have already been created) ofm.header.addSessionInfo(this); getMessageSender().sendMessage(ofm); } } else { taskCompleted(task); // there are no files to send } } maybeCompleted(); } @VisibleForTesting public int getNumRequests() { return requests.size(); } @VisibleForTesting public int getNumTransfers() { return transferredRangesPerKeyspace.size(); } @VisibleForTesting public static interface MessageStateSink { static final MessageStateSink NONE = new MessageStateSink() { @Override public void recordState(InetAddressAndPort from, State state) { } @Override public void recordMessage(InetAddressAndPort from, StreamMessage.Type message) { } @Override public void onClose(InetAddressAndPort from) { } }; /** * @param from peer that is connected in the stream session * @param state new state to change to */ public void recordState(InetAddressAndPort from, StreamSession.State state); /** * @param from peer that sends the given message * @param message stream message sent by peer */ public void recordMessage(InetAddressAndPort from, StreamMessage.Type message); /** * * @param from peer that is being disconnected */ public void onClose(InetAddressAndPort from); } public synchronized void abort() { if (state.isFinalState()) { logger.debug("[Stream #{}] Stream session with peer {} is already in a final state on abort.", planId(), peer); return; } logger.info("[Stream #{}] Aborting stream session with peer {}...", planId(), peer); if (getMessageSender().connected()) getMessageSender().sendMessage(new SessionFailedMessage()); try { closeSession(State.ABORTED); } catch (Exception e) { logger.error("[Stream #{}] Error aborting stream session with peer {}", planId(), peer); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy