
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package kafka.controller
import kafka.utils.Logging
import kafka.zk.KafkaZkClient
import org.apache.kafka.common.TopicPartition
import scala.collection.{Set, mutable}
/**
* This manages the state machine for topic deletion.
* 1. TopicCommand issues topic deletion by creating a new admin path /admin/delete_topics/<topic>
* 2. The controller listens for child changes on /admin/delete_topics and starts topic deletion for the respective topics
* 3. The controller's ControllerEventThread handles topic deletion. A topic will be ineligible
* for deletion in the following scenarios -
* 3.1 a broker hosting one of the replicas for that topic goes down
* 3.2 partition reassignment for partitions of that topic is in progress
* 4. Topic deletion is resumed when -
* 4.1 a broker hosting one of the replicas for that topic is started
* 4.2 partition reassignment for partitions of that topic completes
* 5. Every replica for a topic being deleted is in one of the 3 states -
* 5.1 TopicDeletionStarted. A replica enters the TopicDeletionStarted phase when onPartitionDeletion is invoked.
* This happens when the child change watch for /admin/delete_topics fires on the controller. As part of this state
* change, the controller sends StopReplicaRequests to all replicas. It registers a callback for the
* StopReplicaResponse when deletePartition=true, thereby invoking a callback when a response for delete replica
* is received from every replica.
* 5.2 TopicDeletionSuccessful. A replica moves from
* TopicDeletionStarted->TopicDeletionSuccessful depending on the error codes in StopReplicaResponse
* 5.3 TopicDeletionFailed. A replica moves from
* TopicDeletionStarted->TopicDeletionFailed depending on the error codes in StopReplicaResponse.
* In general, if a broker dies and if it hosted replicas for topics being deleted, the controller marks the
* respective replicas in TopicDeletionFailed state in the onBrokerFailure callback. The reason is that if a
* broker fails after the replica enters the TopicDeletionStarted state but before the StopReplicaRequest is sent,
* it is possible that the replica will mistakenly remain in TopicDeletionStarted state and topic deletion
* will not be retried when the broker comes back up.
* 6. A topic is marked successfully deleted only if all replicas are in TopicDeletionSuccessful
* state. Topic deletion teardown mode deletes all topic state from the controllerContext
* as well as from zookeeper. This is the only time the /brokers/topics/<topic> path gets deleted. On the other hand,
* if no replica is in TopicDeletionStarted state and at least one replica is in TopicDeletionFailed state, then
* it marks the topic for deletion retry.
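*
* For example, deleting a healthy single-partition topic "t" whose replicas live on brokers 1 and 2
* proceeds roughly as follows (a sketch of the happy path described above, not code from this file):
* {{{
* create /admin/delete_topics/t                   // e.g. via TopicCommand
* onTopicDeletion(Set("t"))                       // the controller's child change watch fired
* StopReplicaRequest(deletePartition = true)      // sent to brokers 1 and 2
* StopReplicaResponse with no error from both     // replicas -> TopicDeletionSuccessful
* completeDeleteTopic("t")                        // topic state removed from zookeeper and the cache
* }}}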
* @param controller the KafkaController to which this deletion manager belongs
*/
class TopicDeletionManager(controller: KafkaController,
eventManager: ControllerEventManager,
zkClient: KafkaZkClient) extends Logging {
this.logIdent = s"[Topic Deletion Manager ${controller.config.brokerId}], "
val controllerContext = controller.controllerContext
val isDeleteTopicEnabled = controller.config.deleteTopicEnable
val topicsToBeDeleted = mutable.Set.empty[String]
/** The following topicsWithDeletionStarted variable is used to properly update the offlinePartitionCount metric.
* When a topic is going through deletion, we don't want to keep track of its partition state
* changes in the offlinePartitionCount metric, see the PartitionStateMachine#updateControllerMetrics
* for detailed logic. This goal means if some partitions of a topic are already
* in OfflinePartition state when deletion starts, we need to change the corresponding partition
* states to NonExistentPartition first before starting the deletion.
*
* However we can NOT change partition states to NonExistentPartition at the time of enqueuing topics
* for deletion. The reason is that when a topic is enqueued for deletion, it may be ineligible for
* deletion due to ongoing partition reassignments. Hence there might be a delay between enqueuing
* a topic for deletion and the actual start of deletion. In this delayed interval, partitions may still
* transition to or out of the OfflinePartition state.
*
* Hence we decide to change partition states to NonExistentPartition only when the actual deletion has started.
* For topics whose deletion has actually started, we keep track of them in the following topicsWithDeletionStarted
* variable. And once a topic is in the topicsWithDeletionStarted set, we are sure there will no longer
* be partition reassignments to any of its partitions, and only then it's safe to move its partitions to
* NonExistentPartition state. Once a topic is in the topicsWithDeletionStarted set, we will stop monitoring
* its partition state changes in the offlinePartitionCount metric.
*/
val topicsWithDeletionStarted = mutable.Set.empty[String]
val topicsIneligibleForDeletion = mutable.Set.empty[String]
def init(initialTopicsToBeDeleted: Set[String], initialTopicsIneligibleForDeletion: Set[String]): Unit = {
if (isDeleteTopicEnabled) {
topicsToBeDeleted ++= initialTopicsToBeDeleted
topicsIneligibleForDeletion ++= initialTopicsIneligibleForDeletion & topicsToBeDeleted
} else {
// if delete topic is disabled, clean up the topic entries under /admin/delete_topics
info(s"Removing $initialTopicsToBeDeleted since delete topic is disabled")
zkClient.deleteTopicDeletions(initialTopicsToBeDeleted.toSeq, controllerContext.epochZkVersion)
}
}
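// A minimal sketch of how a newly elected controller is expected to seed this manager during
// failover, assuming KafkaZkClient.getTopicDeletions (which lists the children of
// /admin/delete_topics); the ineligible set would come from in-flight reassignments and dead
// replicas. Illustrative only, not code from this file:
//
//   val pendingDeletes = zkClient.getTopicDeletions.toSet
//   topicDeletionManager.init(pendingDeletes, initialTopicsIneligibleForDeletion = Set.empty)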
def tryTopicDeletion(): Unit = {
if (isDeleteTopicEnabled) {
resumeDeletions()
}
}
/**
* Invoked when the current controller resigns. At this time, all state for topic deletion should be cleared.
*/
def reset() {
if (isDeleteTopicEnabled) {
topicsToBeDeleted.clear()
topicsWithDeletionStarted.clear()
topicsIneligibleForDeletion.clear()
}
}
/**
* Invoked by the child change listener on /admin/delete_topics to queue up the topics for deletion. The topic gets added
* to the topicsToBeDeleted list and only gets removed from the list when the topic deletion has completed successfully
* i.e. all replicas of all partitions of that topic are deleted successfully.
* @param topics Topics that should be deleted
*/
def enqueueTopicsForDeletion(topics: Set[String]) {
if (isDeleteTopicEnabled) {
topicsToBeDeleted ++= topics
resumeDeletions()
}
}
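// A sketch of the admin-side trigger for this path, assuming kafka.zk.AdminZkClient, whose
// deleteTopic creates the /admin/delete_topics/<topic> marker znode; the controller's child change
// watch then fires and eventually invokes enqueueTopicsForDeletion. Illustrative only:
//
//   val adminZkClient = new kafka.zk.AdminZkClient(zkClient)
//   adminZkClient.deleteTopic("test-topic")   // creates /admin/delete_topics/test-topic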
/**
* Invoked when any event that can possibly resume topic deletion occurs. These events include -
* 1. New broker starts up. Any replicas belonging to topics queued up for deletion can be deleted since the broker is up
* 2. Partition reassignment completes. Any partitions belonging to topics queued up for deletion that finished reassignment can now be deleted
* @param topics Topics for which deletion can be resumed
*/
def resumeDeletionForTopics(topics: Set[String] = Set.empty) {
if (isDeleteTopicEnabled) {
val topicsToResumeDeletion = topics & topicsToBeDeleted
if (topicsToResumeDeletion.nonEmpty) {
topicsIneligibleForDeletion --= topicsToResumeDeletion
resumeDeletions()
}
}
}
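// For instance, the controller's broker startup path is expected to call this with the topics that
// have replicas on the newly started brokers (a hedged sketch of the caller, not code in this file):
//
//   val topicsWithRelevantReplicas = controllerContext.replicasOnBrokers(newBrokers).map(_.topic)
//   topicDeletionManager.resumeDeletionForTopics(topicsWithRelevantReplicas)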
/**
* Invoked when a broker that hosts replicas for topics to be deleted goes down. Also invoked when the callback for
* StopReplicaResponse receives an error code for the replicas of a topic to be deleted. As part of this, the replicas
* are moved from ReplicaDeletionStarted to ReplicaDeletionIneligible state. Also, the topic is added to the list of topics
* ineligible for deletion until further notice.
* @param replicas Replicas for which deletion has failed
*/
def failReplicaDeletion(replicas: Set[PartitionAndReplica]) {
if (isDeleteTopicEnabled) {
val replicasThatFailedToDelete = replicas.filter(r => isTopicQueuedUpForDeletion(r.topic))
if (replicasThatFailedToDelete.nonEmpty) {
val topics = replicasThatFailedToDelete.map(_.topic)
debug(s"Deletion failed for replicas ${replicasThatFailedToDelete.mkString(",")}. Halting deletion for topics $topics")
controller.replicaStateMachine.handleStateChanges(replicasThatFailedToDelete.toSeq, ReplicaDeletionIneligible)
markTopicIneligibleForDeletion(topics)
resumeDeletions()
}
}
}
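// A sketch of the two callers described in the scaladoc above (shapes are illustrative):
//
//   // onBrokerFailure: all replicas of to-be-deleted topics on the dead brokers have failed
//   topicDeletionManager.failReplicaDeletion(replicasOnDeadBrokers)
//   // StopReplicaResponse callback: one broker answered with an error for deletePartition=true
//   topicDeletionManager.failReplicaDeletion(Set(PartitionAndReplica(topicPartition, replicaId)))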
/**
* Halt topic deletion if -
* 1. replicas are down
* 2. partition reassignment is in progress for some partitions of the topic
* @param topics Topics that should be marked ineligible for deletion. No op if the topic was not previously queued up for deletion
*/
def markTopicIneligibleForDeletion(topics: Set[String]) {
if (isDeleteTopicEnabled) {
val newTopicsToHaltDeletion = topicsToBeDeleted & topics
topicsIneligibleForDeletion ++= newTopicsToHaltDeletion
if (newTopicsToHaltDeletion.nonEmpty)
info(s"Halted deletion of topics ${newTopicsToHaltDeletion.mkString(",")}")
}
}
private def isTopicIneligibleForDeletion(topic: String): Boolean = {
if (isDeleteTopicEnabled) {
topicsIneligibleForDeletion.contains(topic)
} else
true
}
private def isTopicDeletionInProgress(topic: String): Boolean = {
if (isDeleteTopicEnabled) {
controller.replicaStateMachine.isAtLeastOneReplicaInDeletionStartedState(topic)
} else
false
}
def isTopicWithDeletionStarted(topic: String): Boolean = {
if (isDeleteTopicEnabled) {
topicsWithDeletionStarted.contains(topic)
} else
false
}
def isTopicQueuedUpForDeletion(topic: String): Boolean = {
if (isDeleteTopicEnabled) {
topicsToBeDeleted.contains(topic)
} else
false
}
/**
* Invoked by the StopReplicaResponse callback when it receives no error code for a replica of a topic to be deleted.
* As part of this, the replicas are moved from ReplicaDeletionStarted to ReplicaDeletionSuccessful state. Tears down
* the topic if all replicas of a topic have been successfully deleted
* @param replicas Replicas that were successfully deleted by the broker
*/
def completeReplicaDeletion(replicas: Set[PartitionAndReplica]) {
val successfullyDeletedReplicas = replicas.filter(r => isTopicQueuedUpForDeletion(r.topic))
debug(s"Deletion successfully completed for replicas ${successfullyDeletedReplicas.mkString(",")}")
controller.replicaStateMachine.handleStateChanges(successfullyDeletedReplicas.toSeq, ReplicaDeletionSuccessful)
resumeDeletions()
}
/**
* Topic deletion can be retried if -
* 1. Topic deletion is not already complete
* 2. Topic deletion is currently not in progress for that topic
* 3. Topic is currently not marked ineligible for deletion
* @param topic Topic
* @return Whether or not deletion can be retried for the topic
*/
private def isTopicEligibleForDeletion(topic: String): Boolean = {
topicsToBeDeleted.contains(topic) && (!isTopicDeletionInProgress(topic) && !isTopicIneligibleForDeletion(topic))
}
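// Worked example of the predicate above: a topic stays queued (condition 1 holds) while some
// replica is still in ReplicaDeletionStarted, so condition 2 fails and resumeDeletions() will not
// re-trigger deletion; once the outstanding StopReplicaResponses arrive, either all replicas are
// successful (the topic is torn down) or some failed, the topic is marked for retry, and this
// predicate can become true again.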
/**
* If the topic is queued for deletion but deletion is not currently in progress, then deletion is retried for that topic
* To ensure a successful retry, reset states for respective replicas from ReplicaDeletionIneligible to OfflineReplica state
* @param topic Topic for which deletion should be retried
*/
private def markTopicForDeletionRetry(topic: String) {
// reset replica states from ReplicaDeletionIneligible to OfflineReplica
val failedReplicas = controller.replicaStateMachine.replicasInState(topic, ReplicaDeletionIneligible)
info(s"Retrying delete topic for topic $topic since replicas ${failedReplicas.mkString(",")} were not successfully deleted")
controller.replicaStateMachine.handleStateChanges(failedReplicas.toSeq, OfflineReplica)
}
private def completeDeleteTopic(topic: String) {
// deregister partition change listener on the deleted topic. This is to prevent the partition change listener
// firing before the new topic listener when a deleted topic gets auto created
controller.unregisterPartitionModificationsHandlers(Seq(topic))
val replicasForDeletedTopic = controller.replicaStateMachine.replicasInState(topic, ReplicaDeletionSuccessful)
// controller will remove this replica from the state machine as well as its partition assignment cache
controller.replicaStateMachine.handleStateChanges(replicasForDeletedTopic.toSeq, NonExistentReplica)
topicsToBeDeleted -= topic
topicsWithDeletionStarted -= topic
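// remove all topic state from zookeeper: the assignment (/brokers/topics/<topic>), the topic
// configs (/config/topics/<topic>) and the deletion marker (/admin/delete_topics/<topic>)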
zkClient.deleteTopicZNode(topic, controllerContext.epochZkVersion)
zkClient.deleteTopicConfigs(Seq(topic), controllerContext.epochZkVersion)
zkClient.deleteTopicDeletions(Seq(topic), controllerContext.epochZkVersion)
controllerContext.removeTopic(topic)
}
/**
* Invoked with the list of topics to be deleted
* It invokes onPartitionDeletion for all partitions of a topic.
* The updateMetadataRequest is also going to set the leader for the topics being deleted to
* {@link LeaderAndIsr#LeaderDuringDelete}. This lets each broker know that this topic is being deleted and can be
* removed from their caches.
*/
private def onTopicDeletion(topics: Set[String]) {
info(s"Topic deletion callback for ${topics.mkString(",")}")
// send update metadata so that brokers stop serving data for topics to be deleted
val partitions = topics.flatMap(controllerContext.partitionsForTopic)
val unseenTopicsForDeletion = topics -- topicsWithDeletionStarted
if (unseenTopicsForDeletion.nonEmpty) {
val unseenPartitionsForDeletion = unseenTopicsForDeletion.flatMap(controllerContext.partitionsForTopic)
controller.partitionStateMachine.handleStateChanges(unseenPartitionsForDeletion.toSeq, OfflinePartition)
controller.partitionStateMachine.handleStateChanges(unseenPartitionsForDeletion.toSeq, NonExistentPartition)
// adding of unseenTopicsForDeletion to topicsWithDeletionStarted must be done after the partition state changes
// to make sure the offlinePartitionCount metric is properly updated
topicsWithDeletionStarted ++= unseenTopicsForDeletion
}
controller.sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq, partitions)
topics.foreach { topic =>
onPartitionDeletion(controllerContext.partitionsForTopic(topic))
}
}
/**
* Invoked by onPartitionDeletion. It is the 2nd step of topic deletion, the first being sending
* UpdateMetadata requests to all brokers to start rejecting requests for deleted topics. As part of starting deletion,
* the topics are added to the in progress list. As long as a topic is in the in progress list, deletion for that topic
* is never retried. A topic is removed from the in progress list when
* 1. Either the topic is successfully deleted OR
* 2. No replica for the topic is in ReplicaDeletionStarted state and at least one replica is in ReplicaDeletionIneligible state
* If the topic is queued for deletion but deletion is not currently in progress, then deletion is retried for that topic
* As part of starting deletion, all replicas are moved to the ReplicaDeletionStarted state where the controller sends
* the replicas a StopReplicaRequest (delete=true)
* This method does the following things -
* 1. Move all dead replicas directly to ReplicaDeletionIneligible state. Also mark the respective topics ineligible
* for deletion if some replicas are dead since it won't complete successfully anyway
* 2. Move all alive replicas to ReplicaDeletionStarted state so they can be deleted successfully
* @param replicasForTopicsToBeDeleted Replicas of the topics that are queued up for deletion
*/
private def startReplicaDeletion(replicasForTopicsToBeDeleted: Set[PartitionAndReplica]) {
replicasForTopicsToBeDeleted.groupBy(_.topic).keys.foreach { topic =>
val aliveReplicasForTopic = controllerContext.allLiveReplicas().filter(p => p.topic == topic)
val deadReplicasForTopic = replicasForTopicsToBeDeleted.filter(_.topic == topic) -- aliveReplicasForTopic
val successfullyDeletedReplicas = controller.replicaStateMachine.replicasInState(topic, ReplicaDeletionSuccessful)
val replicasForDeletionRetry = aliveReplicasForTopic -- successfullyDeletedReplicas
// move dead replicas directly to failed state
controller.replicaStateMachine.handleStateChanges(deadReplicasForTopic.toSeq, ReplicaDeletionIneligible, new Callbacks())
// send stop replica to all followers that are not in the OfflineReplica state so they stop sending fetch requests to the leader
controller.replicaStateMachine.handleStateChanges(replicasForDeletionRetry.toSeq, OfflineReplica, new Callbacks())
debug(s"Deletion started for replicas ${replicasForDeletionRetry.mkString(",")}")
controller.replicaStateMachine.handleStateChanges(replicasForDeletionRetry.toSeq, ReplicaDeletionStarted,
new Callbacks(stopReplicaResponseCallback = (stopReplicaResponseObj, replicaId) =>
eventManager.put(controller.TopicDeletionStopReplicaResponseReceived(stopReplicaResponseObj, replicaId))))
if (deadReplicasForTopic.nonEmpty) {
debug(s"Dead Replicas (${deadReplicasForTopic.mkString(",")}) found for topic $topic")
markTopicIneligibleForDeletion(Set(topic))
}
}
}
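// The TopicDeletionStopReplicaResponseReceived event queued above is expected to dispatch on the
// response's error code, roughly as follows (a hedged sketch of the handler, not code in this file):
//
//   if (error == Errors.NONE) completeReplicaDeletion(Set(replica))   // -> ReplicaDeletionSuccessful
//   else failReplicaDeletion(Set(replica))                            // -> ReplicaDeletionIneligible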
/**
* Invoked by onTopicDeletion with the list of partitions for topics to be deleted
* It does the following -
* 1. Send UpdateMetadataRequest to all live or shutting-down brokers for partitions that are being
* deleted. The brokers start rejecting all client requests with UnknownTopicOrPartitionException
* 2. Move all replicas for the partitions to OfflineReplica state. This will send StopReplicaRequest to the replicas
* and LeaderAndIsrRequest to the leader with the shrunk ISR. When the leader replica itself is moved to OfflineReplica state,
* it will skip sending the LeaderAndIsrRequest since the leader will be updated to -1
* 3. Move all replicas to ReplicaDeletionStarted state. This will send StopReplicaRequest with deletePartition=true. And
* will delete all persistent data from all replicas of the respective partitions
*/
private def onPartitionDeletion(partitionsToBeDeleted: Set[TopicPartition]) {
info(s"Partition deletion callback for ${partitionsToBeDeleted.mkString(",")}")
val replicasPerPartition = controllerContext.replicasForPartition(partitionsToBeDeleted)
startReplicaDeletion(replicasPerPartition)
}
private def resumeDeletions(): Unit = {
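// iterate over an immutable snapshot since completeDeleteTopic below mutates topicsToBeDeleted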
val topicsQueuedForDeletion = Set.empty[String] ++ topicsToBeDeleted
if (topicsQueuedForDeletion.nonEmpty)
info(s"Handling deletion for topics ${topicsQueuedForDeletion.mkString(",")}")
topicsQueuedForDeletion.foreach { topic =>
// if all replicas are marked as deleted successfully, then topic deletion is done
if (controller.replicaStateMachine.areAllReplicasForTopicDeleted(topic)) {
// clear up all state for this topic from controller cache and zookeeper
completeDeleteTopic(topic)
info(s"Deletion of topic $topic successfully completed")
} else {
if (controller.replicaStateMachine.isAtLeastOneReplicaInDeletionStartedState(topic)) {
// ignore since topic deletion is in progress
val replicasInDeletionStartedState = controller.replicaStateMachine.replicasInState(topic, ReplicaDeletionStarted)
val replicaIds = replicasInDeletionStartedState.map(_.replica)
val partitions = replicasInDeletionStartedState.map(_.topicPartition)
info(s"Deletion for replicas ${replicaIds.mkString(",")} for partition ${partitions.mkString(",")} of topic $topic in progress")
} else {
// if you come here, then no replica is in ReplicaDeletionStarted state and not all replicas are in
// ReplicaDeletionSuccessful state. That means deletion either hasn't been initiated for the given topic
// or there is at least one failed replica (which means topic deletion should be retried).
if (controller.replicaStateMachine.isAnyReplicaInState(topic, ReplicaDeletionIneligible)) {
// mark topic for deletion retry
markTopicForDeletionRetry(topic)
}
}
}
// Try delete topic if it is eligible for deletion.
if (isTopicEligibleForDeletion(topic)) {
info(s"Deletion of topic $topic (re)started")
// topic deletion will be kicked off
onTopicDeletion(Set(topic))
} else if (isTopicIneligibleForDeletion(topic)) {
info(s"Not retrying deletion of topic $topic at this time since it is marked ineligible for deletion")
}
}
}
}