/**
* Copyright (c) 2019 - 2024 StreamNative, Inc. All Rights Reserved.
*/
/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.streamnative.pulsar.handlers.kop.proxy;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.kafka.common.message.DeleteRecordsRequestData.DeleteRecordsTopic;
import static org.apache.kafka.common.message.DescribeClusterResponseData.DescribeClusterBroker;
import static org.apache.kafka.common.message.DescribeClusterResponseData.DescribeClusterBrokerCollection;
import static org.apache.kafka.common.message.FetchRequestData.FetchTopic;
import static org.apache.kafka.common.message.FetchResponseData.FetchableTopicResponse;
import static org.apache.kafka.common.message.ListOffsetsRequestData.ListOffsetsPartition;
import static org.apache.kafka.common.message.ListOffsetsRequestData.ListOffsetsTopic;
import static org.apache.kafka.common.message.ListOffsetsResponseData.ListOffsetsPartitionResponse;
import static org.apache.kafka.common.message.ListOffsetsResponseData.ListOffsetsTopicResponse;
import static org.apache.kafka.common.message.ProduceRequestData.TopicProduceData;
import static org.apache.kafka.common.message.ProduceRequestData.TopicProduceDataCollection;
import static org.apache.kafka.common.message.ProduceResponseData.PartitionProduceResponse;
import static org.apache.kafka.common.message.ProduceResponseData.TopicProduceResponse;
import static org.apache.kafka.common.requests.ProduceResponse.PartitionResponse;
import static org.apache.kafka.common.requests.ProduceResponse.RecordError;
import com.github.benmanes.caffeine.cache.Cache;
import com.google.common.annotations.VisibleForTesting;
import io.netty.buffer.ByteBuf;
import io.netty.channel.Channel;
import io.netty.channel.ChannelHandlerContext;
import io.netty.channel.ChannelInboundHandlerAdapter;
import io.netty.handler.ssl.SslHandshakeCompletionEvent;
import io.netty.util.ReferenceCountUtil;
import io.streamnative.pulsar.handlers.kop.EndPoint;
import io.streamnative.pulsar.handlers.kop.KafkaRequestHandler;
import io.streamnative.pulsar.handlers.kop.KafkaServiceConfiguration;
import io.streamnative.pulsar.handlers.kop.security.Authenticator;
import io.streamnative.pulsar.handlers.kop.security.ProxySslSaslServer;
import io.streamnative.pulsar.handlers.kop.security.Session;
import io.streamnative.pulsar.handlers.kop.security.SslAuthenticator;
import io.streamnative.pulsar.handlers.kop.utils.CoreUtils;
import io.streamnative.pulsar.handlers.kop.utils.KafkaResponseUtils;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Function;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.kafka.common.Node;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.UnsupportedVersionException;
import org.apache.kafka.common.message.DeleteRecordsRequestData;
import org.apache.kafka.common.message.FetchRequestData;
import org.apache.kafka.common.message.FetchResponseData;
import org.apache.kafka.common.message.FindCoordinatorResponseData;
import org.apache.kafka.common.message.ListOffsetsResponseData;
import org.apache.kafka.common.message.MetadataResponseData;
import org.apache.kafka.common.message.ProduceRequestData;
import org.apache.kafka.common.message.ProduceResponseData;
import org.apache.kafka.common.message.SaslAuthenticateRequestData;
import org.apache.kafka.common.message.SaslHandshakeRequestData;
import org.apache.kafka.common.protocol.ApiKeys;
import org.apache.kafka.common.protocol.Errors;
import org.apache.kafka.common.requests.AddOffsetsToTxnResponse;
import org.apache.kafka.common.requests.AddPartitionsToTxnResponse;
import org.apache.kafka.common.requests.DeleteRecordsRequest;
import org.apache.kafka.common.requests.DeleteRecordsResponse;
import org.apache.kafka.common.requests.DescribeClusterResponse;
import org.apache.kafka.common.requests.EndTxnResponse;
import org.apache.kafka.common.requests.FetchRequest;
import org.apache.kafka.common.requests.FetchResponse;
import org.apache.kafka.common.requests.FindCoordinatorRequest;
import org.apache.kafka.common.requests.FindCoordinatorResponse;
import org.apache.kafka.common.requests.HeartbeatResponse;
import org.apache.kafka.common.requests.InitProducerIdResponse;
import org.apache.kafka.common.requests.JoinGroupResponse;
import org.apache.kafka.common.requests.KopResponseUtils;
import org.apache.kafka.common.requests.LeaveGroupResponse;
import org.apache.kafka.common.requests.ListOffsetsRequest;
import org.apache.kafka.common.requests.ListOffsetsResponse;
import org.apache.kafka.common.requests.MetadataResponse;
import org.apache.kafka.common.requests.OffsetCommitResponse;
import org.apache.kafka.common.requests.OffsetDeleteResponse;
import org.apache.kafka.common.requests.OffsetFetchResponse;
import org.apache.kafka.common.requests.ProduceRequest;
import org.apache.kafka.common.requests.ProduceResponse;
import org.apache.kafka.common.requests.RequestHeader;
import org.apache.kafka.common.requests.SaslAuthenticateRequest;
import org.apache.kafka.common.requests.SaslHandshakeRequest;
import org.apache.kafka.common.requests.SyncGroupResponse;
import org.apache.kafka.common.requests.TxnOffsetCommitResponse;
import org.apache.kafka.common.security.auth.SecurityProtocol;
import org.apache.pulsar.broker.authentication.AuthenticationProviderTls;
import org.apache.pulsar.common.util.FutureUtil;
import org.apache.pulsar.common.util.Murmur3_32Hash;
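/**
 * Netty inbound handler for the KoP proxy. It decodes Kafka requests from clients, rewrites
 * metadata so that the proxy itself is advertised as the only broker and controller, and forwards
 * each request to the appropriate backing broker (metadata broker, partition leader, group
 * coordinator or transaction coordinator) through a {@link BrokerConnectionGroup}, splitting and
 * merging requests when the involved partitions are spread over several brokers.
 */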
@Slf4j
public class KafkaProxyRequestHandler extends ChannelInboundHandlerAdapter {
private final LinkedBlockingQueue<InflightRequest> requestQueue = new LinkedBlockingQueue<>(500);
private final AtomicBoolean isActive = new AtomicBoolean(false);
private final Node selfNode;
private final List<Integer> replicaIds;
private final BrokerConnectionGroup connectionGroup;
@VisibleForTesting
final Cache<TopicPartition, InetSocketAddress> leaderCache;
private ChannelHandlerContext ctx;
private Authenticator authenticator;
public KafkaProxyRequestHandler(final EndPoint advertisedEndPoint,
final ConnectionFactory connectionFactory,
final boolean isClientAuth,
final Cache<TopicPartition, InetSocketAddress> leaderCache) {
this.selfNode = new Node(Murmur3_32Hash.getInstance().makeHash(
(advertisedEndPoint.getHostname() + advertisedEndPoint.getPort()).getBytes(UTF_8)
), advertisedEndPoint.getHostname(), advertisedEndPoint.getPort());
this.replicaIds = Collections.singletonList(selfNode.id());
this.connectionGroup = new BrokerConnectionGroup(connectionFactory);
if (advertisedEndPoint.getSecurityProtocol().equals(SecurityProtocol.SSL) && isClientAuth) {
AuthenticationProviderTls authenticationProviderTls = new AuthenticationProviderTls();
this.authenticator = new SslAuthenticator(authenticationProviderTls, new KafkaServiceConfiguration());
}
this.leaderCache = leaderCache;
}
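/**
 * Entry point for every Kafka request received from the client. Each request that expects a
 * response (everything except PRODUCE with acks=0) is queued so responses can be written back in
 * the original request order, then the request is dispatched to a handler based on its API key.
 */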
@Override
public void channelRead(final ChannelHandlerContext ctx, final Object msg) throws Exception {
final var buf = (ByteBuf) msg;
KafkaProxyExtension.BYTES_COUNTER.inc(buf.readableBytes());
try {
final var channel = ctx.channel();
final var inflightRequest = new InflightRequest(buf, channel.remoteAddress());
if (log.isDebugEnabled()) {
log.debug("[{}] Received kafka cmd {}", channel, inflightRequest);
}
final var apiKeys = inflightRequest.getHeader().apiKey();
inflightRequest.registerCallback(() -> flush(channel), ctx.executor());
KafkaProxyExtension.OPS_COUNTER.inc();
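// PRODUCE with acks=0 expects no response, so it is not added to the response ordering queue.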
if (!ApiKeys.PRODUCE.equals(apiKeys) || ((ProduceRequest) inflightRequest.getRequest()).acks() != 0) {
requestQueue.put(inflightRequest);
}
// TODO: *_GROUPS requests might need to be split for multiple group coordinators
switch (apiKeys) {
case API_VERSIONS -> handleApiVersions(inflightRequest);
case METADATA -> handleMetadata(inflightRequest);
case PRODUCE -> handleProduce(inflightRequest);
case FIND_COORDINATOR -> handleFindCoordinator(inflightRequest);
case JOIN_GROUP, SYNC_GROUP, LEAVE_GROUP, OFFSET_FETCH, OFFSET_COMMIT, HEARTBEAT, OFFSET_DELETE,
TXN_OFFSET_COMMIT -> handleGroupRequest(apiKeys, inflightRequest);
case LIST_OFFSETS -> handleListOffsets(inflightRequest);
case FETCH -> handleFetch(inflightRequest);
case SASL_HANDSHAKE, SASL_AUTHENTICATE -> connectionGroup.authenticate(inflightRequest);
case CREATE_TOPICS, DELETE_TOPICS, DESCRIBE_CONFIGS, ALTER_CONFIGS,
LIST_GROUPS, DELETE_GROUPS, DESCRIBE_GROUPS ->
connectionGroup.getMetadataBroker().forwardRequest(inflightRequest);
case DESCRIBE_CLUSTER -> handleDescribeCluster(inflightRequest);
case DELETE_RECORDS -> handleDeleteRecords(inflightRequest);
case INIT_PRODUCER_ID, ADD_PARTITIONS_TO_TXN, ADD_OFFSETS_TO_TXN, END_TXN ->
handleTxnRequest(apiKeys, inflightRequest);
case WRITE_TXN_MARKERS -> throw new IllegalStateException(apiKeys + " should be handled in broker");
default -> inflightRequest.complete(inflightRequest.getRequest().getErrorResponse(
new UnsupportedVersionException("API " + apiKeys + " is not supported")));
}
} catch (IOException e) {
log.warn("{}", e.getMessage());
close(ctx);
} catch (Throwable throwable) {
log.error("[{}] Unexpected exception when handling request", ctx.channel(), throwable);
close(ctx);
} finally {
ReferenceCountUtil.safeRelease(buf);
}
}
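/**
 * On a successful TLS handshake, runs the SSL authenticator and, if a principal was established,
 * buffers synthetic SASL_HANDSHAKE and SASL_AUTHENTICATE requests carrying that principal in the
 * connection group, so it can be presented when broker connections are authenticated.
 */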
@Override
public void userEventTriggered(ChannelHandlerContext ctx, Object evt) throws Exception {
// Handle ssl handshake completion event
if (evt instanceof SslHandshakeCompletionEvent) {
if (((SslHandshakeCompletionEvent) evt).isSuccess()) {
if (this.authenticator != null && this.authenticator instanceof SslAuthenticator) {
this.authenticator.authenticate(ctx,
null,
null,
null,
null);
Session session = this.authenticator.session();
if (session != null && session.getPrincipal() != null && session.getPrincipal().getName() != null) {
InflightRequest saslHandshake = newSaslHandshake(ProxySslSaslServer.PROXY_SSL_MECHANISM);
connectionGroup.addSaslRequestBuffer(saslHandshake);
InflightRequest saslAuthenticate = newSaslAuthenticate(
session.getPrincipal().getName().getBytes(UTF_8));
connectionGroup.addSaslRequestBuffer(saslAuthenticate);
}
}
}
} else {
super.userEventTriggered(ctx, evt);
}
}
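/**
 * Builds a synthetic SASL_HANDSHAKE request (with a fake client address), used together with
 * {@link #newSaslAuthenticate} to carry the TLS-authenticated principal to broker connections.
 */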
private static InflightRequest newSaslHandshake(final String mechanism) {
var request = new SaslHandshakeRequest.Builder(new SaslHandshakeRequestData()
.setMechanism(mechanism)).build();
ByteBuf byteBuf = KopResponseUtils.serializeRequestToPooledBuffer(new RequestHeader(ApiKeys.SASL_HANDSHAKE,
(short) 1, "kop-proxy", 0), request);
// fake address
InetSocketAddress localhost = InetSocketAddress.createUnresolved("localhost", 65535);
return new InflightRequest(byteBuf, localhost);
}
private static InflightRequest newSaslAuthenticate(final byte[] saslAuthBytes) {
var request = new SaslAuthenticateRequest
.Builder(new SaslAuthenticateRequestData().setAuthBytes(saslAuthBytes))
.build();
ByteBuf byteBuf = KopResponseUtils.serializeRequestToPooledBuffer(new RequestHeader(ApiKeys.SASL_AUTHENTICATE,
(short) 2, "kop-proxy", 1), request);
// fake address
InetSocketAddress localhost = InetSocketAddress.createUnresolved("localhost", 65535);
return new InflightRequest(byteBuf, localhost);
}
@Override
public void channelActive(ChannelHandlerContext ctx) throws Exception {
super.channelActive(ctx);
this.ctx = ctx;
this.connectionGroup.setClientChannel(ctx);
isActive.set(true);
KafkaProxyExtension.ACTIVE_CONNECTIONS.inc();
KafkaProxyExtension.NEW_CONNECTIONS.inc();
}
@Override
public void channelInactive(final ChannelHandlerContext ctx) throws Exception {
super.channelInactive(ctx);
log.info("close channel {}", ctx.channel());
KafkaProxyExtension.ACTIVE_CONNECTIONS.dec();
connectionGroup.close();
}
@Override
public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) throws Exception {
log.error("[{}] Unexpected exception", ctx.channel(), cause);
close(ctx);
}
private void close(final ChannelHandlerContext ctx) {
if (isActive.compareAndSet(true, false)) {
ctx.close();
if (!requestQueue.isEmpty()) {
log.info("[{}] Close with {} pending requests", ctx, requestQueue.size());
}
requestQueue.clear();
connectionGroup.close();
}
}
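/**
 * Drains completed requests from the head of the queue and writes their responses to the client,
 * preserving the order in which the requests were received. Stops at the first request that has
 * not received its response yet.
 */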
private void flush(final Channel channel) {
while (isActive.get()) {
final var inflightRequest = requestQueue.peek();
if (inflightRequest == null) {
break;
}
if (!inflightRequest.hasReceivedResponse()) {
break;
} else if (!requestQueue.remove(inflightRequest)) { // it has been removed by another thread
continue;
}
if (inflightRequest.hasFailed(e -> {
if (e instanceof ConnectionToBroker.ConnectError) {
log.warn("[{}] {} failed with {}", channel, inflightRequest.getHeader(), e.getMessage());
} else {
log.error("[{}] request {} completed exceptionally", channel, inflightRequest.getHeader(), e);
}
close(ctx);
})) {
return;
}
final var buf = inflightRequest.toResponseBuf();
if (log.isDebugEnabled()) {
log.debug("[{}] Write kafka cmd to client ({} requests left): {}",
channel, requestQueue.size(), inflightRequest.getHeader());
}
channel.writeAndFlush(buf).addListener(future -> {
if (!future.isSuccess()) {
log.error("[{}] Failed to write {}", channel, inflightRequest.getHeader(), future.cause());
}
});
}
}
private void handleApiVersions(final InflightRequest inflightRequest) {
short version = inflightRequest.getHeader().apiVersion();
inflightRequest.complete(KafkaRequestHandler.overloadDefaultApiVersionsResponse(
!ApiKeys.API_VERSIONS.isVersionSupported(version)));
}
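/**
 * Forwards the METADATA request to the metadata broker and rewrites the response: the real
 * partition leaders are stored in {@code leaderCache}, while the client only sees the proxy node
 * as leader, replica, ISR member and controller for every partition.
 */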
private void handleMetadata(final InflightRequest inflightRequest) throws IOException {
inflightRequest.setResponseMapper(originalResponse -> {
final var metadataResponse = (MetadataResponse) originalResponse;
final var data = metadataResponse.data();
final var brokers = data.brokers();
if (log.isDebugEnabled()) {
final var leaderMap = new HashMap<TopicPartition, String>();
data.topics().forEach(topic -> topic.partitions().forEach(partition -> {
final var topicPartition = new TopicPartition(topic.name(), partition.partitionIndex());
final var broker = brokers.find(partition.leaderId());
if (broker != null) {
leaderMap.put(topicPartition, broker.host() + ":" + broker.port());
} else {
leaderMap.put(topicPartition, Errors.forCode(partition.errorCode()).message());
}
}));
log.debug("[{}] MetadataResponse: {}", inflightRequest.getHeader(), leaderMap);
}
data.topics().forEach(topic -> {
final String topicName = topic.name();
topic.partitions().forEach(partition -> {
final var broker = brokers.find(partition.leaderId());
if (broker != null) {
leaderCache.put(new TopicPartition(topicName, partition.partitionIndex()),
InetSocketAddress.createUnresolved(broker.host(), broker.port()));
}
partition.setLeaderId(selfNode.id());
partition.setReplicaNodes(replicaIds);
partition.setIsrNodes(replicaIds);
});
});
data.setControllerId(selfNode.id());
brokers.clear();
brokers.add(new MetadataResponseData.MetadataResponseBroker().setNodeId(selfNode.id())
.setHost(selfNode.host()).setPort(selfNode.port()));
return metadataResponse;
});
connectionGroup.getMetadataBroker().forwardRequest(inflightRequest);
}
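/**
 * Routes a PRODUCE request to the partition leaders recorded in {@code leaderCache}. If all
 * partitions belong to a single leader the request is forwarded as-is; otherwise one request per
 * leader is created and the partial responses are merged before replying to the client.
 * Partitions without a cached leader are answered with NOT_LEADER_OR_FOLLOWER.
 */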
@VisibleForTesting
void handleProduce(final InflightRequest inflightRequest) throws IOException {
final var request = (ProduceRequest) inflightRequest.getRequest();
final var errorsMap = new HashMap<TopicPartition, Errors>();
final var partitionDataMap = new HashMap<InetSocketAddress, Map<String, TopicProduceData>>();
for (var topicData : request.data().topicData()) {
final var topic = topicData.name();
for (var partitionData : topicData.partitionData()) {
final var topicPartition = new TopicPartition(topic, partitionData.index());
final var leader = leaderCache.getIfPresent(topicPartition);
if (leader == null) {
errorsMap.put(topicPartition, Errors.NOT_LEADER_OR_FOLLOWER);
continue;
}
partitionDataMap.computeIfAbsent(leader, __ -> new HashMap<>())
.computeIfAbsent(topic, __ -> new TopicProduceData().setName(topic))
.partitionData().add(partitionData);
}
}
if (partitionDataMap.isEmpty()) {
log.warn("No leader found for {}", inflightRequest.getHeader());
inflightRequest.complete(createProduceResponse(errorsMap));
return;
}
final Function<InetSocketAddress, Optional<ConnectionToBroker>> getLeader = address -> {
try {
return Optional.of(connectionGroup.getLeader(address));
} catch (IOException e) {
log.warn("[{}] Failed to connect to leader {}: {}", ctx, address, e.getMessage());
Optional.ofNullable(partitionDataMap.get(address)).ifPresent(map -> map.forEach((topic, data) ->
data.partitionData().stream().map(__ -> new TopicPartition(topic, __.index()))
.forEach(topicPartition -> {
leaderCache.invalidate(topicPartition);
errorsMap.put(topicPartition, Errors.NOT_LEADER_OR_FOLLOWER);
})));
return Optional.empty();
}
};
final boolean cacheRequest = request.acks() != 0;
// If there is only 1 broker to send, forward the request directly. Otherwise, N Produce requests need to be
// created to N brokers.
if (errorsMap.isEmpty() && partitionDataMap.size() == 1) {
inflightRequest.setResponseMapper(originalResponse -> {
if (errorsMap.isEmpty()) {
return originalResponse;
}
if (originalResponse == null) {
return createProduceResponse(errorsMap);
}
final var produceResponse = (ProduceResponse) originalResponse;
return createProduceResponse(errorsMap, produceResponse.data());
});
getLeader.apply(partitionDataMap.keySet().iterator().next()).ifPresentOrElse(leader -> {
// When errorsMap is not empty, the response needs to be merged with errorsMap
inflightRequest.setSkipParsingResponse(errorsMap.isEmpty());
leader.forwardRequest(inflightRequest, cacheRequest);
}, () -> inflightRequest.complete(null));
} else {
final var responseFutures = new ArrayList<CompletableFuture<ProduceResponse>>();
partitionDataMap.forEach((address, topicDataMap) -> {
getLeader.apply(address).ifPresent(connection -> {
final var singleRequest = new ProduceRequest(new ProduceRequestData().setAcks(request.acks())
.setTimeoutMs(request.timeout()).setTransactionalId(request.transactionalId())
.setTopicData(new TopicProduceDataCollection(topicDataMap.values().iterator())),
request.version());
final var buf = KopResponseUtils.serializeRequestToPooledBuffer(
inflightRequest.getHeader(), singleRequest);
final var singleInflightRequest = new InflightRequest(
buf, inflightRequest.getRemoteAddress(), false);
responseFutures.add(singleInflightRequest.getResponseFuture());
connection.forwardRequest(singleInflightRequest);
});
});
FutureUtil.waitForAll(responseFutures).thenAccept(__ -> {
final var map = CoreUtils.mapValue(errorsMap, PartitionResponse::new);
responseFutures.stream().map(CompletableFuture::join).forEach(singleResponse -> {
singleResponse.data().responses().forEach(topicProduceResponse -> {
final var topic = topicProduceResponse.name();
topicProduceResponse.partitionResponses().forEach(r -> {
final var topicPartition = new TopicPartition(topic, r.index());
map.put(topicPartition, new PartitionResponse(Errors.forCode(r.errorCode()), r.baseOffset(),
r.logAppendTimeMs(), r.logStartOffset(), r.recordErrors().stream().map(e ->
new RecordError(e.batchIndex(), e.batchIndexErrorMessage())).toList(),
r.errorMessage()));
});
});
});
if (log.isDebugEnabled()) {
log.debug("[{}] ProduceResponse: {}", inflightRequest.getHeader(), CoreUtils.mapValue(map,
r -> r.error));
}
inflightRequest.complete(new ProduceResponse(map));
}).exceptionally(e -> {
log.error("[{}] Failed to wait for the produce responses", ctx, e);
close(ctx);
return null;
});
}
}
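/** Builds a ProduceResponse that reports the given per-partition errors, optionally on top of an existing response body. */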
private static ProduceResponse createProduceResponse(final Map<TopicPartition, Errors> errorsMap) {
return createProduceResponse(errorsMap, new ProduceResponseData());
}
private static ProduceResponse createProduceResponse(final Map<TopicPartition, Errors> errorsMap,
final ProduceResponseData responseData) {
errorsMap.forEach((topicPartition, errors) -> {
final var topic = topicPartition.topic();
var topicProduceResponse = responseData.responses().find(topic);
if (topicProduceResponse == null) {
topicProduceResponse = new TopicProduceResponse().setName(topic);
responseData.responses().add(topicProduceResponse);
}
topicProduceResponse.partitionResponses().add(new PartitionProduceResponse()
.setErrorCode(errors.code()).setIndex(topicPartition.partition()));
});
return new ProduceResponse(responseData);
}
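/**
 * Forwards FIND_COORDINATOR to the metadata broker so the coordinator lookup still happens on the
 * broker side, but always reports the proxy itself as the coordinator in the response.
 */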
private void handleFindCoordinator(final InflightRequest inflightRequest) throws IOException {
final var request = (FindCoordinatorRequest) inflightRequest.getRequest();
inflightRequest.setResponseMapper(__ -> {
// Ignore the original response. This request was sent just to execute the necessary operations in broker.
final var data = new FindCoordinatorResponseData();
if (request.version() < FindCoordinatorRequest.MIN_BATCHED_VERSION) {
data.setErrorCode(Errors.NONE.code()).setErrorMessage(Errors.NONE.message())
.setHost(selfNode.host()).setPort(selfNode.port()).setNodeId(selfNode.id());
} else {
final var coordinatorKeys = request.data().coordinatorKeys();
data.setCoordinators(coordinatorKeys.stream().map(key ->
new FindCoordinatorResponseData.Coordinator()
.setKey(key)
.setErrorCode(Errors.NONE.code())
.setErrorMessage(Errors.NONE.message())
.setHost(selfNode.host())
.setPort(selfNode.port())
.setNodeId(selfNode.id())
).toList());
}
return new FindCoordinatorResponse(data);
});
connectionGroup.getMetadataBroker().forwardRequest(inflightRequest);
}
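/**
 * Handles LIST_OFFSETS. Version 0 is not supported yet; for v1 and above the request is routed to
 * the cached partition leaders, splitting and merging responses when more than one leader is
 * involved.
 */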
private void handleListOffsets(final InflightRequest inflightRequest) throws IOException {
if (inflightRequest.getHeader().apiVersion() == 0) {
// TODO: handle ListOffset request v0
throw new RuntimeException("KoP proxy does not support ListOffset v0 yet");
} else {
final var request = (ListOffsetsRequest) inflightRequest.getRequest();
if (request.data().topics().size() == 0) {
inflightRequest.complete(new ListOffsetsResponse(new ListOffsetsResponseData()));
return;
}
handleListOffsetsV1OrAbove(inflightRequest, request);
}
}
private void handleListOffsetsV1OrAbove(final InflightRequest originalRequest, final ListOffsetsRequest request)
throws IOException {
final var errorsMap = new HashMap<TopicPartition, Errors>();
final var leaderToOffsetData = new HashMap<InetSocketAddress, Map<String, List<ListOffsetsPartition>>>();
request.data().topics().forEach(topic -> {
topic.partitions().forEach(partitionData -> {
final var topicPartition = new TopicPartition(topic.name(), partitionData.partitionIndex());
final var leader = leaderCache.getIfPresent(topicPartition);
if (leader == null) {
errorsMap.put(topicPartition, Errors.UNKNOWN_TOPIC_OR_PARTITION);
return;
}
leaderToOffsetData.computeIfAbsent(leader, __ -> new HashMap<>())
.computeIfAbsent(topic.name(), __ -> new ArrayList<>()).add(partitionData);
});
});
if (leaderToOffsetData.size() == 1) {
final var leader = leaderToOffsetData.keySet().iterator().next();
originalRequest.setSkipParsingResponse(true);
connectionGroup.getLeader(leader).forwardRequest(originalRequest);
} else {
final var responseFutures = new ArrayList<CompletableFuture<ListOffsetsResponse>>();
leaderToOffsetData.forEach((leader, offsetData) -> {
try {
final var connection = connectionGroup.getLeader(leader);
final var targetTimes = offsetData.entrySet().stream().map(e -> new ListOffsetsTopic()
.setName(e.getKey()).setPartitions(e.getValue())).toList();
final var singleRequest = ListOffsetsRequest.Builder
.forConsumer(true, request.isolationLevel(), false)
.setTargetTimes(targetTimes)
.build(request.version());
final var buf = KopResponseUtils.serializeRequestToPooledBuffer(
originalRequest.getHeader(), singleRequest);
final var singleInflightRequest = new InflightRequest(
buf, originalRequest.getRemoteAddress(), false);
responseFutures.add(singleInflightRequest.getResponseFuture());
connection.forwardRequest(singleInflightRequest);
} catch (IOException e) {
log.warn("[{}] Failed to connect to leader {}: {}", ctx, leader, e.getMessage());
offsetData.forEach((topic, partitions) -> partitions.forEach(partition -> {
final var topicPartition = new TopicPartition(topic, partition.partitionIndex());
errorsMap.put(topicPartition, Errors.UNKNOWN_TOPIC_OR_PARTITION);
leaderCache.invalidate(topicPartition);
}));
}
});
FutureUtil.waitForAll(responseFutures).thenAccept(__ -> {
final var topicMap = new HashMap<String, List<ListOffsetsPartitionResponse>>();
responseFutures.stream().map(CompletableFuture::join).forEach(response -> {
response.data().topics().forEach(topic -> {
final var partitions = topicMap.get(topic.name());
if (partitions == null) {
topicMap.put(topic.name(), topic.partitions());
} else {
partitions.addAll(topic.partitions());
}
});
});
errorsMap.forEach((topicPartition, errors) -> topicMap.computeIfAbsent(topicPartition.topic(),
topic -> new ArrayList<>()
).add(new ListOffsetsPartitionResponse().setErrorCode(errors.code())));
final var data = topicMap.entrySet().stream().map(e -> new ListOffsetsTopicResponse()
.setName(e.getKey())
.setPartitions(e.getValue())).toList();
originalRequest.complete(new ListOffsetsResponse(new ListOffsetsResponseData().setTopics(data)));
});
}
}
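/**
 * Handles FETCH. Partitions are grouped by their cached leader; a single-leader request is
 * forwarded directly, otherwise one FETCH per leader is sent and the raw response buffers are kept
 * alive until the merged response has been serialized, because the record batches are slices of
 * those buffers.
 */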
private void handleFetch(final InflightRequest inflightRequest) throws IOException {
final var request = (FetchRequest) inflightRequest.getRequest();
final var errorsMap = new HashMap<TopicPartition, Errors>();
final var fetchPartitionMap = new HashMap<InetSocketAddress, Map<String, FetchTopic>>();
request.data().topics().forEach(fetchTopic -> {
final var topic = fetchTopic.topic();
fetchTopic.partitions().forEach(fetchPartition -> {
final var topicPartition = new TopicPartition(topic, fetchPartition.partition());
final var leader = leaderCache.getIfPresent(topicPartition);
if (leader == null) {
errorsMap.put(topicPartition, Errors.NOT_LEADER_OR_FOLLOWER);
return;
}
fetchPartitionMap.computeIfAbsent(leader, __ -> new HashMap<>()).computeIfAbsent(topic, __ ->
new FetchTopic().setTopicId(fetchTopic.topicId())
.setTopic(topic)
).partitions().add(fetchPartition);
});
});
if (fetchPartitionMap.isEmpty()) {
log.warn("No leader found for {}", inflightRequest.getRequest());
inflightRequest.complete(createFetchResponse(errorsMap, new FetchResponseData()));
return;
}
if (fetchPartitionMap.size() == 1) {
final var leader = fetchPartitionMap.keySet().iterator().next();
inflightRequest.setSkipParsingResponse(true);
connectionGroup.getLeader(leader).forwardRequest(inflightRequest);
} else {
final var responseFutures = new ArrayList<CompletableFuture<ByteBuf>>();
fetchPartitionMap.forEach((leader, fetchTopics) -> {
try {
final var connection = connectionGroup.getLeader(leader);
final var singleRequest = new FetchRequest(new FetchRequestData().setMaxWaitMs(request.maxWait())
.setMaxBytes(request.maxBytes()).setMinBytes(request.minBytes())
.setIsolationLevel(request.isolationLevel().id())
.setSessionEpoch(request.metadata().epoch()).setSessionId(request.metadata().sessionId())
.setReplicaId(request.replicaId()).setRackId(request.rackId())
.setTopics(fetchTopics.values().stream().toList()), request.version());
final var singleInflightRequest = new InflightRequest(
KopResponseUtils.serializeRequestToPooledBuffer(inflightRequest.getHeader(), singleRequest),
inflightRequest.getRemoteAddress(), false);
// The records buffer of a FetchResponse is only valid when the original buffer is valid, so here
// we cannot parse the buffer into a FetchResponse in ConnectionBroker because the buffer will be
// released after the response future is completed.
singleInflightRequest.setSkipParsingResponse(true);
responseFutures.add(singleInflightRequest.getResponseFuture());
connection.forwardRequest(singleInflightRequest);
} catch (IOException e) {
log.warn("[{}] Failed to connect to leader {}: {}", ctx, leader, e.getMessage());
fetchTopics.values().forEach(fetchTopic -> {
fetchTopic.partitions().forEach(partition -> {
final var topicPartition = new TopicPartition(fetchTopic.topic(), partition.partition());
errorsMap.put(topicPartition, Errors.NOT_LEADER_OR_FOLLOWER);
leaderCache.invalidate(topicPartition);
});
});
}
});
FutureUtil.waitForAll(responseFutures).thenAccept(__ -> {
// The records fields are slices of `buf`, so we have to delay the release after the FETCH response is
// serialized to a new allocated buffer.
final var buffersToRelease = responseFutures.stream().map(CompletableFuture::join).toList();
final var map = new HashMap<String, FetchableTopicResponse>();
buffersToRelease.forEach(buf -> {
final var fetchResponse = (FetchResponse) FetchResponse.parseResponse(buf.nioBuffer(),
inflightRequest.getHeader());
fetchResponse.data().responses().forEach(topic -> {
final var topicResponse = map.get(topic.topic());
if (topicResponse == null) {
map.put(topic.topic(), new FetchableTopicResponse().setTopicId(topic.topicId())
.setTopic(topic.topic()).setPartitions(topic.partitions()));
return;
}
topicResponse.partitions().addAll(topic.partitions());
});
});
final var data = new FetchResponseData().setResponses(map.values().stream().toList())
.setSessionId(request.metadata().sessionId());
inflightRequest.complete(Pair.of(createFetchResponse(errorsMap, data), buffersToRelease));
});
}
}
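/** Adds the given per-partition errors to {@code responseData} and wraps it in a FetchResponse. */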
private static FetchResponse createFetchResponse(final Map<TopicPartition, Errors> errorsMap,
final FetchResponseData responseData) {
errorsMap.forEach((topicPartition, errors) -> {
final var topic = topicPartition.topic();
var topicResponse = responseData.responses().stream().filter(__ -> __.topic().equals(topic))
.findFirst().orElse(null);
if (topicResponse == null) {
topicResponse = new FetchableTopicResponse().setTopic(topic);
responseData.responses().add(topicResponse);
}
var partitionResponse = topicResponse.partitions().stream().filter(__ ->
__.partitionIndex() == topicPartition.partition()).findFirst().orElse(null);
if (partitionResponse == null) {
partitionResponse = new FetchResponseData.PartitionData().setPartitionIndex(topicPartition.partition())
.setErrorCode(errors.code());
topicResponse.partitions().add(partitionResponse);
}
});
return new FetchResponse(responseData);
}
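/**
 * Handles DELETE_RECORDS by routing each partition to its cached leader, splitting the request
 * across leaders when necessary and merging the per-partition error codes into a single response.
 */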
private void handleDeleteRecords(final InflightRequest inflightRequest) throws IOException {
final var request = (DeleteRecordsRequest) inflightRequest.getRequest();
final var deleteRecordsMap = new HashMap<InetSocketAddress, Map<String, DeleteRecordsTopic>>();
final var errorsMap = new HashMap<TopicPartition, Errors>();
request.data().topics().forEach(topic -> topic.partitions().forEach(partition -> {
final var topicPartition = new TopicPartition(topic.name(), partition.partitionIndex());
final var leader = leaderCache.getIfPresent(topicPartition);
if (leader == null) {
errorsMap.put(topicPartition, Errors.NOT_LEADER_OR_FOLLOWER);
return;
}
deleteRecordsMap.computeIfAbsent(leader, __ -> new HashMap<>()).computeIfAbsent(topic.name(), __ ->
new DeleteRecordsTopic().setName(__)).partitions().add(partition);
}));
if (deleteRecordsMap.size() == 1) {
inflightRequest.setResponseMapper(originalResponse -> {
final var response = (DeleteRecordsResponse) originalResponse;
response.data().topics().forEach(topic -> topic.partitions().forEach(partition -> errorsMap.put(
new TopicPartition(topic.name(), partition.partitionIndex()),
Errors.forCode(partition.errorCode()))));
return KafkaResponseUtils.newDeleteRecords(errorsMap);
});
connectionGroup.getLeader(deleteRecordsMap.keySet().iterator().next()).forwardRequest(inflightRequest);
} else {
final var responseFutures = new ArrayList<CompletableFuture<DeleteRecordsResponse>>();
deleteRecordsMap.forEach((leader, topics) -> {
try {
final var connection = connectionGroup.getLeader(leader);
final var singleRequest = new DeleteRecordsRequest.Builder(new DeleteRecordsRequestData()
.setTimeoutMs(request.data().timeoutMs()).setTopics(topics.values().stream().toList())
).build(request.version());
final var singleInflightRequest = new InflightRequest(
KopResponseUtils.serializeRequestToPooledBuffer(inflightRequest.getHeader(), singleRequest),
inflightRequest.getRemoteAddress());
final var responseFuture = new CompletableFuture<DeleteRecordsResponse>();
responseFutures.add(responseFuture);
singleInflightRequest.setResponseMapper(response -> {
responseFuture.complete((DeleteRecordsResponse) response);
return response;
});
connection.forwardRequest(singleInflightRequest);
} catch (IOException e) {
log.warn("[{}] Failed to connect to leader {}: {}", ctx, leader, e.getMessage());
topics.values().forEach(topic -> topic.partitions().forEach(partition -> {
final var topicPartition = new TopicPartition(topic.name(), partition.partitionIndex());
errorsMap.put(topicPartition, Errors.NOT_LEADER_OR_FOLLOWER);
leaderCache.invalidate(topicPartition);
}));
}
});
FutureUtil.waitForAll(responseFutures).thenAccept(__ -> {
responseFutures.stream().map(CompletableFuture::join).forEach(singleResponse -> {
singleResponse.data().topics().forEach(topic -> topic.partitions().forEach(partition ->
errorsMap.put(new TopicPartition(topic.name(), partition.partitionIndex()),
Errors.forCode(partition.errorCode()))
));
});
inflightRequest.complete(KafkaResponseUtils.newDeleteRecords(errorsMap));
}).exceptionally(e -> {
log.error("[{}] Failed to wait for the delete records responses", ctx, e);
close(ctx);
return null;
});
}
}
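/**
 * Forwards DESCRIBE_CLUSTER to the metadata broker and rewrites the response so that the proxy is
 * reported as the only broker and as the controller.
 */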
private void handleDescribeCluster(final InflightRequest inflightRequest) throws IOException {
inflightRequest.setResponseMapper(response -> {
final var clusterResponse = (DescribeClusterResponse) response;
clusterResponse.data().setControllerId(selfNode.id());
clusterResponse.data().setBrokers(new DescribeClusterBrokerCollection(Collections.singletonList(
new DescribeClusterBroker().setBrokerId(selfNode.id()).setHost(selfNode.host())
.setPort(selfNode.port())).iterator()));
return response;
});
connectionGroup.getMetadataBroker().forwardRequest(inflightRequest);
}
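/**
 * Forwards group-related requests to the group coordinator connection for the request's group id.
 * If the broker answers NOT_COORDINATOR, the cached coordinator connection is disconnected so a
 * new coordinator can be discovered on the next request.
 */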
private void handleGroupRequest(final ApiKeys apiKeys, final InflightRequest inflightRequest) throws IOException {
final var groupId = inflightRequest.groupId();
inflightRequest.setResponseMapper(response -> {
final var error = switch (apiKeys) {
case JOIN_GROUP -> ((JoinGroupResponse) response).error();
case SYNC_GROUP -> ((SyncGroupResponse) response).error();
case LEAVE_GROUP -> ((LeaveGroupResponse) response).error();
case OFFSET_FETCH -> ((OffsetFetchResponse) response).error();
case OFFSET_COMMIT -> ((OffsetCommitResponse) response).errorCounts().keySet().stream()
.filter(__ -> !__.equals(Errors.NONE)).findFirst().orElse(Errors.NONE);
case HEARTBEAT -> ((HeartbeatResponse) response).error();
case OFFSET_DELETE -> Errors.forCode(((OffsetDeleteResponse) response).data().errorCode());
case TXN_OFFSET_COMMIT -> ((TxnOffsetCommitResponse) response).errors().values().stream()
.filter(__ -> !__.equals(Errors.NONE)).findFirst().orElse(Errors.NONE);
default -> throw new IllegalStateException(apiKeys + " is not group request");
};
if (error == Errors.NOT_COORDINATOR) {
log.info("[{}] [group: {}] Disconnect the outdated group coordinator", ctx, groupId);
try {
connectionGroup.getGroupCoordinator(groupId).disconnectBroker();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
return response;
});
connectionGroup.getGroupCoordinator(groupId).forwardRequest(inflightRequest);
}
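/**
 * Forwards transactional requests to the transaction coordinator connection for the request's
 * transactional id, disconnecting the cached coordinator when the broker answers NOT_COORDINATOR.
 */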
private void handleTxnRequest(final ApiKeys apiKeys, final InflightRequest inflightRequest) throws IOException {
final var txnId = inflightRequest.txnId();
inflightRequest.setResponseMapper(response -> {
final var error = switch (apiKeys) {
case INIT_PRODUCER_ID -> ((InitProducerIdResponse) response).error();
case ADD_PARTITIONS_TO_TXN -> ((AddPartitionsToTxnResponse) response).errors().values().stream()
.filter(__ -> !__.equals(Errors.NONE)).findFirst().orElse(Errors.NONE);
case ADD_OFFSETS_TO_TXN -> Errors.forCode(((AddOffsetsToTxnResponse) response).data().errorCode());
case END_TXN -> ((EndTxnResponse) response).error();
default -> throw new IllegalStateException(apiKeys + " is not txn request");
};
if (error == Errors.NOT_COORDINATOR) {
log.info("[{}] [txnId: {}] Disconnect the outdated transaction coordinator", ctx, txnId);
try {
connectionGroup.getTransactionCoordinator(txnId).disconnectBroker();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
return response;
});
connectionGroup.getTransactionCoordinator(txnId).forwardRequest(inflightRequest);
}
}