package io.github.shanqiang.sp.input;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import io.github.shanqiang.SystemProperty;
import io.github.shanqiang.Threads;
import io.github.shanqiang.exception.UnknownTypeException;
import io.github.shanqiang.offheap.ByteArray;
import io.github.shanqiang.sp.Delay;
import io.github.shanqiang.sp.StreamProcessing;
import io.github.shanqiang.table.TableBuilder;
import io.github.shanqiang.table.Type;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndTimestamp;
import org.apache.kafka.common.PartitionInfo;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.InterruptException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import static io.github.shanqiang.sp.input.kafka.MyKafkaConsumer.newKafkaConsumer;
import static java.util.Arrays.asList;
import static java.util.Collections.sort;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor;
import static org.apache.kafka.clients.consumer.ConsumerConfig.*;
import static org.apache.kafka.clients.producer.ProducerConfig.BOOTSTRAP_SERVERS_CONFIG;
public class KafkaStreamTable extends AbstractStreamTable {
private static final Logger logger = LoggerFactory.getLogger(KafkaStreamTable.class);
protected final Properties properties;
private final String topic;
private final long consumeFrom;
protected final long consumeTo;
protected final int myHash;
protected final int serverCount;
protected final Set<Integer> myPartitions = new HashSet<>();
private final ScheduledExecutorService partitionsDetector;
protected final List<Thread> consumers = new ArrayList<>();
private final int timeColumnIndex;
private final int receiveTimeColumnIndex;
private final List<String> stringColumns;
private final List<Type> types;
private long finishDelayMs = 30000;
private long lastUpdateMs = System.currentTimeMillis();
private final Set<Integer> partitionSet = new HashSet<>();
// partitionSet.size() is read very frequently and is relatively expensive to compute, so partitionSetSize caches its value
private int partitionSetSize;
public KafkaStreamTable(String bootstrapServers,
String consumerGroupId,
String topic,
long consumeFrom,
Map<String, Type> columnTypeMap) {
this(bootstrapServers, consumerGroupId, topic, consumeFrom, -1, columnTypeMap);
}
/**
 * The key is the time at which the Kafka client wrote the record (int, in seconds); the Kafka broker's
 * timestamp is the time the record was received, so write latency can also be estimated from the two.
 * The value is a JSON-formatted string.
 *
 * @param bootstrapServers
 * @param consumerGroupId
 * @param topic
 * @param consumeFrom
 * @param consumeTo
 * @param columnTypeMap
 */
public KafkaStreamTable(String bootstrapServers,
String consumerGroupId,
String topic,
long consumeFrom,
long consumeTo,
Map<String, Type> columnTypeMap) {
this(bootstrapServers, consumerGroupId, topic,
"org.apache.kafka.common.serialization.LongDeserializer",
"org.apache.kafka.common.serialization.StringDeserializer",
consumeFrom, consumeTo, 100, columnTypeMap);
}
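/*
 * A minimal usage sketch, not part of the original source: the broker list, group id, topic and
 * column names below are hypothetical, and columnTypeMap is assumed to map JSON field names to
 * column Types as the constructors above expect.
 *
 *   Map<String, Type> columnTypeMap = new LinkedHashMap<>();
 *   columnTypeMap.put("userId", Type.BIGINT);
 *   columnTypeMap.put("score", Type.DOUBLE);
 *   columnTypeMap.put("event", Type.VARBYTE);
 *   KafkaStreamTable kafkaStreamTable = new KafkaStreamTable(
 *           "broker1:9092,broker2:9092",            // bootstrap servers
 *           "my-consumer-group",                    // consumer group id
 *           "my-topic",                             // topic
 *           System.currentTimeMillis() - 3_600_000, // consume from roughly one hour ago
 *           columnTypeMap);
 *   kafkaStreamTable.start();
 *   // ... wire the table into a StreamProcessing job, then on shutdown:
 *   kafkaStreamTable.stop();
 */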
public KafkaStreamTable(String bootstrapServers,
String consumerGroupId,
String topic,
long consumeFrom,
long consumeTo,
int queueDepth,
Map<String, Type> columnTypeMap) {
this(bootstrapServers, consumerGroupId, topic,
"org.apache.kafka.common.serialization.LongDeserializer",
"org.apache.kafka.common.serialization.StringDeserializer",
consumeFrom, consumeTo, queueDepth, columnTypeMap);
}
protected KafkaStreamTable(String bootstrapServers,
String consumerGroupId,
String topic,
String keyDeserializer,
String valueDeserializer,
long consumeFrom,
long consumeTo,
int queueDepth,
Map<String, Type> columnTypeMap) {
super(0, columnTypeMap, "|KafkaStreamTable|" + topic, queueDepth);
this.topic = requireNonNull(topic);
Properties properties = new Properties();
properties.put(BOOTSTRAP_SERVERS_CONFIG, requireNonNull(bootstrapServers));
properties.put(GROUP_ID_CONFIG, requireNonNull(consumerGroupId));
properties.put(KEY_DESERIALIZER_CLASS_CONFIG, keyDeserializer);
properties.put(VALUE_DESERIALIZER_CLASS_CONFIG, valueDeserializer);
properties.put(MAX_POLL_RECORDS_CONFIG, 40000);
properties.put(ENABLE_AUTO_COMMIT_CONFIG, "true");
properties.put(AUTO_OFFSET_RESET_CONFIG, "none");
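// auto.offset.reset=none: since every consumer below seeks to an explicit offset, an unavailable
// offset (e.g. one already removed by retention) surfaces as an error instead of silently resetting.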
properties.put(DEFAULT_API_TIMEOUT_MS_CONFIG, 60_000);
this.properties = properties;
this.partitionsDetector = newSingleThreadScheduledExecutor(Threads.threadsNamed("partitions_detector" + sign));
this.consumeFrom = consumeFrom;
this.consumeTo = consumeTo;
myHash = SystemProperty.getMyHash();
serverCount = SystemProperty.getServerCount();
stringColumns = new ArrayList<>(columns.size());
types = new ArrayList<>(columns.size());
timeColumnIndex = columns.indexOf(__time__);
receiveTimeColumnIndex = columns.indexOf(__receive_time__);
for (ByteArray column : columns) {
String columnName = column.toString();
stringColumns.add(columnName);
types.add(columnTypeMap.get(columnName));
}
}
protected void newConsumer(TopicPartition topicPartition, long offset) {
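// Partitions are statically sharded across job servers: this server only consumes partitions whose
// number modulo serverCount equals myHash. Illustrative numbers only: with serverCount = 3 and
// myHash = 1, this server would own partitions 1, 4, 7, ...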
if (topicPartition.partition() % serverCount != myHash) {
return;
}
if (myPartitions.contains(topicPartition.partition())) {
return;
}
myPartitions.add(topicPartition.partition());
addPartition(topicPartition.partition());
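// Each owned partition gets its own bounded queue and dedicated consumer thread; threadId indexes
// the queue slot added to arrayBlockingQueueList below.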
int threadId = arrayBlockingQueueList.size();
arrayBlockingQueueList.add(new ArrayBlockingQueue<>(queueDepth));
KafkaStreamTable kafkaStreamTable = this;
Thread thread = new Thread(new Runnable() {
@Override
public void run() {
try (Consumer<Long, String> consumer = new KafkaConsumer<>(properties)) {
consumer.assign(asList(topicPartition));
consumer.seek(topicPartition, offset);
Gson gson = new Gson();
while (!Thread.interrupted()) {
try {
ConsumerRecords<Long, String> records = consumer.poll(Duration.ofMillis(sleepMs));
if (records.isEmpty()) {
continue;
}
TableBuilder tableBuilder = new TableBuilder(columnTypeMap);
for (ConsumerRecord<Long, String> record : records) {
Long time = record.key();
if (-1 != consumeTo && time >= consumeTo) {
kafkaStreamTable.removePartition(topicPartition.partition());
return;
}
long now = System.currentTimeMillis();
Delay.DELAY.log("business-delay" + kafkaStreamTable.sign, time);
Delay.DELAY.log("data-interval" + kafkaStreamTable.sign, now);
Delay.RESIDENCE_TIME.log("data-residence-time" + kafkaStreamTable.sign, now - time);
String value = record.value();
JsonObject jsonObject = gson.fromJson(value, JsonObject.class);
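// Each record becomes one row: the record key fills the __time__ column, the broker timestamp fills
// __receive_time__, and every other column is looked up by name in the JSON object and appended
// according to its declared Type. For a hypothetical payload like {"userId": 42, "score": 1.5},
// a BIGINT column "userId" and a DOUBLE column "score" would be filled.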
for (int i = 0; i < stringColumns.size(); i++) {
if (i == timeColumnIndex) {
tableBuilder.append(i, time);
} else if (i == receiveTimeColumnIndex) {
tableBuilder.append(i, record.timestamp());
} else {
JsonElement jsonElement = jsonObject.get(stringColumns.get(i));
if (null == jsonElement || jsonElement.isJsonNull()) {
tableBuilder.appendValue(i, null);
} else {
Type type = types.get(i);
switch (type) {
case DOUBLE:
tableBuilder.append(i, jsonElement.getAsDouble());
break;
case BIGINT:
tableBuilder.append(i, jsonElement.getAsLong());
break;
case INT:
tableBuilder.append(i, jsonElement.getAsInt());
break;
case VARBYTE:
tableBuilder.append(i, jsonElement.getAsString());
break;
default:
throw new UnknownTypeException(type.name());
}
}
}
}
}
arrayBlockingQueueList.get(threadId).put(tableBuilder.build());
} catch (InterruptException e) {
break;
} catch (InterruptedException e) {
break;
}
}
} catch (Throwable t) {
StreamProcessing.handleException(t);
}
}
}, topicPartition.topic() + "-" + topicPartition.partition());
thread.start();
consumers.add(thread);
}
protected synchronized void addPartition(int partition) {
partitionSet.add(partition);
partitionSetSize = partitionSet.size();
lastUpdateMs = System.currentTimeMillis();
}
protected synchronized void removePartition(int partition) {
partitionSet.remove(partition);
partitionSetSize = partitionSet.size();
lastUpdateMs = System.currentTimeMillis();
}
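// A bounded read (consumeTo != -1) is reported finished once every owned partition has reached
// consumeTo and been removed from partitionSet, and at least finishDelayMs has passed since the
// last partition change (presumably to let already-queued rows drain).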
@Override
public boolean isFinished() {
if (-1 == consumeTo) {
return false;
}
if (partitionSetSize <= 0 && System.currentTimeMillis() - lastUpdateMs >= finishDelayMs) {
return true;
}
return false;
}
@Override
public void start() {
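// Offset resolution per partition: consumeFrom == 0 starts at the earliest available offset;
// a consumeFrom newer than the latest record timestamp (offsetsForTimes returns null) starts at the
// end offset; otherwise consumption starts at the first offset whose timestamp is >= consumeFrom.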
try (Consumer consumer = newKafkaConsumer(properties)) {
List<PartitionInfo> partitionInfos = consumer.partitionsFor(topic);
Map<TopicPartition, Long> topicPartitionTimes = new HashMap<>();
for (PartitionInfo partitionInfo : partitionInfos) {
topicPartitionTimes.put(new TopicPartition(partitionInfo.topic(), partitionInfo.partition()), consumeFrom);
}
Map<TopicPartition, OffsetAndTimestamp> topicPartitionOffsets = consumer.offsetsForTimes(topicPartitionTimes);
for (TopicPartition topicPartition : topicPartitionOffsets.keySet()) {
OffsetAndTimestamp offsetAndTimestamp = topicPartitionOffsets.get(topicPartition);
if (0 == consumeFrom) {
Map<TopicPartition, Long> offsets = consumer.beginningOffsets(asList(topicPartition));
newConsumer(topicPartition, offsets.get(topicPartition));
} else if (null == offsetAndTimestamp) {
/**
 * offsetsForTimes returns null when consumeFrom is later than the partition's latest timestamp;
 * in that case consumption starts from the end. To consume from the beginning, pass 0 as consumeFrom.
 */
Map<TopicPartition, Long> offsets = consumer.endOffsets(asList(topicPartition));
newConsumer(topicPartition, offsets.get(topicPartition));
} else {
newConsumer(topicPartition, offsetAndTimestamp.offset());
}
}
}
partitionsDetector.scheduleWithFixedDelay(new Runnable() {
@Override
public void run() {
List<Integer> sorted = asList(myPartitions.toArray(new Integer[0]));
sort(sorted);
logger.info("{} partitions: {}", sign, sorted);
try (Consumer consumer = newKafkaConsumer(properties)) {
/**
 * partitionsFor returns all partitions of the topic; it does not balance them across the same
 * consumer group. newConsumer simply returns for partitions this server does not own,
 * based on serverCount and myHash.
 */
List<PartitionInfo> partitionInfos = consumer.partitionsFor(topic);
for (PartitionInfo partitionInfo : partitionInfos) {
/**
 * Partitions created while consuming are consumed from offset 0, i.e. from the beginning.
 */
newConsumer(new TopicPartition(topic, partitionInfo.partition()), 0);
}
}
}
}, 0, 5, TimeUnit.SECONDS);
}
@Override
public void stop() {
partitionsDetector.shutdownNow();
for (Thread consumer : consumers) {
consumer.interrupt();
}
consumers.clear();
myPartitions.clear();
partitionSet.clear();
partitionSetSize = 0;
}
}