// com.alibaba.ververica.cdc.debezium.DebeziumSourceFunction Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.alibaba.ververica.cdc.debezium;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.state.OperatorStateStore;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.shaded.guava18.com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.alibaba.ververica.cdc.debezium.internal.DebeziumChangeConsumer;
import com.alibaba.ververica.cdc.debezium.internal.FlinkDatabaseHistory;
import com.alibaba.ververica.cdc.debezium.internal.FlinkOffsetBackingStore;
import io.debezium.document.DocumentReader;
import io.debezium.document.DocumentWriter;
import io.debezium.embedded.Connect;
import io.debezium.engine.DebeziumEngine;
import io.debezium.relational.history.HistoryRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
/**
 * The {@link DebeziumSourceFunction} is a streaming data source that pulls captured change data
 * from databases into Flink.
 *
 * <p>The source function participates in checkpointing and guarantees that no data is lost
 * during a failure, and that the computation processes elements "exactly once".
 *
 * <p>Note: currently, the source function can't run in multiple parallel instances.
 *
 * <p>Please refer to Debezium's documentation for the available configuration properties:
 * https://debezium.io/documentation/reference/1.2/development/engine.html#engine-properties
 */
@PublicEvolving
public class DebeziumSourceFunction extends RichSourceFunction implements
CheckpointedFunction,
ResultTypeQueryable {
private static final long serialVersionUID = -5808108641062931623L;
protected static final Logger LOG = LoggerFactory.getLogger(DebeziumSourceFunction.class);
/** State name of the consumer's partition offset states. */
public static final String OFFSETS_STATE_NAME = "offset-states";
/** State name of the consumer's history records state. */
public static final String HISTORY_RECORDS_STATE_NAME = "history-records-states";
/** The schema to convert from Debezium's messages into Flink's objects. */
private final DebeziumDeserializationSchema deserializer;
/** User-supplied properties for Kafka. **/
private final Properties properties;
private ExecutorService executor;
private DebeziumEngine> engine;
/** The error from {@link #engine} thread. */
private transient volatile Throwable error;
/** Flag indicating whether the consumer is still running. */
private volatile boolean running = true;
/** The consumer to fetch records from {@link DebeziumEngine}. */
private transient volatile DebeziumChangeConsumer debeziumConsumer;
/**
* The offsets to restore to, if the consumer restores state from a checkpoint.
*
* This map will be populated by the {@link #initializeState(FunctionInitializationContext)} method.
*
*
Using a String because we are encoding the offset state in JSON bytes.
*/
private transient volatile String restoredOffsetState;
/** Accessor for state in the operator state backend. */
private transient ListState offsetState;
/**
* State to store the history records, i.e. schema changes.
*
* @see FlinkDatabaseHistory
*/
private transient ListState historyRecordsState;
/**
* Unique name of this Debezium Engine instance across all the jobs. Currently we randomly generate a UUID for it.
* This is used for {@link FlinkDatabaseHistory}.
*/
private transient String engineInstanceName;
public DebeziumSourceFunction(DebeziumDeserializationSchema deserializer, Properties properties) {
this.deserializer = deserializer;
this.properties = properties;
}
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
ThreadFactory threadFactory = new ThreadFactoryBuilder()
.setNameFormat("debezium-engine")
.build();
this.executor = Executors.newSingleThreadExecutor(threadFactory);
}
// ------------------------------------------------------------------------
// Checkpoint and restore
// ------------------------------------------------------------------------
@Override
public void initializeState(FunctionInitializationContext context) throws Exception {
OperatorStateStore stateStore = context.getOperatorStateStore();
this.offsetState = stateStore.getUnionListState(new ListStateDescriptor<>(
OFFSETS_STATE_NAME,
PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO));
this.historyRecordsState = stateStore.getUnionListState(new ListStateDescriptor<>(
HISTORY_RECORDS_STATE_NAME,
BasicTypeInfo.STRING_TYPE_INFO));
if (context.isRestored()) {
restoreOffsetState();
restoreHistoryRecordsState();
} else {
LOG.info("Consumer subtask {} has no restore state.", getRuntimeContext().getIndexOfThisSubtask());
}
}
private void restoreOffsetState() throws Exception {
for (byte[] serializedOffset : offsetState.get()) {
if (restoredOffsetState == null) {
restoredOffsetState = new String(serializedOffset, StandardCharsets.UTF_8);
} else {
throw new RuntimeException("Debezium Source only support single task, " +
"however, this is restored from multiple tasks.");
}
}
LOG.info("Consumer subtask {} restored offset state: {}.", getRuntimeContext().getIndexOfThisSubtask(), restoredOffsetState);
}
private void restoreHistoryRecordsState() throws Exception {
DocumentReader reader = DocumentReader.defaultReader();
ConcurrentLinkedQueue historyRecords = new ConcurrentLinkedQueue<>();
int recordsCount = 0;
boolean firstEntry = true;
for (String record : historyRecordsState.get()) {
if (firstEntry) {
// we store the engine instance name in the first element
this.engineInstanceName = record;
firstEntry = false;
} else {
historyRecords.add(new HistoryRecord(reader.read(record)));
recordsCount++;
}
}
if (engineInstanceName != null) {
FlinkDatabaseHistory.registerHistoryRecords(engineInstanceName, historyRecords);
}
LOG.info("Consumer subtask {} restored history records state: {} with {} records.",
getRuntimeContext().getIndexOfThisSubtask(),
engineInstanceName,
recordsCount);
}
@Override
public void snapshotState(FunctionSnapshotContext functionSnapshotContext) throws Exception {
if (!running) {
LOG.debug("snapshotState() called on closed source");
} else {
snapshotOffsetState();
snapshotHistoryRecordsState();
}
}
private void snapshotOffsetState() throws Exception {
offsetState.clear();
final DebeziumChangeConsumer> consumer = this.debeziumConsumer;
byte[] serializedOffset = null;
if (consumer == null) {
// the consumer has not yet been initialized, which means we need to return the
// originally restored offsets
if (restoredOffsetState != null) {
serializedOffset = restoredOffsetState.getBytes(StandardCharsets.UTF_8);
}
} else {
serializedOffset = consumer.snapshotCurrentState();
}
if (serializedOffset != null) {
offsetState.add(serializedOffset);
}
}
private void snapshotHistoryRecordsState() throws Exception {
historyRecordsState.clear();
if (engineInstanceName != null) {
historyRecordsState.add(engineInstanceName);
ConcurrentLinkedQueue historyRecords = FlinkDatabaseHistory.getRegisteredHistoryRecord(engineInstanceName);
if (historyRecords != null) {
DocumentWriter writer = DocumentWriter.defaultWriter();
for (HistoryRecord record : historyRecords) {
historyRecordsState.add(writer.write(record.document()));
}
}
}
}
@Override
public void run(SourceContext sourceContext) throws Exception {
properties.setProperty("name", "engine");
properties.setProperty("offset.storage", FlinkOffsetBackingStore.class.getCanonicalName());
if (restoredOffsetState != null) {
// restored from state
properties.setProperty(FlinkOffsetBackingStore.OFFSET_STATE_VALUE, restoredOffsetState);
}
// DO NOT include schema payload in change event
properties.setProperty("key.converter.schemas.enable", "false");
properties.setProperty("value.converter.schemas.enable", "false");
// DO NOT include schema change, e.g. DDL
properties.setProperty("include.schema.changes", "false");
// disable the offset flush totally
properties.setProperty("offset.flush.interval.ms", String.valueOf(Long.MAX_VALUE));
// disable tombstones
properties.setProperty("tombstones.on.delete", "false");
// we have to use a persisted DatabaseHistory implementation, otherwise, recovery can't continue to read binlog
// see https://stackoverflow.com/questions/57147584/debezium-error-schema-isnt-know-to-this-connector
// and https://debezium.io/blog/2018/03/16/note-on-database-history-topic-configuration/
properties.setProperty("database.history", FlinkDatabaseHistory.class.getCanonicalName());
// reduce the history records to store
properties.setProperty("database.history.store.only.monitored.tables.ddl", "true");
if (engineInstanceName == null) {
// not restore from recovery
engineInstanceName = UUID.randomUUID().toString();
FlinkDatabaseHistory.registerEmptyHistoryRecord(engineInstanceName);
}
// history instance name to initialize FlinkDatabaseHistory
properties.setProperty(FlinkDatabaseHistory.DATABASE_HISTORY_INSTANCE_NAME, engineInstanceName);
// dump the properties
String propsString = properties.entrySet().stream()
.map(t -> "\t" + t.getKey().toString() + " = " + t.getValue().toString() + "\n")
.collect(Collectors.joining());
LOG.info("Debezium Properties:\n{}", propsString);
this.debeziumConsumer = new DebeziumChangeConsumer<>(
sourceContext,
deserializer,
restoredOffsetState == null, // DB snapshot phase if restore state is null
this::reportError);
// create the engine with this configuration ...
this.engine = DebeziumEngine.create(Connect.class)
.using(properties)
.notifying(debeziumConsumer)
.using((success, message, error) -> {
if (!success && error != null) {
this.reportError(error);
}
})
.build();
if (!running) {
return;
}
// run the engine asynchronously
executor.execute(engine);
// on a clean exit, wait for the runner thread
try {
while (running) {
if (executor.awaitTermination(5, TimeUnit.SECONDS)) {
break;
}
if (error != null) {
running = false;
shutdownEngine();
// rethrow the error from Debezium consumer
ExceptionUtils.rethrow(error);
}
}
}
catch (InterruptedException e) {
// may be the result of a wake-up interruption after an exception.
// we ignore this here and only restore the interruption state
Thread.currentThread().interrupt();
}
}
@Override
public void cancel() {
// flag the main thread to exit. A thread interrupt will come anyways.
running = false;
// safely and gracefully stop the engine
shutdownEngine();
}
@Override
public void close() throws Exception {
cancel();
if (executor != null) {
executor.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
}
super.close();
}
// --------------------------------------------------------------------------------
// Error callbacks
// --------------------------------------------------------------------------------
private void reportError(Throwable error) {
LOG.error("Reporting error:", error);
this.error = error;
}
/**
* Safely and gracefully stop the Debezium engine.
*/
private void shutdownEngine() {
try {
if (engine != null) {
engine.close();
}
} catch (IOException e) {
ExceptionUtils.rethrow(e);
} finally {
if (executor != null) {
executor.shutdown();
}
}
}
@Override
public TypeInformation getProducedType() {
return deserializer.getProducedType();
}
}