/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cn.tenmg.cdc.log.connectors.mongodb;

import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.annotation.VisibleForTesting;
import com.mongodb.ConnectionString;
import com.mongodb.client.model.changestream.FullDocument;
import com.mongodb.kafka.connect.source.MongoSourceConfig;
import com.mongodb.kafka.connect.source.MongoSourceConfig.ErrorTolerance;
import com.mongodb.kafka.connect.source.MongoSourceConfig.OutputFormat;
import cn.tenmg.cdc.log.debezium.DebeziumDeserializationSchema;
import cn.tenmg.cdc.log.debezium.DebeziumSourceFunction;
import cn.tenmg.cdc.log.debezium.Validator;
import cn.tenmg.cdc.log.connectors.mongodb.internal.MongoDBConnectorSourceConnector;
import cn.tenmg.cdc.log.connectors.mongodb.internal.MongoDBConnectorSourceTask;
import io.debezium.heartbeat.Heartbeat;
import org.apache.commons.lang3.StringUtils;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import static org.apache.flink.util.Preconditions.checkArgument;
import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * A builder to build a {@link DebeziumSourceFunction} which can read a snapshot and then
 * continue to consume change stream events.
 */
@PublicEvolving
public class MongoDBSource {
public static final String MONGODB_SCHEME = "mongodb";
public static final String ERROR_TOLERANCE_NONE = ErrorTolerance.NONE.value();
public static final String ERROR_TOLERANCE_ALL = ErrorTolerance.ALL.value();
public static final String FULL_DOCUMENT_UPDATE_LOOKUP = FullDocument.UPDATE_LOOKUP.getValue();
public static final int POLL_MAX_BATCH_SIZE_DEFAULT = 1000;
public static final int POLL_AWAIT_TIME_MILLIS_DEFAULT = 1500;
public static final String HEARTBEAT_TOPIC_NAME_DEFAULT = "__mongodb_heartbeats";
public static final String OUTPUT_FORMAT_SCHEMA =
OutputFormat.SCHEMA.name().toLowerCase(Locale.ROOT);
// Add "source" field to adapt to debezium SourceRecord
public static final String OUTPUT_SCHEMA_VALUE_DEFAULT =
"{"
+ " \"name\": \"ChangeStream\","
+ " \"type\": \"record\","
+ " \"fields\": ["
+ " { \"name\": \"_id\", \"type\": \"string\" },"
+ " { \"name\": \"operationType\", \"type\": [\"string\", \"null\"] },"
+ " { \"name\": \"fullDocument\", \"type\": [\"string\", \"null\"] },"
+ " { \"name\": \"source\","
+ " \"type\": [{\"name\": \"source\", \"type\": \"record\", \"fields\": ["
+ " {\"name\": \"ts_ms\", \"type\": \"long\"},"
+ " {\"name\": \"snapshot\", \"type\": [\"string\", \"null\"] } ]"
+ " }, \"null\" ] },"
+ " { \"name\": \"ns\","
+ " \"type\": [{\"name\": \"ns\", \"type\": \"record\", \"fields\": ["
+ " {\"name\": \"db\", \"type\": \"string\"},"
+ " {\"name\": \"coll\", \"type\": [\"string\", \"null\"] } ]"
+ " }, \"null\" ] },"
+ " { \"name\": \"to\","
+ " \"type\": [{\"name\": \"to\", \"type\": \"record\", \"fields\": ["
+ " {\"name\": \"db\", \"type\": \"string\"},"
+ " {\"name\": \"coll\", \"type\": [\"string\", \"null\"] } ]"
+ " }, \"null\" ] },"
+ " { \"name\": \"documentKey\", \"type\": [\"string\", \"null\"] },"
+ " { \"name\": \"updateDescription\","
+ " \"type\": [{\"name\": \"updateDescription\", \"type\": \"record\", \"fields\": ["
+ " {\"name\": \"updatedFields\", \"type\": [\"string\", \"null\"]},"
+ " {\"name\": \"removedFields\","
+ " \"type\": [{\"type\": \"array\", \"items\": \"string\"}, \"null\"]"
+ " }] }, \"null\"] },"
+ " { \"name\": \"clusterTime\", \"type\": [\"string\", \"null\"] },"
+ " { \"name\": \"txnNumber\", \"type\": [\"long\", \"null\"]},"
+ " { \"name\": \"lsid\", \"type\": [{\"name\": \"lsid\", \"type\": \"record\","
+ " \"fields\": [ {\"name\": \"id\", \"type\": \"string\"},"
+ " {\"name\": \"uid\", \"type\": \"string\"}] }, \"null\"] }"
+ " ]"
+ "}";
    public static <T> Builder<T> builder() {
        return new Builder<>();
    }
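    // A minimal usage sketch (illustrative, not part of the original source). The hosts,
    // credentials, and patterns below are placeholder assumptions, and "myDeserializer"
    // stands for any DebeziumDeserializationSchema<String> implementation:
    //
    //   DebeziumSourceFunction<String> sourceFunction =
    //           MongoDBSource.<String>builder()
    //                   .hosts("localhost:27017")
    //                   .username("mongouser")
    //                   .password("mongopw")
    //                   .databaseList("inventory")
    //                   .collectionList("inventory.products")
    //                   .deserializer(myDeserializer)
    //                   .build();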
private static String encodeValue(String value) {
try {
return URLEncoder.encode(value, StandardCharsets.UTF_8.name());
} catch (UnsupportedEncodingException e) {
throw new IllegalArgumentException(e);
}
}
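    // For example (illustrative): encodeValue("p@ssw0rd") returns "p%40ssw0rd", so reserved
    // URI characters in credentials cannot corrupt the connection string built below.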
    /**
     * Builder class of {@link MongoDBSource}.
     *
     * @param <T> the output type of the deserialized records
     */
    public static class Builder<T> {
private String hosts;
private String username;
private String password;
        private List<String> databaseList;
        private List<String> collectionList;
private String connectionOptions;
private Integer batchSize;
private Integer pollAwaitTimeMillis = POLL_AWAIT_TIME_MILLIS_DEFAULT;
private Integer pollMaxBatchSize = POLL_MAX_BATCH_SIZE_DEFAULT;
private Boolean copyExisting = true;
private Integer copyExistingMaxThreads;
private Integer copyExistingQueueSize;
private String copyExistingPipeline;
private Boolean errorsLogEnable;
private String errorsTolerance;
private Integer heartbeatIntervalMillis;
        private DebeziumDeserializationSchema<T> deserializer;
        /** The comma-separated list of hostname and port pairs of the MongoDB servers. */
        public Builder<T> hosts(String hosts) {
this.hosts = hosts;
return this;
}
        /**
         * Ampersand-separated MongoDB connection options, e.g. {@code
         * replicaSet=test&connectTimeoutMS=300000}. See
         * https://docs.mongodb.com/manual/reference/connection-string/#std-label-connections-connection-options
         */
        public Builder<T> connectionOptions(String connectionOptions) {
this.connectionOptions = connectionOptions;
return this;
}
/** Name of the database user to be used when connecting to MongoDB. */
        public Builder<T> username(String username) {
this.username = username;
return this;
}
/** Password to be used when connecting to MongoDB. */
        public Builder<T> password(String password) {
this.password = password;
return this;
}
        /** Regular expressions that match database names to be monitored. */
        public Builder<T> databaseList(String... databaseList) {
this.databaseList = Arrays.asList(databaseList);
return this;
}
        /**
         * Regular expressions that match fully-qualified collection identifiers for collections to
         * be monitored. Each identifier is of the form {@code <databaseName>.<collectionName>}.
         */
        public Builder<T> collectionList(String... collectionList) {
this.collectionList = Arrays.asList(collectionList);
return this;
}
/**
* batch.size
*
* The cursor batch size. Default: 0
*/
        public Builder<T> batchSize(int batchSize) {
checkArgument(batchSize >= 0);
this.batchSize = batchSize;
return this;
}
        /**
         * poll.await.time.ms
         *
         * The amount of time to wait before checking for new results on the change stream.
         * Default: 1500
         */
        public Builder<T> pollAwaitTimeMillis(int pollAwaitTimeMillis) {
checkArgument(pollAwaitTimeMillis > 0);
this.pollAwaitTimeMillis = pollAwaitTimeMillis;
return this;
}
/**
* poll.max.batch.size
*
* Maximum number of change stream documents to include in a single batch when polling
* for new data. This setting can be used to limit the amount of data buffered internally in
* the connector. Default: 1000
*/
        public Builder<T> pollMaxBatchSize(int pollMaxBatchSize) {
checkArgument(pollMaxBatchSize > 0);
this.pollMaxBatchSize = pollMaxBatchSize;
return this;
}
/**
* copy.existing
*
* Copy existing data from source collections and convert them to Change Stream events on
* their respective topics. Any changes to the data that occur during the copy process are
* applied once the copy is completed.
*/
        public Builder<T> copyExisting(boolean copyExisting) {
this.copyExisting = copyExisting;
return this;
}
        /**
         * copy.existing.max.threads
         *
         * The number of threads to use when performing the data copy.
         * Default: the number of processors
         */
        public Builder<T> copyExistingMaxThreads(int copyExistingMaxThreads) {
checkArgument(copyExistingMaxThreads > 0);
this.copyExistingMaxThreads = copyExistingMaxThreads;
return this;
}
/**
* copy.existing.queue.size
*
* The max size of the queue to use when copying data. Default: 16000
*/
        public Builder<T> copyExistingQueueSize(int copyExistingQueueSize) {
checkArgument(copyExistingQueueSize > 0);
this.copyExistingQueueSize = copyExistingQueueSize;
return this;
}
        /**
         * copy.existing.pipeline, e.g. [ { "$match": { "closed": "false" } } ]
         *
         * An array of JSON objects describing the pipeline operations to run when copying
         * existing data. This can improve the use of indexes by the copying manager and make
         * copying more efficient.
         */
        public Builder<T> copyExistingPipeline(String copyExistingPipeline) {
this.copyExistingPipeline = copyExistingPipeline;
return this;
}
/**
* errors.log.enable
*
* Whether details of failed operations should be written to the log file. When set to
* true, both errors that are tolerated (determined by the errors.tolerance setting) and not
* tolerated are written. When set to false, errors that are tolerated are omitted.
*/
        public Builder<T> errorsLogEnable(boolean errorsLogEnable) {
this.errorsLogEnable = errorsLogEnable;
return this;
}
        /**
         * errors.tolerance
         *
         * Whether to continue processing messages if an error is encountered. When set to none,
         * the connector reports an error and blocks further processing of the rest of the records
         * when it encounters an error. When set to all, the connector silently ignores any bad
         * messages.
         *
         * Default: "none". Accepted values: "none" or "all".
         */
        public Builder<T> errorsTolerance(String errorsTolerance) {
this.errorsTolerance = errorsTolerance;
return this;
}
/**
* heartbeat.interval.ms
*
* The length of time in milliseconds between sending heartbeat messages. Heartbeat
* messages contain the post batch resume token and are sent when no source records have
* been published in the specified interval. This improves the resumability of the connector
* for low volume namespaces. Use 0 to disable.
*/
        public Builder<T> heartbeatIntervalMillis(int heartbeatIntervalMillis) {
checkArgument(heartbeatIntervalMillis >= 0);
this.heartbeatIntervalMillis = heartbeatIntervalMillis;
return this;
}
/**
* The deserializer used to convert from consumed {@link
* org.apache.kafka.connect.source.SourceRecord}.
*/
        public Builder<T> deserializer(DebeziumDeserializationSchema<T> deserializer) {
this.deserializer = deserializer;
return this;
}
        /** Builds the MongoDB connection URI. */
@VisibleForTesting
public ConnectionString buildConnectionUri() {
StringBuilder sb = new StringBuilder(MONGODB_SCHEME).append("://");
if (StringUtils.isNotEmpty(username) && StringUtils.isNotEmpty(password)) {
sb.append(encodeValue(username))
.append(":")
.append(encodeValue(password))
.append("@");
}
sb.append(checkNotNull(hosts));
if (StringUtils.isNotEmpty(connectionOptions)) {
sb.append("/?").append(connectionOptions);
}
return new ConnectionString(sb.toString());
}
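        // Illustrative result (placeholder values): hosts = "host1:27017,host2:27017",
        // username = "user", password = "p@ss", connectionOptions = "replicaSet=rs0"
        // yields: mongodb://user:p%40ss@host1:27017,host2:27017/?replicaSet=rs0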
        /**
         * Builds the {@link DebeziumSourceFunction} by mapping the builder options onto the
         * properties of the MongoDB Kafka connector:
         * https://docs.mongodb.com/kafka-connector/current/kafka-source
         */
        public DebeziumSourceFunction<T> build() {
Properties props = new Properties();
props.setProperty(
"connector.class", MongoDBConnectorSourceConnector.class.getCanonicalName());
props.setProperty("name", "mongodb_binlog_source");
ConnectionString connectionString = buildConnectionUri();
props.setProperty(
MongoSourceConfig.CONNECTION_URI_CONFIG, String.valueOf(connectionString));
if (databaseList != null) {
props.setProperty(
MongoDBConnectorSourceTask.DATABASE_INCLUDE_LIST,
String.join(",", databaseList));
}
if (collectionList != null) {
props.setProperty(
MongoDBConnectorSourceTask.COLLECTION_INCLUDE_LIST,
String.join(",", collectionList));
}
props.setProperty(MongoSourceConfig.FULL_DOCUMENT_CONFIG, FULL_DOCUMENT_UPDATE_LOOKUP);
props.setProperty(
MongoSourceConfig.PUBLISH_FULL_DOCUMENT_ONLY_CONFIG,
String.valueOf(Boolean.FALSE));
props.setProperty(MongoSourceConfig.OUTPUT_FORMAT_KEY_CONFIG, OUTPUT_FORMAT_SCHEMA);
props.setProperty(MongoSourceConfig.OUTPUT_FORMAT_VALUE_CONFIG, OUTPUT_FORMAT_SCHEMA);
props.setProperty(
MongoSourceConfig.OUTPUT_SCHEMA_INFER_VALUE_CONFIG,
String.valueOf(Boolean.FALSE));
props.setProperty(
MongoSourceConfig.OUTPUT_SCHEMA_VALUE_CONFIG, OUTPUT_SCHEMA_VALUE_DEFAULT);
if (batchSize != null) {
props.setProperty(MongoSourceConfig.BATCH_SIZE_CONFIG, String.valueOf(batchSize));
}
if (pollAwaitTimeMillis != null) {
props.setProperty(
MongoSourceConfig.POLL_AWAIT_TIME_MS_CONFIG,
String.valueOf(pollAwaitTimeMillis));
}
if (pollMaxBatchSize != null) {
props.setProperty(
MongoSourceConfig.POLL_MAX_BATCH_SIZE_CONFIG,
String.valueOf(pollMaxBatchSize));
}
if (errorsLogEnable != null) {
props.setProperty(
MongoSourceConfig.ERRORS_LOG_ENABLE_CONFIG,
String.valueOf(errorsLogEnable));
}
if (errorsTolerance != null) {
props.setProperty(MongoSourceConfig.ERRORS_TOLERANCE_CONFIG, errorsTolerance);
}
if (copyExisting != null) {
props.setProperty(
MongoSourceConfig.COPY_EXISTING_CONFIG, String.valueOf(copyExisting));
}
if (copyExistingMaxThreads != null) {
props.setProperty(
MongoSourceConfig.COPY_EXISTING_MAX_THREADS_CONFIG,
String.valueOf(copyExistingMaxThreads));
}
if (copyExistingQueueSize != null) {
props.setProperty(
MongoSourceConfig.COPY_EXISTING_QUEUE_SIZE_CONFIG,
String.valueOf(copyExistingQueueSize));
}
if (copyExistingPipeline != null) {
props.setProperty(
MongoSourceConfig.COPY_EXISTING_PIPELINE_CONFIG, copyExistingPipeline);
}
if (heartbeatIntervalMillis != null) {
props.setProperty(
MongoSourceConfig.HEARTBEAT_INTERVAL_MS_CONFIG,
String.valueOf(heartbeatIntervalMillis));
}
props.setProperty(
MongoSourceConfig.HEARTBEAT_TOPIC_NAME_CONFIG, HEARTBEAT_TOPIC_NAME_DEFAULT);
// Let DebeziumChangeFetcher recognize heartbeat record
props.setProperty(
Heartbeat.HEARTBEAT_TOPICS_PREFIX.name(), HEARTBEAT_TOPIC_NAME_DEFAULT);
return new DebeziumSourceFunction<>(
deserializer, props, null, Validator.getDefaultValidator());
}
}
}