/*
 * Copyright 2023 Hazelcast Inc.
 *
 * Licensed under the Hazelcast Community License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://hazelcast.com/hazelcast-community-license
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.mongodb.impl;

import com.hazelcast.function.BiFunctionEx;
import com.hazelcast.function.FunctionEx;
import com.hazelcast.jet.JetException;
import com.hazelcast.jet.Traverser;
import com.hazelcast.jet.core.AbstractProcessor;
import com.hazelcast.jet.core.BroadcastKey;
import com.hazelcast.jet.core.EventTimeMapper;
import com.hazelcast.jet.impl.util.Util;
import com.hazelcast.jet.mongodb.impl.CursorTraverser.EmptyItem;
import com.hazelcast.logging.ILogger;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.mongodb.MongoException;
import com.mongodb.MongoServerException;
import com.mongodb.MongoSocketException;
import com.mongodb.client.ChangeStreamIterable;
import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.MongoIterable;
import com.mongodb.client.model.changestream.ChangeStreamDocument;
import org.bson.BsonDateTime;
import org.bson.BsonDocument;
import org.bson.BsonTimestamp;
import org.bson.Document;
import org.bson.conversions.Bson;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.Closeable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;

import static com.hazelcast.internal.nio.IOUtil.closeResource;
import static com.hazelcast.internal.util.Preconditions.checkNotNull;
import static com.hazelcast.internal.util.Preconditions.checkState;
import static com.hazelcast.jet.Traversers.singleton;
import static com.hazelcast.jet.Traversers.traverseIterable;
import static com.hazelcast.jet.Util.entry;
import static com.hazelcast.jet.core.BroadcastKey.broadcastKey;
import static com.hazelcast.jet.mongodb.impl.MongoUtilities.checkCollectionExists;
import static com.hazelcast.jet.mongodb.impl.MongoUtilities.checkDatabaseExists;
import static com.hazelcast.jet.mongodb.impl.MongoUtilities.partitionAggregate;
import static com.mongodb.client.model.Aggregates.match;
import static com.mongodb.client.model.Aggregates.sort;
import static com.mongodb.client.model.Filters.gt;
import static com.mongodb.client.model.Sorts.ascending;
import static com.mongodb.client.model.changestream.FullDocument.UPDATE_LOOKUP;

/**
 * Processor for reading from MongoDB.
 * <p>
 * Reading is done by one of two readers:
 * <ul>
 *     <li>batch reader, which uses {@linkplain MongoCollection#aggregate} to find matching documents;</li>
 *     <li>streaming reader, which uses {@linkplain MongoClient#watch} (or the same function on the database or
 *     collection level) to find matching documents as they arrive in the stream.</li>
 * </ul>
 * <p>
 * All processing guarantees are supported via the standard snapshotting mechanism. Each instance of this processor
 * will save its state (last read key or resumeToken) with the key being the global processor index mod total
 * parallelism.
 *
 * @param <I> type of the emitted item
 */
public class ReadMongoP<I> extends AbstractProcessor {

    private static final int BATCH_SIZE = 1000;
    private final boolean throwOnNonExisting;

    private ILogger logger;
    private int totalParallelism;
    private int processorIndex;

    private boolean snapshotsEnabled;
    private boolean snapshotInProgress;

    private final MongoChunkedReader reader;
    private final MongoConnection connection;

    /**
     * Means that the user requested the query to be executed in a non-distributed way.
     * This property being set to true should mean that
     * {@link com.hazelcast.jet.core.ProcessorMetaSupplier#forceTotalParallelismOne} was used.
     */
    private final boolean nonDistributed;

    private Traverser<?> traverser;
    private Traverser<Entry<BroadcastKey<Integer>, Object>> snapshotTraverser;

    /**
     * Parallelization of reading is possible only when the user didn't mark the processor
     * as nonDistributed and when totalParallelism is higher than 1.
     */
    private boolean canParallelize;

    public ReadMongoP(ReadMongoParams<I> params) {
        if (params.isStream()) {
            EventTimeMapper<I> eventTimeMapper = new EventTimeMapper<>(params.eventTimePolicy);
            eventTimeMapper.addPartitions(1);
            this.reader = new StreamMongoReader(params.databaseName, params.collectionName, params.mapStreamFn,
                    params.getStartAtTimestamp(), params.getAggregates(), eventTimeMapper);
        } else {
            this.reader = new BatchMongoReader(params.databaseName, params.collectionName, params.mapItemFn,
                    params.getAggregates());
        }
        this.connection = new MongoConnection(
                params.clientSupplier, params.dataConnectionRef, client -> reader.connect(client, snapshotsEnabled)
        );
        this.nonDistributed = params.isNonDistributed();
        this.throwOnNonExisting = params.isThrowOnNonExisting();
    }

    @Override
    protected void init(@Nonnull Context context) {
        logger = context.logger();
        totalParallelism = context.totalParallelism();
        canParallelize = !nonDistributed && totalParallelism > 1;
        processorIndex = context.globalProcessorIndex();
        this.snapshotsEnabled = context.snapshottingEnabled();

        NodeEngineImpl nodeEngine = Util.getNodeEngine(context.hazelcastInstance());
        connection.assembleSupplier(nodeEngine);

        try {
            connection.reconnectIfNecessary();
        } catch (MongoException e) {
            throw new JetException(e);
        }
    }

    /**
     * Source cannot be cooperative; the async driver is much older than the sync driver, and keeping it
     * up to date is probably not the Mongo team's priority. The sync version seems to be the better
     * choice for us, then.
     */
    @Override
    public boolean isCooperative() {
        return false;
    }

    @Override
    public boolean complete() {
        if (!connection.reconnectIfNecessary()) {
            return false;
        }
        if (traverser == null) {
            this.traverser = reader.nextChunkTraverser()
                                   .onFirstNull(() -> traverser = null);
        }
        if (!emitFromTraverser(traverser)) {
            return false;
        }
        if (snapshotInProgress) {
            return false;
        }
        return reader.everCompletes();
    }

    @Override
    public void close() {
        closeResource(reader);
        closeResource(connection);
    }

    @Override
    public boolean saveToSnapshot() {
        if (!snapshotsEnabled) {
            return true;
        }
        if (traverser != null && !emitFromTraverser(traverser)) {
            return false;
        }
        snapshotInProgress = true;
        if (snapshotTraverser == null) {
            int partition = processorIndex % totalParallelism;
            Object snapshot = reader.snapshot();
            if (snapshot == null) {
                return true;
            }
            snapshotTraverser = singleton(entry(broadcastKey(partition), snapshot))
                    .onFirstNull(() -> {
                        snapshotTraverser = null;
                        getLogger().finest("Finished saving snapshot.");
                    });
            if (reader.supportsWatermarks()) {
                Object watermark = reader.watermark();
                snapshotTraverser = snapshotTraverser.append(entry(broadcastKey(-partition), watermark));
            }
        }
        return emitFromTraverserToSnapshot(snapshotTraverser);
    }

    @Override
    public boolean snapshotCommitFinish(boolean success) {
        if (logger.isFineEnabled()) {
            logger.fine("Snapshot commit finished");
        }
        snapshotInProgress = false;
        return true;
    }

    @Override
    @SuppressWarnings("unchecked")
    protected void restoreFromSnapshot(@Nonnull Object key, @Nonnull Object value) {
        int keyInteger = ((BroadcastKey<Integer>) key).key();
        boolean wm = keyInteger < 0;
        int keyAb = Math.abs(keyInteger);
        boolean forThisProcessor = keyAb % totalParallelism == processorIndex;
        if (forThisProcessor) {
            if (!wm) {
                reader.restore(value);
                reader.connect(connection.client(), true);
            } else if (reader.supportsWatermarks()) {
                reader.restoreWatermark((Long) value);
            }
        }
    }
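
    // How the snapshot keys above work: with totalParallelism = 4, processor 2 saves its reader state
    // under broadcastKey(2) and, in the streaming case, its watermark under broadcastKey(-2). On restore,
    // every processor sees all broadcast entries; abs(key) % totalParallelism == processorIndex selects
    // the single owner, and a negative key marks a watermark entry rather than reader state.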
    private abstract class MongoChunkedReader implements Closeable {

        protected MongoDatabase database;
        protected MongoCollection<Document> collection;
        private final String databaseName;
        private final String collectionName;

        protected MongoChunkedReader(
                String databaseName,
                String collectionName
        ) {
            this.databaseName = databaseName;
            this.collectionName = collectionName;
        }

        void onConnect(MongoClient mongoClient, boolean snapshotsEnabled) {
        }

        void connect(MongoClient newClient, boolean snapshotsEnabled) {
            try {
                logger.fine("(Re)connecting to MongoDB");
                if (databaseName != null) {
                    if (throwOnNonExisting) {
                        checkDatabaseExists(newClient, databaseName);
                    }
                    this.database = newClient.getDatabase(databaseName);
                }
                if (collectionName != null) {
                    checkState(databaseName != null, "you have to provide a database name if a collection name"
                            + " is specified");
                    //noinspection ConstantValue false warning by IntelliJ
                    checkState(database != null, "database " + databaseName + " does not exist");

                    if (throwOnNonExisting) {
                        checkCollectionExists(database, collectionName);
                    }
                    this.collection = database.getCollection(collectionName);
                }

                onConnect(newClient, snapshotsEnabled);
            } catch (MongoSocketException | MongoServerException e) {
                logger.warning("Could not connect to MongoDB", e);
            }
        }

        @Nonnull
        abstract Traverser<?> nextChunkTraverser();

        @Nullable
        abstract Object snapshot();

        abstract void restore(Object value);

        abstract boolean everCompletes();

        boolean supportsWatermarks() {
            return !everCompletes();
        }

        public abstract void restoreWatermark(Long value);

        public abstract Object watermark();
    }
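
    // Effective aggregation pipeline built by BatchMongoReader.onConnect below, in order:
    //   [partitioning stages (only when canParallelize)] ++ [user aggregates]
    //   ++ [{ $sort: { _id: 1 } }  (only when snapshotting and no user-supplied $sort)]
    //   ++ [{ $match: { _id: { $gt: <lastKey> } } }  (only when resuming from a snapshot)]
    // Sorting and matching on _id is what makes the batch read resumable: the last emitted _id is the
    // snapshot state, and the $match skips everything already emitted before the restart.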
    private final class BatchMongoReader extends MongoChunkedReader {
        private final FunctionEx<Document, I> mapItemFn;
        private final List<Bson> aggregates;
        private Traverser<Document> delegate;
        private Object lastKey;

        private BatchMongoReader(
                String databaseName,
                String collectionName,
                FunctionEx<Document, I> mapItemFn,
                List<Bson> aggregates) {
            super(databaseName, collectionName);
            this.mapItemFn = mapItemFn;
            this.aggregates = aggregates;
        }

        @Override
        void onConnect(MongoClient mongoClient, boolean supportsSnapshots) {
            List<Bson> aggregateList = new ArrayList<>(aggregates);
            if (supportsSnapshots && !hasSorts(aggregateList)) {
                aggregateList.add(sort(ascending("_id")).toBsonDocument());
            }
            if (supportsSnapshots && lastKey != null) {
                aggregateList.add(match(gt("_id", lastKey)).toBsonDocument());
            }
            if (canParallelize) {
                aggregateList.addAll(0, partitionAggregate(totalParallelism, processorIndex, false));
            }
            if (collection != null) {
                this.delegate = delegateForCollection(collection, aggregateList);
            } else if (database != null) {
                this.delegate = delegateForDb(database, aggregateList);
            } else {
                final MongoClient clientLocal = mongoClient;
                this.delegate = traverseIterable(mongoClient.listDatabaseNames())
                        .flatMap(name -> {
                            MongoDatabase db = clientLocal.getDatabase(name);
                            return delegateForDb(db, aggregateList);
                        });
            }
            checkNotNull(this.delegate, "unable to connect to Mongo");
        }

        private boolean hasSorts(List<Bson> aggregateList) {
            return aggregateList.stream().anyMatch(agg -> agg.toBsonDocument().get("$sort") != null);
        }

        private Traverser<Document> delegateForCollection(MongoCollection<Document> collection,
                                                          List<Bson> aggregateList) {
            return traverseIterable(collection.aggregate(aggregateList));
        }

        private Traverser<Document> delegateForDb(MongoDatabase database, List<Bson> aggregateList) {
            MongoIterable<String> collectionsIterable = database.listCollectionNames();

            return traverseIterable(collectionsIterable)
                    .flatMap(colName -> delegateForCollection(database.getCollection(colName), aggregateList));
        }

        @Nonnull
        @Override
        public Traverser<I> nextChunkTraverser() {
            Traverser<Document> localDelegate = this.delegate;
            checkNotNull(localDelegate, "unable to connect to Mongo");
            return localDelegate
                    .map(item -> {
                        lastKey = item.get("_id");
                        return mapItemFn.apply(item);
                    });
        }

        @Override
        boolean everCompletes() {
            return true;
        }

        @Override
        public void restoreWatermark(Long value) {
            throw new UnsupportedOperationException("watermarks are only in streaming case");
        }

        @Override
        public Object watermark() {
            throw new UnsupportedOperationException("watermarks are only in streaming case");
        }

        @Nonnull
        @Override
        public Object snapshot() {
            return lastKey;
        }

        @Override
        public void restore(Object value) {
            lastKey = value;
        }

        @Override
        public void close() {
        }
    }
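
    // Resume semantics of StreamMongoReader below: the resume token captured from the last-seen change
    // stream document takes precedence over the user-supplied startTimestamp, so after a restore the
    // stream continues right after the last snapshotted event instead of re-reading from the configured
    // start time.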
    private final class StreamMongoReader extends MongoChunkedReader {
        private final BiFunctionEx<ChangeStreamDocument<Document>, Long, I> mapFn;
        private final BsonTimestamp startTimestamp;
        private final List<Bson> aggregates;
        private final EventTimeMapper<I> eventTimeMapper;
        private MongoCursor<ChangeStreamDocument<Document>> cursor;
        private BsonDocument resumeToken;

        private StreamMongoReader(
                String databaseName,
                String collectionName,
                BiFunctionEx<ChangeStreamDocument<Document>, Long, I> mapFn,
                BsonTimestamp startTimestamp,
                List<Bson> aggregates,
                EventTimeMapper<I> eventTimeMapper
        ) {
            super(databaseName, collectionName);
            this.mapFn = mapFn;
            this.startTimestamp = startTimestamp;
            this.aggregates = aggregates;
            this.eventTimeMapper = eventTimeMapper;
        }

        @Override
        public void onConnect(MongoClient mongoClient, boolean snapshotsEnabled) {
            List<Bson> aggregateList = new ArrayList<>(aggregates);
            if (canParallelize) {
                aggregateList.addAll(0, partitionAggregate(totalParallelism, processorIndex, true));
            }
            ChangeStreamIterable<Document> changeStream;
            if (collection != null) {
                changeStream = collection.watch(aggregateList);
            } else if (database != null) {
                changeStream = database.watch(aggregateList);
            } else {
                changeStream = mongoClient.watch(aggregateList);
            }

            if (resumeToken != null) {
                changeStream.resumeAfter(resumeToken);
            } else if (startTimestamp != null) {
                changeStream.startAtOperationTime(startTimestamp);
            }
            cursor = changeStream.batchSize(BATCH_SIZE).fullDocument(UPDATE_LOOKUP).iterator();
        }

        @Override
        boolean everCompletes() {
            return false;
        }

        @SuppressWarnings("unchecked")
        @Nonnull
        @Override
        public Traverser<?> nextChunkTraverser() {
            try {
                MongoCursor<ChangeStreamDocument<Document>> localCursor = this.cursor;
                checkNotNull(localCursor, "unable to connect to Mongo");
                return new CursorTraverser(localCursor)
                        .flatMap(input -> {
                            if (input instanceof EmptyItem) {
                                return eventTimeMapper.flatMapIdle();
                            }
                            ChangeStreamDocument<Document> doc = (ChangeStreamDocument<Document>) input;
                            resumeToken = doc.getResumeToken();
                            long eventTime = clusterTime(doc);
                            I item = mapFn.apply(doc, eventTime);
                            return eventTimeMapper.flatMapEvent(item, 0, eventTime);
                        });
            } catch (MongoException e) {
                throw new JetException("error while reading from mongodb", e);
            }
        }

        private long clusterTime(ChangeStreamDocument<Document> changeStreamDocument) {
            BsonDateTime time = changeStreamDocument.getWallTime();
            return time == null ? System.currentTimeMillis() : time.getValue();
        }

        @Nullable
        @Override
        public Object snapshot() {
            return resumeToken;
        }

        @Nonnull
        @Override
        public Object watermark() {
            return eventTimeMapper.getWatermark(0);
        }

        @Override
        public void restore(Object value) {
            if (value != null) {
                if (value instanceof BsonDocument) {
                    this.resumeToken = (BsonDocument) value;
                }
            }
        }

        @Override
        public void restoreWatermark(Long value) {
            eventTimeMapper.restoreWatermark(0, value);
        }

        @Override
        public void close() {
            if (cursor != null) {
                cursor.close();
                cursor = null;
            }
        }
    }
}
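
// Usage sketch, for orientation only. ReadMongoP is an internal processor; user code normally reaches
// it through the public factories in com.hazelcast.jet.mongodb.MongoSources. A minimal batch pipeline
// could look like the snippet below. The exact MongoSources.batch overload, connection string, and
// database/collection names are assumptions of this illustration; check the javadoc of the module
// version you use.
//
//     Pipeline pipeline = Pipeline.create();
//     pipeline.readFrom(MongoSources.batch(
//                     "mongodb://localhost:27017",         // assumed local MongoDB
//                     "myDatabase",                        // hypothetical database name
//                     "myCollection",                      // hypothetical collection name
//                     Filters.gt("age", 10),               // server-side filter
//                     Projections.include("age", "name"))) // server-side projection
//             .writeTo(Sinks.logger());
//     hz.getJet().newJob(pipeline).join();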