All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.pincette.mongo.streams.Deduplicate Maven / Gradle / Ivy

The newest version!
package net.pincette.mongo.streams;

import static com.mongodb.client.model.Filters.eq;
import static com.mongodb.client.model.Updates.combine;
import static com.mongodb.client.model.Updates.set;
import static java.time.Duration.ofMillis;
import static java.time.Instant.now;
import static net.pincette.json.JsonUtil.from;
import static net.pincette.json.JsonUtil.isObject;
import static net.pincette.json.JsonUtil.string;
import static net.pincette.mongo.BsonUtil.fromBson;
import static net.pincette.mongo.BsonUtil.fromJson;
import static net.pincette.mongo.BsonUtil.toBsonDocument;
import static net.pincette.mongo.Collection.bulkWrite;
import static net.pincette.mongo.Collection.findOne;
import static net.pincette.mongo.Expression.function;
import static net.pincette.mongo.streams.Pipeline.DEDUPLICATE;
import static net.pincette.mongo.streams.Util.ID;
import static net.pincette.mongo.streams.Util.tryForever;
import static net.pincette.rs.Async.mapAsyncSequential;
import static net.pincette.rs.Commit.commit;
import static net.pincette.rs.Filter.filter;
import static net.pincette.rs.Mapper.map;
import static net.pincette.rs.Pipe.pipe;
import static net.pincette.rs.Util.duplicateFilter;
import static net.pincette.util.Pair.pair;
import static net.pincette.util.Util.must;

import com.mongodb.bulk.BulkWriteResult;
import com.mongodb.client.model.UpdateOneModel;
import com.mongodb.client.model.UpdateOptions;
import com.mongodb.reactivestreams.client.MongoCollection;
import java.time.Duration;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletionStage;
import java.util.concurrent.Flow.Processor;
import java.util.function.Function;
import java.util.stream.Stream;
import javax.json.JsonObject;
import javax.json.JsonValue;
import net.pincette.mongo.BsonUtil;
import net.pincette.rs.streams.Message;
import net.pincette.util.Pair;
import org.bson.BsonDocument;
import org.bson.BsonInt64;
import org.bson.BsonValue;
import org.bson.Document;
import org.bson.conversions.Bson;

/**
 * The $deduplicate operator.
 *
 * @author Werner Donné
 * @since 3.0
 */
class Deduplicate {
  private static final String CACHE_WINDOW = "cacheWindow";
  private static final String COLLECTION = "collection";
  private static final String EXPRESSION = "expression";
  private static final String TIMESTAMP = "_timestamp";

  private Deduplicate() {}

  private static CompletionStage exists(
      final MongoCollection collection, final Bson filter, final Context context) {
    return tryForever(
        () -> findOne(collection, filter, BsonDocument.class, null).thenApply(Optional::isPresent),
        DEDUPLICATE,
        () ->
            "Collection "
                + collection
                + ", findOne with filter: "
                + string(fromBson(toBsonDocument(filter))),
        context);
  }

  private static CompletionStage save(
      final MongoCollection collection,
      final List values,
      final Context context) {
    return tryForever(
        () ->
            bulkWrite(collection, values.stream().map(Deduplicate::updateObject).toList())
                .thenApply(result -> must(result, BulkWriteResult::wasAcknowledged))
                .thenApply(result -> true),
        DEDUPLICATE,
        () ->
            "Collection "
                + collection
                + ", save: "
                + string(from(values.stream().map(BsonUtil::fromBson))),
        context);
  }

  private static  Stream second(final List> pairs) {
    return pairs.stream().map(pair -> pair.second);
  }

  static Processor, Message> stage(
      final JsonValue expression, final Context context) {
    must(isObject(expression));

    final JsonObject expr = expression.asJsonObject();

    must(expr.containsKey(COLLECTION));

    final Duration cacheWindow = ofMillis(expr.getInt(CACHE_WINDOW, 3000));
    final MongoCollection collection =
        context.database.getCollection(expr.getString(COLLECTION));
    final Function fn = function(expr.get(EXPRESSION), context.features);

    // The duplicate filter is needed because when down stream is buffered, the request size will
    // be larger than 1. An upstream batch may contain duplicates.
    return pipe(duplicateFilter((Message m) -> fn.apply(m.value), cacheWindow))
        .then(map(m -> pair(m, fromJson(fn.apply(m.value)))))
        .then(
            mapAsyncSequential(
                pair ->
                    exists(collection, eq(ID, pair.second), context)
                        .thenApply(result -> pair(pair, result))))
        .then(filter(pair -> !pair.second))
        .then(map(pair -> pair.first))
        .then(commit(list -> save(collection, second(list).toList(), context)))
        .then(map(pair -> pair.first));
  }

  private static UpdateOneModel updateObject(final BsonValue value) {
    return new UpdateOneModel<>(
        eq(ID, value),
        combine(set(ID, value), set(TIMESTAMP, new BsonInt64(now().toEpochMilli()))),
        new UpdateOptions().upsert(true));
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy