package edu.byu.hbll.box.impl;
import static com.mongodb.client.model.Filters.and;
import static com.mongodb.client.model.Filters.eq;
import static com.mongodb.client.model.Filters.exists;
import static com.mongodb.client.model.Filters.gte;
import static com.mongodb.client.model.Filters.in;
import static com.mongodb.client.model.Filters.lt;
import static com.mongodb.client.model.Projections.include;
import static com.mongodb.client.model.Updates.set;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.mapping;
import static java.util.stream.Collectors.toList;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.model.DeleteManyModel;
import com.mongodb.client.model.IndexOptions;
import com.mongodb.client.model.ReplaceOneModel;
import com.mongodb.client.model.ReplaceOptions;
import com.mongodb.client.model.UpdateManyModel;
import com.mongodb.client.model.UpdateOneModel;
import com.mongodb.client.model.UpdateOptions;
import com.mongodb.client.model.WriteModel;
import com.mongodb.util.JSON;
import edu.byu.hbll.box.Box;
import edu.byu.hbll.box.BoxDatabase;
import edu.byu.hbll.box.BoxDocument;
import edu.byu.hbll.box.BoxQuery;
import edu.byu.hbll.box.ConstructConfig;
import edu.byu.hbll.box.DocumentId;
import edu.byu.hbll.box.Facet;
import edu.byu.hbll.box.InitConfig;
import edu.byu.hbll.box.ObjectType;
import edu.byu.hbll.box.QueryResult;
import edu.byu.hbll.box.QueueEntry;
import edu.byu.hbll.box.Source;
import edu.byu.hbll.box.internal.core.DocumentHandler;
import edu.byu.hbll.box.internal.util.CursorUtils;
import edu.byu.hbll.box.internal.util.JsonUtils;
import edu.byu.hbll.json.ObjectMapperFactory;
import edu.byu.hbll.json.UncheckedObjectMapper;
import java.nio.ByteBuffer;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** A box database that persists to MongoDB. */
public class MongoDatabase implements BoxDatabase, AutoCloseable {
static final Logger logger = LoggerFactory.getLogger(MongoDatabase.class);
private static final UncheckedObjectMapper mapper = ObjectMapperFactory.newUnchecked();
private static final long NEXT_ATTEMPT_JITTER = TimeUnit.HOURS.toMillis(12);
private static final int ADD_TO_QUEUE_BATCH_SIZE = 1000;
private static final int UPGRADE_BATCH_SIZE = 100;
private MongoClient mongo;
private Random random = new Random();
private String database;
@JsonProperty private String sourceName;
private String documentsName;
private MongoCollection<Document> queue;
private MongoCollection<Document> cursor;
private MongoCollection<Document> metadata;
private MongoCollection<Document> groups;
private MongoCollection<Document> documents;
private MongoCollection<Document> registry;
/** Constructor for serializers/deserializers. */
MongoDatabase() {}
/**
* Constructs a new {@link MongoDatabase} that uses the given database and source. Uses the given
* mongo client for communication.
*
* @param mongo the mongo client
* @param database the name of the database to use
* @param sourceName the source name
*/
public MongoDatabase(MongoClient mongo, String database, String sourceName) {
this.mongo = mongo;
this.database = database;
this.sourceName = sourceName;
init();
}
/**
* Constructs a new {@link MongoDatabase} that uses the given database and source. Connects to
* localhost with default connection parameters.
*
* @param database the name of the database to use
* @param sourceName the source name
*/
public MongoDatabase(String database, String sourceName) {
this(new MongoClient(), database, sourceName);
}
/**
* Constructs a new {@link MongoDatabase} that uses the given database and source. Uses the given
* connection string to connect to mongo.
*
* @param uri the mongo connection string
* @param database the name of the database to use
* @param sourceName the source name
*/
public MongoDatabase(String uri, String database, String sourceName) {
this(new MongoClient(new MongoClientURI(uri)), database, sourceName);
}
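// Illustrative usage sketch (the database and source names here are hypothetical, not
// part of the project). BoxQuery's List constructor and QueueEntry's String
// constructor are assumed from how this class uses them below (see addToQueue(Duration)
// and upgradeTo2Dot2).
//
//   try (MongoDatabase db = new MongoDatabase("boxdb", "catalog")) {
//     db.addToQueue(java.util.Arrays.asList(new QueueEntry("record-1")));
//     QueryResult result = db.find(new BoxQuery(java.util.Arrays.asList("record-1")));
//   }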
@Override
public void postInit(InitConfig config) {
if (config.getObjectType() == ObjectType.BOX_DATABASE) {
upgradeAsync(config.getBox(), config.getSource());
}
}
@Override
public void addToQueue(Collection<? extends QueueEntry> entries) {
List<WriteModel<Document>> updates = new ArrayList<>();
int count = 0;
for (QueueEntry entry : entries) {
Document updateDoc = new Document("_id", entry.getId()).append("requested", new Date());
Document insertDoc = new Document();
Document attemptDoc = entry.isOverwrite() ? updateDoc : insertDoc;
attemptDoc
.append("priority", entry.getPriority())
.append("attempt", Date.from(entry.getAttempt()));
insertDoc.append("attempts", 0);
Document element = new Document("$set", updateDoc).append("$setOnInsert", insertDoc);
updates.add(
new UpdateOneModel<>(
eq("_id", entry.getId()), element, new UpdateOptions().upsert(true)));
count++;
if (updates.size() == ADD_TO_QUEUE_BATCH_SIZE || count == entries.size()) {
queue.bulkWrite(updates);
updates.clear();
}
}
}
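// The upsert above always refreshes `requested`, while `priority` and `attempt` are
// written on insert only unless the entry is flagged overwrite (then they move into
// $set). A resulting update command is roughly (sketch, values illustrative):
//
//   { q: { _id: "record-1" },
//     u: { $set: { _id: "record-1", requested: ISODate(...) },
//          $setOnInsert: { priority: 0, attempt: ISODate(...), attempts: 0 } },
//     upsert: true }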
@Override
public int addToQueue(Duration olderThan) {
Date oldDate = Date.from(Instant.now().minus(olderThan));
Document query =
new Document("processed", new Document("$lt", oldDate)).append("parentId", null);
List<QueueEntry> batch = new ArrayList<>();
int count = 0;
for (Document document : metadata.find(query)) {
batch.add(
new QueueEntry(document.getString("_id"))
.setPriority(DocumentHandler.QUEUE_PRIORITY_MAINTENANCE));
if (batch.size() == ADD_TO_QUEUE_BATCH_SIZE) {
addToQueue(batch);
batch.clear();
}
count++;
}
if (!batch.isEmpty()) {
addToQueue(batch);
}
return count;
}
@Override
public void clear() {
// drop all collections
queue.drop();
cursor.drop();
metadata.drop();
groups.drop();
documents.drop();
registry.drop();
// recreate collections with proper indexes
init();
}
@Override
public void close() {
if (mongo != null) {
mongo.close();
}
}
/**
* Converts a mongo document to a box document.
*
* @param result the mongo document to convert
* @return the resulting box document
*/
private BoxDocument convert(Document result) {
if (result == null) {
return null;
}
Document metadata = result;
Document document;
if (result.containsKey("document")) {
try {
@SuppressWarnings("unchecked")
List<Document> docs = ((List<Document>) result.get("document"));
if (docs.isEmpty()) {
document = new Document();
} else {
document = docs.get(0);
document.remove("_id");
}
} catch (Exception e) {
logger.warn(e.toString(), e);
return null;
}
} else {
document = new Document();
}
metadata.remove("document");
metadata.put("id", metadata.remove("_id"));
document.put("@box", metadata);
if (metadata.containsKey("modified")) {
try {
metadata.put("modified", ((Date) metadata.get("modified")).toInstant().toString());
} catch (Exception e) {
logger.warn(e.toString(), e);
}
}
if (metadata.containsKey("processed")) {
try {
metadata.put("processed", ((Date) metadata.get("processed")).toInstant().toString());
} catch (Exception e) {
logger.warn(e.toString(), e);
}
}
if (metadata.containsKey("cursor")) {
try {
metadata.put("cursor", metadata.getLong("cursor").toString());
} catch (Exception e) {
logger.warn(e.toString(), e);
}
}
BoxDocument boxDocument = mapper.readValue(JSON.serialize(document), BoxDocument.class);
// Jackson deserializes the facets into a HashSet instead of a LinkedHashSet, which
// loses their order; re-adding them from the raw mongo document restores it.
@SuppressWarnings("unchecked")
List<Document> facets = (List<Document>) metadata.get("facets");
if (facets != null) {
boxDocument.clearFacets();
for (Document facet : facets) {
boxDocument.addFacets(mapper.readValue(JSON.serialize(facet), Facet.class));
}
}
return boxDocument;
}
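// Sketch of the conversion performed above (field values are illustrative): a metadata
// record joined with its document via $lookup, e.g.
//   { _id: "record-1", status: "READY", modified: ISODate(...),
//     document: [ { _id: "record-1", title: "..." } ] }
// becomes a single JSON document whose metadata is nested under "@box":
//   { title: "...", "@box": { id: "record-1", status: "READY", modified: "..." } }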
/**
* Creates an index on the mongo collection.
*
* @param collection the collection to be indexed
* @param path the path to be indexed
*/
private void createIndex(MongoCollection<Document> collection, String... path) {
Document index = new Document();
IndexOptions options = new IndexOptions().background(true);
for (String field : path) {
index.append(field, 1);
}
collection.createIndex(index, options);
}
@Override
public void deleteFromQueue(Collection<String> ids) {
queue.deleteMany(eq("_id", new Document("$in", ids)));
}
@Override
public QueryResult find(BoxQuery query) {
List<Document> pipeline = new ArrayList<>();
if (query.isIdQuery()) {
pipeline.add(
new Document("$match", new Document("_id", new Document("$in", query.getIds()))));
} else {
String comparison = query.isAscendingOrder() ? "$gte" : "$lte";
pipeline.add(
new Document(
"$match",
new Document("cursor", new Document(comparison, query.getCursorOrDefault()))));
}
List<String> statuses = new ArrayList<>();
query.getStatusesOrDefault().forEach(s -> statuses.add(s.toString()));
pipeline.add(new Document("$match", new Document("status", new Document("$in", statuses))));
if (!query.getFacets().isEmpty()) {
// group facets by facet name
Map<String, List<Document>> facetMap = new HashMap<>();
query
.getFacets()
.stream()
.forEach(
f ->
facetMap
.computeIfAbsent(f.getName(), k -> new ArrayList<>())
.add(new Document("name", f.getName()).append("value", f.getValue())));
// query facets so that OR logic is used within a facet group, but AND logic is used between
// groups
facetMap
.entrySet()
.stream()
.forEach(
f ->
pipeline.add(
new Document(
"$match", new Document("facets", new Document("$in", f.getValue())))));
}
if (query.isHarvestQuery()) {
int order = query.isAscendingOrder() ? 1 : -1;
pipeline.add(new Document("$sort", new Document("cursor", order)));
query.getOffset().ifPresent(it -> pipeline.add(new Document("$skip", it)));
if (query.getLimitOrDefault() >= 0) {
pipeline.add(new Document("$limit", query.getLimitOrDefault()));
}
}
if (!isMetadataOnly(query)) {
Document lookup =
new Document("from", documentsName)
.append("localField", "_id")
.append("foreignField", "_id")
.append("as", "document");
pipeline.add(new Document("$lookup", lookup));
}
Document projection = projectFields(query);
if (projection != null) {
pipeline.add(new Document("$project", projection));
}
Map<String, BoxDocument> documentMap = new LinkedHashMap<>();
for (Document document : metadata.aggregate(pipeline)) {
BoxDocument processDocument = convert(document);
if (processDocument != null) {
documentMap.put(processDocument.getId(), processDocument);
}
}
QueryResult result = new QueryResult();
if (query.isIdQuery()) {
for (String id : query.getIds()) {
BoxDocument processDocument = documentMap.get(id);
if (processDocument == null) {
processDocument = new BoxDocument(id);
}
result.add(processDocument);
}
} else {
result.addAll(documentMap.values());
}
result.updateNextCursor(query);
return result;
}
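// For a harvest query with facets, the pipeline assembled above looks roughly like
// this (sketch, values illustrative):
//   [ { $match: { cursor: { $gte: 42 } } },
//     { $match: { status: { $in: ["READY", "DELETED"] } } },
//     { $match: { facets: { $in: [ { name: "lang", value: "en" } ] } } },
//     { $sort: { cursor: 1 } },
//     { $limit: 100 },
//     { $lookup: { from: "<source>_documents", localField: "_id",
//                  foreignField: "_id", as: "document" } } ]
// One $match per facet name gives AND semantics between names, while $in gives OR
// semantics among values sharing a name.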
@Override
public Map<String, Set<DocumentId>> findDependencies(Collection<String> ids) {
Map<String, Set<DocumentId>> dependencies = new HashMap<>();
for (Document registry : metadata.find(new Document("_id", new Document("$in", ids)))) {
BoxDocument document = convert(registry);
dependencies.put(document.getId(), document.getDependencies());
}
return dependencies;
}
@Override
public Map<DocumentId, Set<String>> findDependents(Collection<DocumentId> dependencies) {
Map<DocumentId, Set<String>> map = new HashMap<>();
for (DocumentId dependency : dependencies) {
Set<String> ids = new HashSet<>();
String id = dependency.getId();
String sourceName = dependency.getSourceName();
Document query =
new Document("dependencies", new Document("sourceName", sourceName).append("id", id));
for (Document document : metadata.find(query)) {
ids.add(document.getString("_id"));
}
map.put(dependency, ids);
}
return map;
}
@Override
public JsonNode findRegistryValue(String id) {
Document document = registry.find(eq("_id", id)).first();
if (document != null) {
Object data = document.get("value");
if (data != null) {
JsonNode dataNode = mapper.readTree(JSON.serialize(data));
return dataNode;
}
}
return null;
}
/**
* Returns the mongo collection given the name.
*
* @param name name of the collection
* @return the mongo collection
*/
private MongoCollection<Document> getCollection(String name) {
return mongo.getDatabase(database).getCollection(getCollectionName(name));
}
/**
* Returns the full collection name by prepending the source name.
*
* @param name the name of type of collection
* @return the full collection name
*/
private String getCollectionName(String name) {
return sourceName + "_" + name;
}
@Override
public ObjectNode getHarvestCursor() {
for (Document doc : cursor.find(eq("_id", "cursor"))) {
if (doc.get("cursor") != null) {
return (ObjectNode) JsonUtils.deserialize(((Document) doc.get("cursor")).toJson());
}
}
return mapper.createObjectNode();
}
/** Initializes the database. */
private void init() {
documentsName = getCollectionName("documents");
queue = getCollection("queue");
cursor = getCollection("cursor");
metadata = getCollection("metadata");
documents = getCollection("documents");
groups = getCollection("groups");
registry = getCollection("registry");
// find documents dependent on a source document
createIndex(metadata, "dependencies");
// find dependent sources (findDependentSources)
createIndex(metadata, "dependencies.sourceName");
// harvest documents
createIndex(metadata, "status", "cursor");
// reprocess old documents
createIndex(metadata, "processed");
// remove old deleted documents
createIndex(metadata, "modified", "status");
// harvest documents by facet
createIndex(metadata, "facets", "status", "cursor");
// for finding orphans
createIndex(metadata, "groupId", "processed");
// poll the next tasks from the queue
createIndex(queue, "priority", "attempt");
}
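/**
 * Returns whether the query only requests {@code @box} metadata fields, in which case
 * the {@code $lookup} into the documents collection can be skipped.
 *
 * @param query the query to inspect
 * @return true if only metadata fields are requested
 */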
private boolean isMetadataOnly(BoxQuery query) {
if (query.getFields().isEmpty()) {
return false;
}
for (String field : query.getFields()) {
if (!field.equals("@box") && !field.startsWith("@box.")) {
return false;
}
}
return true;
}
@Override
public Set<String> listSourceDependencies() {
Set<String> sourceDependencies = new HashSet<>();
for (String dependency : metadata.distinct("dependencies.sourceName", String.class)) {
// mongo 3.6 returns null when no matches are found
if (dependency != null) {
sourceDependencies.add(dependency);
}
}
return sourceDependencies;
}
@Override
public List<String> nextFromQueue(int limit) {
FindIterable<Document> tasks =
queue
.find(new Document("attempt", new Document("$lte", new Date())))
.sort(new Document("priority", 1).append("attempt", 1))
.limit(limit);
List<String> ids = new ArrayList<>();
for (Document task : tasks) {
String id = task.getString("_id");
int newAttempts = task.getInteger("attempts") + 1;
// push next attempt `newAttempts` days into the future with some jitter
Instant newAttempt =
Instant.now()
.plus(newAttempts, ChronoUnit.DAYS)
.plusMillis(random.nextLong() % NEXT_ATTEMPT_JITTER);
ids.add(id);
queue.updateOne(
eq("_id", id),
new Document(
"$set",
new Document("attempt", Date.from(newAttempt)).append("attempts", newAttempts)));
}
return ids;
}
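// Retry backoff sketch for nextFromQueue: after the Nth attempt the next attempt is
// scheduled about N days out, shifted by up to half a day of jitter in either
// direction (NEXT_ATTEMPT_JITTER is 12 hours, and `nextLong() %` yields values in the
// open interval (-12h, +12h)). The jitter spreads retries of tasks that failed
// together so they do not all come due at the same instant.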
@Override
public void postConstruct(ConstructConfig config) {
this.sourceName = config.getSourceName();
ObjectNode params = config.getParams();
if (!params.has("database")) {
throw new IllegalArgumentException(
"The parameter `database` is required. This should be the name of the database to use.");
}
String uri = params.path("uri").asText("mongodb://localhost/");
database = params.path("database").asText();
MongoClientURI clientUri = new MongoClientURI(uri);
mongo = new MongoClient(clientUri);
init();
}
@Override
public void processOrphans(String groupId, Consumer<BoxDocument> function) {
Document groupDocument = groups.find(eq("_id", groupId)).first();
if (groupDocument != null) {
Date start = groupDocument.getDate("start");
for (Document document : metadata.find(and(eq("groupId", groupId), lt("processed", start)))) {
function.accept(convert(document));
}
}
}
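/**
 * Builds the {@code $project} stage for the given query, or null when no projection is
 * needed. Requested fields are mapped onto the joined shape produced by the pipeline:
 * {@code @doc}-prefixed and bare fields land under {@code document.*}, while
 * {@code @box.}-prefixed fields address the metadata directly. For example (sketch),
 * the field list {@code ["title", "@box.status"]} yields roughly
 * {@code { _id: 1, cursor: 1, status: 1, "document.title": 1 }}.
 *
 * @param query the query whose fields determine the projection
 * @return the projection document or null for no projection
 */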
private Document projectFields(BoxQuery query) {
if (query.getFields().isEmpty()) {
// no projection
return null;
}
boolean boxField = query.getFields().contains("@box");
boolean boxFields = query.getFields().stream().anyMatch(f -> f.startsWith("@box."));
Document projection = new Document("_id", 1).append("cursor", 1).append("status", 1);
if (boxField && !boxFields) {
projection.append("modified", 1);
projection.append("processed", 1);
projection.append("message", 1);
projection.append("facets", 1);
projection.append("dependencies", 1);
projection.append("groupId", 1);
}
for (String field : query.getFields()) {
if (field.equals("@doc")) {
projection.append("document", 1);
} else if (field.startsWith("@box.")) {
projection.append(field.substring(5), 1);
} else if (field.startsWith("@doc.")) {
projection.append("document." + field.substring(5), 1);
} else {
projection.append("document." + field, 1);
}
}
return projection;
}
@Override
public void removeDeleted(Duration olderThan) {
Date oldDate = Date.from(Instant.now().minus(olderThan));
Document query =
new Document("modified", new Document("$lt", oldDate))
.append("status", BoxDocument.Status.DELETED.toString());
for (Document document : metadata.find(query)) {
String id = document.getString("_id");
queue.deleteOne(eq("_id", id));
this.documents.deleteOne(eq("_id", id));
metadata.deleteOne(eq("_id", id));
}
}
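/**
 * Reduces the document's content hash to a single long (the first 8 bytes of
 * {@link BoxDocument#hash()}) so it can be stored and compared cheaply in mongo.
 *
 * @param document the document to hash
 * @return the hash folded into a long
 */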
private Long hashToLong(BoxDocument document) {
return ByteBuffer.wrap(document.hash()).getLong();
}
@Override
public void save(Collection<? extends BoxDocument> documents) {
List<WriteModel<Document>> documentsWrites = new ArrayList<>();
List<WriteModel<Document>> metadataWrites = new ArrayList<>();
// first gather all the hashes
List<String> ids = documents.stream().map(d -> d.getId()).collect(Collectors.toList());
Map<String, Long> documentHashes = new HashMap<>();
Set<String> processed = new HashSet<>();
for (Document doc :
metadata.find(in("_id", ids)).projection(new Document("hash", 1).append("status", 1))) {
String id = doc.getString("_id");
Long hash = doc.getLong("hash");
String status = doc.getString("status");
if (hash != null) {
documentHashes.put(id, hash);
}
if (status != null && (status.equals("READY") || status.equals("DELETED"))) {
processed.add(id);
}
}
List<String> unchanged = new ArrayList<>();
List<String> deleted = new ArrayList<>();
for (BoxDocument boxDocument : documents) {
String id = boxDocument.getId();
long hash = hashToLong(boxDocument);
if (documentHashes.containsKey(id) && documentHashes.get(id) == hash) {
unchanged.add(id);
} else {
ObjectNode document = boxDocument.toJson();
ObjectNode metadata = (ObjectNode) document.remove("@box");
document.put("_id", id);
metadata.put("_id", id);
metadata.remove("id");
metadata.put("hash", hash);
// handle the document
if (boxDocument.isDeleted()) {
deleted.add(id);
} else if (boxDocument.isReady()) {
documentsWrites.add(
new ReplaceOneModel<>(
eq("_id", id),
Document.parse(document.toString()),
new ReplaceOptions().upsert(true)));
}
// handle the metadata
if (boxDocument.isProcessed() || !processed.contains(id)) {
Document metadataDocument = Document.parse(metadata.toString());
Instant now = Instant.now();
metadataDocument.put("cursor", CursorUtils.nextCursor());
metadataDocument.put("modified", Date.from(now));
metadataDocument.put("processed", Date.from(now));
metadataWrites.add(
new ReplaceOneModel<>(
eq("_id", id), metadataDocument, new ReplaceOptions().upsert(true)));
}
}
}
documentsWrites.add(new DeleteManyModel<>(in("_id", deleted)));
metadataWrites.add(new UpdateManyModel<>(in("_id", unchanged), set("processed", new Date())));
this.documents.bulkWrite(documentsWrites);
this.metadata.bulkWrite(metadataWrites);
}
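// Change-detection sketch for save(): the stored 8-byte hash is compared against the
// incoming document's hash, so an unchanged document costs a single metadata touch
// (processed = now) instead of a full replace, and its harvest cursor is left alone.
// Only changed documents receive a new cursor value, which is what lets harvesters
// resume from a cursor without re-reading unchanged records.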
@Override
public void saveRegistryValue(String id, JsonNode data) {
Document document = new Document("_id", id);
document.append("value", Document.parse(data.toString()));
registry.replaceOne(eq("_id", id), document, new ReplaceOptions().upsert(true));
}
@Override
public void setHarvestCursor(ObjectNode cursor) {
this.cursor.replaceOne(
eq("_id", "cursor"),
new Document("_id", "cursor").append("cursor", Document.parse(cursor.toString())),
new ReplaceOptions().upsert(true));
}
@Override
public void startGroup(String groupId) {
groups.replaceOne(
eq("_id", groupId),
new Document("_id", groupId).append("start", new Date()),
new ReplaceOptions().upsert(true));
}
@Override
public void updateDependencies(Collection<? extends BoxDocument> documents) {
for (BoxDocument boxDocument : documents) {
ObjectNode document = boxDocument.toJson();
Object dependencies = Document.parse(document.path("@box").toString()).get("dependencies");
String operation = dependencies == null ? "$unset" : "$set";
this.metadata.updateOne(
eq("_id", boxDocument.getId()),
new Document(operation, new Document("dependencies", dependencies)));
}
}
@Override
public void updateProcessed(Collection<String> ids) {
metadata.updateMany(
eq("_id", new Document("$in", ids)),
new Document("$set", new Document("processed", new Date())));
}
@Override
public long count(BoxQuery query) {
if (query.isIdQuery()) {
return query.getIds().size();
}
List<Bson> pipeline = new ArrayList<>();
pipeline.add(gte("cursor", query.getCursorOrDefault()));
Set<BoxDocument.Status> statuses = query.getStatusesOrDefault();
if (!statuses.isEmpty()) {
pipeline.add(in("status", statuses.stream().map(s -> s.toString()).collect(toList())));
}
if (!query.getFacets().isEmpty()) {
Map<String, List<Bson>> facetMap =
query
.getFacets()
.stream()
.collect(
groupingBy(
Facet::getName,
mapping(
f -> and(eq("name", f.getName()), eq("value", f.getValue())), toList())));
facetMap.entrySet().forEach(f -> pipeline.add(in("facets", f.getValue())));
}
long count = metadata.countDocuments(and(pipeline));
return count;
}
/** Upgrades the database from earlier versions asynchronously. */
private void upgradeAsync(Box box, Source source) {
box.getThreadFactory().newThread(() -> upgrade(source)).start();
}
/** Upgrades the database from earlier versions. */
private void upgrade(Source source) {
Instant giveUp = Instant.now().plus(1, ChronoUnit.MINUTES);
while (Instant.now().isBefore(giveUp)) {
if (source.isPrimary()) {
upgradeTo2Dot2();
break;
}
try {
Thread.sleep(10000);
} catch (InterruptedException e) {
break;
}
}
}
/** Upgrading to version 2.2. This adds the hash if not present. */
private void upgradeTo2Dot2() {
boolean more = true;
while (more) {
List<String> batch = new ArrayList<>();
for (Document doc :
metadata
.find(exists("hash", false))
.projection(include("_id"))
.limit(UPGRADE_BATCH_SIZE)) {
batch.add(doc.getString("_id"));
}
if (batch.isEmpty()) {
more = false;
} else {
List<WriteModel<Document>> hashUpdates = new ArrayList<>();
for (BoxDocument document : find(new BoxQuery(batch))) {
long hash = hashToLong(document);
hashUpdates.add(
new UpdateOneModel<>(
and(eq("_id", document.getId()), exists("hash", false)), set("hash", hash)));
}
metadata.bulkWrite(hashUpdates);
}
}
}
}