// All downloads are free. Search and download functionality uses the official Maven repository.
//
// edu.byu.hbll.box.BoxDatabase Maven / Gradle / Ivy
//
// There is a newer version: 2.5.3
// Show newest version
package edu.byu.hbll.box;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.time.Duration;
import java.time.Instant;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;
import java.util.stream.Collectors;

/**
 * A database to hold processed documents for later retrieval and all other metadata and controls
 * used for processing the documents.
 *
 * 

Implementations should have a document repository with Box metadata, a queue for documents to * be processed, and maintain harvest cursors and group start times and end times. * *

This api is intended for Box itself to use. It is not intended as a general use api. See * {@link Source} for the general use api. * * @author Charles Draper */ public interface BoxDatabase extends BoxConfigurable { /** * Used to mark old documents for reprocessing. Any ready documents that were last processed * before (NOW - olderThan) will be placed on the queue. * * @param olderThan target documents older than this age * @return number of documents added to queue */ int addToQueue(Duration olderThan); /** * Used to mark old documents for reprocessing. Any ready documents that were last processed * before (NOW - olderThan) will be placed on the queue. * *

Adding items to the queue can be costly if there are many items to be added and this method * is called often. You can specify to useHeuristics to speed up calls to this method at the cost * of not being thorough about adding all relevant items. This is useful for ongoing maintenance * such as for reprocessAge and reprocessSchedule. The heuristic is implementation specific. An * example heuristic is to not add anything if there is still an item on the queue from a previous * maintenance addToQueue call. Eventually all relevant items should be added to the queue. * * @param olderThan target documents older than this age * @param useHeuristics a hint to use estimating when adding * @return number of documents added to queue */ default int addToQueue(Duration olderThan, boolean useHeuristics) { return addToQueue(olderThan); } /** * Adds elements to the process queue. If the id already exists in the queue, this operation has * no effect unless overwrite is true. * * @param elements the elements to be added */ default void addToQueue(Collection elements) { throw new UnsupportedOperationException(); } /** * Adds a collection of ids to the queue to be processed at the given time. If the id already * exists in the queue, this operation has no effect unless overwrite is true. * * @param ids the ids to be added * @param attempt do not attempt to process until this time * @param overwrite true to force an update on the attempt time of an existing entry, false to * keep the existing attempt time * @deprecated use {@link #addToQueue(Collection)} instead if implemented */ @Deprecated default void addToQueue(Collection ids, Instant attempt, boolean overwrite) { addToQueue( ids.stream() .map(id -> new QueueEntry(id).setAttempt(attempt).setOverwrite(overwrite)) .collect(Collectors.toList())); } /** * Deletes the given document ids from the queue signifying that the processing of the documents * was successful. 
* * @param ids the document ids to delete */ void deleteFromQueue(Collection ids); /** * Finds documents dependent on the given dependencies. * * @param dependencies the dependencies * @return all dependent documents per dependency, key is dependency, value is dependents */ Map> findDependents(Collection dependencies); /** * Finds documents in the database according to the given query. If the query specifies ids, * documents should be returned in the same order as ids and an unprocessed document should be * created for missing documents. For id queries, all corresponding documents are returned in one * page so limit is ignored. The database is not responsible for processing documents so the * process and wait directives are ignored. The metadataLevel and metadataOnly directives are * honored. * * @param query the query to use * @return matching documents */ QueryResult find(BoxQuery query); /** * Returns the harvest cursor for this source. Whatever is set using {@link * #setHarvestCursor(ObjectNode)} should be returned here. * * @return the harvest cursor */ ObjectNode getHarvestCursor(); /** * Finds the unique set of all sources this source is dependent on. * * @return the set of dependency source names */ Set listSourceDependencies(); /** * Return the next batch of ids from the queue. * * @param limit size of the batch to return * @return the next ids in the queue */ List nextFromQueue(int limit); /** * Using the timestamp saved in the database when {@link #startGroup(String)} was called, executes * the given function on each document processed before that group start time. Removes the * timestamp. * * @param groupId documents belonging to this group should be processed * @param function the function to run on each document */ void processOrphans(String groupId, Consumer function); /** * Removes all traces of documents that were deleted more than olderThan ago. 
* * @param olderThan the age of the deleted documents to remove */ void removeDeleted(Duration olderThan); /** * Save this collection of documents to the database. * *

Implementations should store appropriate cursor, modified and * processed fields regardless of the current values in the provided documents. The * cursor and modified fields should only be updated if the document's non-volatile fields have * changed. A simple way to check this is to use the document hash (see {@link * BoxDocument#hash()}). The processed field should be updated on every save * regardless of changes. * *

Note that unprocessed documents (UNPROCESSED|ERROR) never * overwrite processed documents (READY| DELETED), but unprocessed * documents can be saved otherwise. This can be useful when a processor is building a document's * dependency list prior to making it ready. * * @param documents documents to save */ void save(Collection documents); /** * Updates this collection of documents with their specified dependencies. Nothing else is * updated. * * @param documents documents to update * @deprecated this is now handled by the save operation */ @Deprecated void updateDependencies(Collection documents); /** * Stores this cursor in the database for later retrieval by {@link #getHarvestCursor()}. * * @param cursor the cursor object to store */ void setHarvestCursor(ObjectNode cursor); /** * Marks time in the database that the given group has started. If the group has already been * started, but not finished when this is called, leave the existing start time. * * @param groupId the id of the group */ void startGroup(String groupId); /** * Simply sets the processed date of the documents identified by the given ids to NOW. Does * nothing if the document doesn't exist. * * @param ids the ids of the document * @deprecated this functionality is now handled in the save operation */ @Deprecated void updateProcessed(Collection ids); /** * Finds the dependencies for the given document ids. * * @param ids the document ids * @return the dependencies for the given documents, key is the document id, value is the * dependencies */ Map> findDependencies(Collection ids); /** * Retrieves a single entry from the registry. The registry is a general place for individual * entries important for the normal function of box. IDs should uniquely identify the type of * entry. * * @param id the id of the entry * @return the entry value denoted by the id or null if not found */ JsonNode findRegistryValue(String id); /** * Saves an entry to the registry denoted by id. 
The registry is a general place for individual * entries important for the normal function of box. IDs should uniquely identify the type of * entry. * * @param id the id of the entry * @param value the value of the entry */ void saveRegistryValue(String id, JsonNode value); /** * Clears the box database for the source. * *

Default operation is to throw an {@link UnsupportedOperationException}. */ default void clear() { throw new UnsupportedOperationException(); } /** * Returns a count of the number of documents that match the query. For id type queries, the count * is the same as the number of requested ids. For harvest type queries, the limit * parameter is ignored, but all other parameters are honored. * *

Default operation is to throw an {@link UnsupportedOperationException}. * * @param query the query * @return number of documents matching the query */ default long count(BoxQuery query) { throw new UnsupportedOperationException(); } }





// © 2015 - 2024 Weber Informatics LLC | Privacy Policy