// edu.byu.hbll.box.BoxDatabase (Maven / Gradle / Ivy)
package edu.byu.hbll.box;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.time.Duration;
import java.time.Instant;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;
import java.util.stream.Collectors;
/**
* A database to hold processed documents for later retrieval and all other metadata and controls
* used for processing the documents.
*
* <p>Implementations should have a document repository with Box metadata, a queue for documents to
* be processed, and maintain harvest cursors and group start times and end times.
*
* <p>This API is intended for Box itself to use. It is not intended as a general-use API. See
* {@link Source} for the general use api.
*
* @author Charles Draper
*/
public interface BoxDatabase extends BoxConfigurable {
/**
* Creates a {@link BoxDatabase} given the configuration.
*
* @param sourceName the source name
* @param objectFactory the object factory to use
* @param config the box object configuration
* @return the newly created box database
*/
public static BoxDatabase create(
String sourceName, ObjectFactory objectFactory, JsonNode config) {
return (BoxDatabase)
BoxConfigurable.create(sourceName, objectFactory, ObjectType.BOX_DATABASE, config);
}
/**
* Used to mark old documents for reprocessing. Any ready documents that were last processed
* before (NOW - olderThan) will be placed on the queue.
*
* @param olderThan target documents older than this age
* @return number of documents added to queue
*/
int addToQueue(Duration olderThan);
/**
* Used to mark old documents for reprocessing. Any ready documents that were last processed
* before (NOW - olderThan) will be placed on the queue.
*
*
Adding items to the queue can be costly if there are many items to be added and this method
* is called often. You can specify to useHeuristics to speed up calls to this method at the cost
* of not being thorough about adding all relevant items. This is useful for ongoing maintenance
* such as for reprocessAge and reprocessSchedule. The heuristic is implementation specific. An
* example heuristic is to not add anything if there is still an item on the queue from a previous
* maintenance addToQueue call. Eventually all relevant items should be added to the queue.
*
* @param olderThan target documents older than this age
* @param useHeuristics a hint to use estimating when adding
* @return number of documents added to queue
*/
default int addToQueue(Duration olderThan, boolean useHeuristics) {
return addToQueue(olderThan);
}
/**
* Adds elements to the process queue. If the id already exists in the queue, this operation has
* no effect unless overwrite is true.
*
* @param elements the elements to be added
*/
default void addToQueue(Collection extends QueueEntry> elements) {
throw new UnsupportedOperationException();
}
/**
* Adds a collection of ids to the queue to be processed at the given time. If the id already
* exists in the queue, this operation has no effect unless overwrite is true.
*
* @param ids the ids to be added
* @param attempt do not attempt to process until this time
* @param overwrite true to force an update on the attempt time of an existing entry, false to
* keep the existing attempt time
* @deprecated use {@link #addToQueue(Collection)} instead if implemented
*/
@Deprecated
default void addToQueue(Collection ids, Instant attempt, boolean overwrite) {
addToQueue(
ids.stream()
.map(id -> new QueueEntry(id).setAttempt(attempt).setOverwrite(overwrite))
.collect(Collectors.toList()));
}
/**
* Deletes the given document ids from the queue signifying that the processing of the documents
* was successful.
*
* @param ids the document ids to delete
*/
void deleteFromQueue(Collection ids);
/**
* Finds documents dependent on the given dependencies.
*
* @param dependencies the dependencies
* @return all dependent documents per dependency, key is dependency, value is dependents
*/
Map> findDependents(Collection dependencies);
/**
* Finds documents in the database according to the given query. If the query specifies ids,
* documents should be returned in the same order as ids and an unprocessed document should be
* created for missing documents. For id queries, all corresponding documents are returned in one
* page so limit is ignored. The database is not responsible for processing documents so the
* process and wait directives are ignored. The metadataLevel and metadataOnly directives are
* honored.
*
* @param query the query to use
* @return matching documents
*/
QueryResult find(BoxQuery query);
/**
* Returns the harvest cursor for this source. Whatever is set using {@link
* #setHarvestCursor(ObjectNode)} should be returned here.
*
* @return the harvest cursor
*/
ObjectNode getHarvestCursor();
/**
* Finds the unique set of all sources this source is dependent on.
*
* @return the set of dependency source names
*/
Set listSourceDependencies();
/**
* Return the next batch of ids from the queue.
*
* @param limit size of the batch to return
* @return the next ids in the queue
*/
List nextFromQueue(int limit);
/**
* Using the timestamp saved in the database when {@link #startGroup(String)} was called, executes
* the given function on each document processed before that group start time. Removes the
* timestamp.
*
* @param groupId documents belonging to this group should be processed
* @param function the function to run on each document
*/
void processOrphans(String groupId, Consumer function);
/**
* Removes all traces of documents that were deleted more than olderThan ago.
*
* @param olderThan the age of the deleted documents to remove
*/
void removeDeleted(Duration olderThan);
/**
* Save this collection of documents to the database.
*
* Implementations should store appropriate cursor
, modified
and
* processed
fields regardless of the current values in the provided documents. The
* cursor and modified fields should only be updated if the document's non-volatile fields have
* changed. A simple way to check this is to use the document hash (see {@link
* BoxDocument#hash()}). The processed
field should be updated on every save
* regardless of changes.
*
*
Note that unprocessed documents (UNPROCESSED
|ERROR
) never
* overwrite processed documents (READY
| DELETED
), but unprocessed
* documents can be saved otherwise. This can be useful when a processor is building a document's
* dependency list prior to making it ready.
*
* @param documents documents to save
*/
void save(Collection extends BoxDocument> documents);
/**
* Updates this collection of documents with their specified dependencies. Nothing else is
* updated.
*
* @param documents documents to update
* @deprecated this is now handled by the save operation
*/
@Deprecated
void updateDependencies(Collection extends BoxDocument> documents);
/**
* Stores this cursor in the database for later retrieval by {@link #getHarvestCursor()}.
*
* @param cursor the cursor object to store
*/
void setHarvestCursor(ObjectNode cursor);
/**
* Marks time in the database that the given group has started. If the group has already been
* started, but not finished when this is called, leave the existing start time.
*
* @param groupId the id of the group
*/
void startGroup(String groupId);
/**
* Simply sets the processed date of the documents identified by the given ids to NOW. Does
* nothing if the document doesn't exist.
*
* @param ids the ids of the document
* @deprecated this functionality is now handled in the save operation
*/
@Deprecated
void updateProcessed(Collection ids);
/**
* Finds the dependencies for the given document ids.
*
* @param ids the document ids
* @return the dependencies for the given documents, key is the document id, value is the
* dependencies
*/
Map> findDependencies(Collection ids);
/**
* Retrieves a single entry from the registry. The registry is a general place for individual
* entries important for the normal function of box. IDs should uniquely identify the type of
* entry.
*
* @param id the id of the entry
* @return the entry value denoted by the id or null if not found
*/
JsonNode findRegistryValue(String id);
/**
* Saves an entry to the registry denoted by id. The registry is a general place for individual
* entries important for the normal function of box. IDs should uniquely identify the type of
* entry.
*
* @param id the id of the entry
* @param value the value of the entry
*/
void saveRegistryValue(String id, JsonNode value);
/**
* Clears the box database for the source.
*
* Default operation is to throw an {@link UnsupportedOperationException}.
*/
default void clear() {
throw new UnsupportedOperationException();
}
/**
* Returns a count of the number of documents that match the query. For id type queries, the count
* is the same as the number of requested ids. For harvest type queries, the limit
* parameter is ignored, but all other parameters are honored.
*
*
Default operation is to throw an {@link UnsupportedOperationException}.
*
* @param query the query
* @return number of documents matching the query
*/
default long count(BoxQuery query) {
throw new UnsupportedOperationException();
}
}