// edu.byu.hbll.box.BoxDatabase, from the box-core artifact (Maven / Gradle / Ivy).
// Note: there is a newer version of box-core than the one shown here: 2.5.3.
package edu.byu.hbll.box;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.time.Duration;
import java.time.Instant;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;
import java.util.stream.Collectors;

/**
 * A database to hold processed documents for later retrieval and all other metadata and controls
 * used for processing the documents.
 *
 * Implementations should have a document repository with Box metadata, a queue for documents to
 * be processed, and maintain harvest cursors and group start times and end times.
 *
 * 
This api is intended for Box itself to use. It is not intended as a general use api. See
 * {@link Source} for the general use api.
 *
 * @author Charles Draper
 */
public interface BoxDatabase extends BoxConfigurable {

  /**
   * Used to mark old documents for reprocessing. Any ready documents that were last processed
   * before (NOW - olderThan) will be placed on the queue.
   *
   * @param olderThan target documents older than this age
   * @return number of documents added to queue
   */
  int addToQueue(Duration olderThan);

  /**
   * Used to mark old documents for reprocessing. Any ready documents that were last processed
   * before (NOW - olderThan) will be placed on the queue.
   *
   * 
Adding items to the queue can be costly if there are many items to be added and this method
   * is called often. You can specify to useHeuristics to speed up calls to this method at the cost
   * of not being thorough about adding all relevant items. This is useful for ongoing maintenance
   * such as for reprocessAge and reprocessSchedule. The heuristic is implementation specific. An
   * example heuristic is to not add anything if there is still an item on the queue from a previous
   * maintenance addToQueue call. Eventually all relevant items should be added to the queue.
   *
   * @param olderThan target documents older than this age
   * @param useHeuristics a hint to use estimating when adding
   * @return number of documents added to queue
   */
  default int addToQueue(Duration olderThan, boolean useHeuristics) {
    return addToQueue(olderThan);
  }

  /**
   * Adds elements to the process queue. If the id already exists in the queue, this operation has
   * no effect unless overwrite is true.
   *
   * @param elements the elements to be added
   */
  default void addToQueue(Collection elements) {
    throw new UnsupportedOperationException();
  }

  /**
   * Adds a collection of ids to the queue to be processed at the given time. If the id already
   * exists in the queue, this operation has no effect unless overwrite is true.
   *
   * @param ids the ids to be added
   * @param attempt do not attempt to process until this time
   * @param overwrite true to force an update on the attempt time of an existing entry, false to
   *     keep the existing attempt time
   * @deprecated use {@link #addToQueue(Collection)} instead if implemented
   */
  @Deprecated
  default void addToQueue(Collection ids, Instant attempt, boolean overwrite) {
    addToQueue(
        ids.stream()
            .map(id -> new QueueEntry(id).setAttempt(attempt).setOverwrite(overwrite))
            .collect(Collectors.toList()));
  }

  /**
   * Deletes the given document ids from the queue signifying that the processing of the documents
   * was successful.
   *
   * @param ids the document ids to delete
   */
  void deleteFromQueue(Collection ids);

  /**
   * Finds documents dependent on the given dependencies.
   *
   * @param dependencies the dependencies
   * @return all dependent documents per dependency, key is dependency, value is dependents
   */
  Map> findDependents(Collection dependencies);

  /**
   * Finds documents in the database according to the given query. If the query specifies ids,
   * documents should be returned in the same order as ids and an unprocessed document should be
   * created for missing documents. For id queries, all corresponding documents are returned in one
   * page so limit is ignored. The database is not responsible for processing documents so the
   * process and wait directives are ignored. The metadataLevel and metadataOnly directives are
   * honored.
   *
   * @param query the query to use
   * @return matching documents
   */
  QueryResult find(BoxQuery query);

  /**
   * Returns the harvest cursor for this source. Whatever is set using {@link
   * #setHarvestCursor(ObjectNode)} should be returned here.
   *
   * @return the harvest cursor
   */
  ObjectNode getHarvestCursor();

  /**
   * Finds the unique set of all sources this source is dependent on.
   *
   * @return the set of dependency source names
   */
  Set listSourceDependencies();

  /**
   * Return the next batch of ids from the queue.
   *
   * @param limit size of the batch to return
   * @return the next ids in the queue
   */
  List nextFromQueue(int limit);

  /**
   * Using the timestamp saved in the database when {@link #startGroup(String)} was called, executes
   * the given function on each document processed before that group start time. Removes the
   * timestamp.
   *
   * @param groupId documents belonging to this group should be processed
   * @param function the function to run on each document
   */
  void processOrphans(String groupId, Consumer function);

  /**
   * Removes all traces of documents that were deleted more than olderThan ago.
   *
   * @param olderThan the age of the deleted documents to remove
   */
  void removeDeleted(Duration olderThan);

  /**
   * Save this collection of documents to the database.
   *
   * 
Implementations should store appropriate cursor, modified and
   * processed fields regardless of the current values in the provided documents. The
   * cursor and modified fields should only be updated if the document's non-volatile fields have
   * changed. A simple way to check this is to use the document hash (see {@link
   * BoxDocument#hash()}). The processed field should be updated on every save
   * regardless of changes.
   *
   * 
Note that unprocessed documents (UNPROCESSED|ERROR) never
   * overwrite processed documents (READY| DELETED), but unprocessed
   * documents can be saved otherwise. This can be useful when a processor is building a document's
   * dependency list prior to making it ready.
   *
   * @param documents documents to save
   */
  void save(Collection documents);

  /**
   * Updates this collection of documents with their specified dependencies. Nothing else is
   * updated.
   *
   * @param documents documents to update
   * @deprecated this is now handled by the save operation
   */
  @Deprecated
  void updateDependencies(Collection documents);

  /**
   * Stores this cursor in the database for later retrieval by {@link #getHarvestCursor()}.
   *
   * @param cursor the cursor object to store
   */
  void setHarvestCursor(ObjectNode cursor);

  /**
   * Marks time in the database that the given group has started. If the group has already been
   * started, but not finished when this is called, leave the existing start time.
   *
   * @param groupId the id of the group
   */
  void startGroup(String groupId);

  /**
   * Simply sets the processed date of the documents identified by the given ids to NOW. Does
   * nothing if the document doesn't exist.
   *
   * @param ids the ids of the document
   * @deprecated this functionality is now handled in the save operation
   */
  @Deprecated
  void updateProcessed(Collection ids);

  /**
   * Finds the dependencies for the given document ids.
   *
   * @param ids the document ids
   * @return the dependencies for the given documents, key is the document id, value is the
   *     dependencies
   */
  Map> findDependencies(Collection ids);

  /**
   * Retrieves a single entry from the registry. The registry is a general place for individual
   * entries important for the normal function of box. IDs should uniquely identify the type of
   * entry.
   *
   * @param id the id of the entry
   * @return the entry value denoted by the id or null if not found
   */
  JsonNode findRegistryValue(String id);

  /**
   * Saves an entry to the registry denoted by id. The registry is a general place for individual
   * entries important for the normal function of box. IDs should uniquely identify the type of
   * entry.
   *
   * @param id the id of the entry
   * @param value the value of the entry
   */
  void saveRegistryValue(String id, JsonNode value);

  /**
   * Clears the box database for the source.
   *
   * 
Default operation is to throw an {@link UnsupportedOperationException}.
   */
  default void clear() {
    throw new UnsupportedOperationException();
  }

  /**
   * Returns a count of the number of documents that match the query. For id type queries, the count
   * is the same as the number of requested ids. For harvest type queries, the limit
   * parameter is ignored, but all other parameters are honored.
   *
   * Default operation is to throw an {@link UnsupportedOperationException}.
   *
   * @param query the query
   * @return number of documents matching the query
   */
  default long count(BoxQuery query) {
    throw new UnsupportedOperationException();
  }
}