com.marklogic.client.datamovement.WriteBatcher Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of marklogic-client-api Show documentation
The official MarkLogic Java client API.
The newest version!
/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.client.datamovement;

import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;

import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.ServerTransform;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.marker.AbstractWriteHandle;
import com.marklogic.client.io.marker.DocumentMetadataWriteHandle;

/**
 * To facilitate long-running write jobs, batches documents added by many
 * external threads and coordinates internal threads to send the batches
 * round-robin to all appropriate hosts in the cluster.  Appropriate hosts are
 * those containing a forest associated with the database for the
 * DatabaseClient provided to DataMovementManager.  Many external threads
 * (threads not managed by WriteBatcher) can concurrently add documents by
 * calling WriteBatcher {@link #add add} or {@link #addAs addAs}.  Each
 * time enough documents are added to make a batch, the batch is added to an
 * internal queue where the first available internal thread will pick it up and
 * write it to the server.  Since batches are not written until they are full,
 * you should always call {@link #flushAsync} or {@link #flushAndWait} when no
 * more documents will be written to ensure that any partial batch is written.
 *
 * <p>Sample Usage:</p>
 *
 * <pre>{@code
 *     WriteBatcher whb = dataMovementManager.newWriteBatcher()
 *         .withBatchSize(100)
 *         .withThreadCount(20)
 *         .onBatchSuccess(batch -> {
 *             logger.debug("batch # {}, so far: {}", batch.getJobBatchNumber(), batch.getJobWritesSoFar());
 *         })
 *         .onBatchFailure((batch,throwable) -> throwable.printStackTrace() );
 *     JobTicket ticket = dataMovementManager.startJob(whb);
 *     whb.add  ("doc1.txt", new StringHandle("doc1 contents"));
 *     whb.addAs("doc2.txt", "doc2 contents");
 *
 *     whb.flushAndWait(); // send the two docs even though they're not a full batch
 *     dataMovementManager.stopJob(ticket);
 * }</pre>
 *
 * <p>Note: All Closeable content or metadata handles passed to {@link #add add}
 * methods will be closed as soon as possible (after the batch is written).
 * This is to avoid IO resource leakage.  This differs from the normal usage of
 * the Java Client API because WriteBatcher is asynchronous so there's no
 * easy way to know which handles have finished writing and can therefore be
 * closed.  So to save confusion we close all handles for you.  If you have a
 * resource that must be closed after a batch is written, but is not closed by
 * your handle, override the close method of any Closeable handle and close
 * your resource there.</p>
 */
public interface WriteBatcher extends Batcher {
  /**
   * Sets the DocumentMetadataHandle for write operations.
   *
   * @param handle the passed in DocumentMetadataHandle
   * @return this write batcher for chaining configuration
   */
  WriteBatcher withDefaultMetadata(DocumentMetadataHandle handle);

  /**
   * Writes a document stream to the database.
   *
   * @param operations the stream of
   *          {@link com.marklogic.client.document.DocumentWriteOperation DocumentWriteOperation}
   *          instances (or subtypes) to write
   */
  void addAll(Stream<? extends DocumentWriteOperation> operations);

  /**
   * Gets the DocumentMetadataHandle associated with this WriteBatcher.
   *
   * @return the DocumentMetadataHandle associated with the WriteBatcher
   */
  DocumentMetadataHandle getDocumentMetadata();

  /**
   * Add a document to be batched then written to the server when a batch is full
   * or {@link #flushAsync} or {@link #flushAndWait} is called.
   *
   * <p>See also the Java Guide for more on using handles.</p>
   *
   * @param uri the document uri
   * @param contentHandle the document contents
   * @return WriteBatcher the batcher containing the documents added
   */
  WriteBatcher add(String uri, AbstractWriteHandle contentHandle);

  /**
   * Add a document to be batched then written to the server when a batch is full
   * or {@link #flushAsync} or {@link #flushAndWait} is called.
   *
   * <p>See also "IO Shortcut in MarkLogic Java Client API" for more on using
   * the *As shortcut methods.</p>
   *
   * @param uri the document uri
   * @param content the document contents
   * @return WriteBatcher the batcher containing the documents added
   */
  WriteBatcher addAs(String uri, Object content);

  /**
   * Add a document to be batched then written to the server when a batch is full
   * or {@link #flushAsync} or {@link #flushAndWait} is called.
   *
   * <p>See also the Java Guide for more on using handles.</p>
   *
   * @param uri the document uri
   * @param metadataHandle the metadata (collection, permissions, metadata values, properties, quality)
   * @param contentHandle the document contents
   * @return WriteBatcher the batcher containing the documents added
   */
  WriteBatcher add(String uri, DocumentMetadataWriteHandle metadataHandle,
                   AbstractWriteHandle contentHandle);

  /**
   * Add a document to be batched then written to the server when a batch is full
   * or {@link #flushAsync} or {@link #flushAndWait} is called.
   *
   * <p>See also "IO Shortcut in MarkLogic Java Client API" for more on using
   * the *As shortcut methods.</p>
   *
   * @param uri the document uri
   * @param metadataHandle the metadata (collection, permissions, metadata values, properties, quality)
   * @param content the document contents
   * @return WriteBatcher the batcher containing the documents added
   */
  WriteBatcher addAs(String uri, DocumentMetadataWriteHandle metadataHandle,
                     Object content);

  /**
   * Add docs in the form of WriteEvents.  This is a convenience method for re-adding
   * documents from failed batches.
   *
   * @param docs the batch of WriteEvents where each WriteEvent represents one document
   * @return WriteBatcher the batcher containing the documents added
   */
  WriteBatcher add(WriteEvent... docs);

  /**
   * Add a document, by passing in a
   * {@link com.marklogic.client.document.DocumentWriteOperation DocumentWriteOperation},
   * to be batched and then written to the server when a batch is full
   * or {@link #flushAsync} or {@link #flushAndWait} is called.
   *
   * @param writeOperation the DocumentWriteOperation object containing
   *          the document's details to be written to the server
   * @return WriteBatcher the batcher containing the documents added
   */
  WriteBatcher add(DocumentWriteOperation writeOperation);

  /**
   * Add a listener to run each time a batch is successfully written.
   *
   * @param listener the action which has to be done when the batch gets written
   *        successfully
   * @return this instance for method chaining
   */
  WriteBatcher onBatchSuccess(WriteBatchListener listener);

  /**
   * Add a listener to run each time there is an exception writing a batch.
   *
   * <p>These listeners will not run when an exception is thrown by a listener
   * registered with onBatchSuccess.  To learn more, please see the
   * "Handling Exceptions in Listeners" documentation.</p>
   *
   * @param listener the code to run when a failure occurs
   * @return this instance for method chaining
   */
  WriteBatcher onBatchFailure(WriteFailureListener listener);

  /**
   * Retry in the same thread to send a batch that failed. This method will
   * throw an Exception if it fails again, so it can be wrapped in a try-catch
   * block.
   *
   * @param queryEvent the information about the batch that failed
   */
  void retry(WriteBatch queryEvent);

  /*
  public WriteBatcher withTransactionSize(int transactionSize);
  public int getTransactionSize();
  */

  /**
   * Get the array of WriteBatchListener instances registered via
   * onBatchSuccess.
   *
   * @return the WriteBatchListener instances this batcher
   *   is using
   */
  WriteBatchListener[] getBatchSuccessListeners();

  /**
   * Get the array of WriteFailureListener instances
   * registered via onBatchFailure including the HostAvailabilityListener
   * registered by default.
   *
   * @return the WriteFailureListener instances this batcher
   *   is using
   */
  WriteFailureListener[] getBatchFailureListeners();

  /**
   * Remove any existing WriteBatchListener instances registered
   * via onBatchSuccess and replace them with the provided listeners.
   *
   * @param listeners the WriteBatchListener instances this
   *   batcher should use
   */
  void setBatchSuccessListeners(WriteBatchListener... listeners);

  /**
   * Remove any existing WriteFailureListener instances
   * registered via onBatchFailure including the HostAvailabilityListener
   * registered by default and replace them with the provided listeners.
   *
   * @param listeners the WriteFailureListener instances this
   *   batcher should use
   */
  void setBatchFailureListeners(WriteFailureListener... listeners);

  /**
   * The temporal collection to use for a temporal document insert
   *
   * @param collection The temporal collection to use for a temporal document insert
   *
   * @return this instance for method chaining
   */
  WriteBatcher withTemporalCollection(String collection);

  /**
   * The temporal collection configured for temporal document inserts
   *
   * @return The temporal collection configured for temporal document inserts
   */
  String getTemporalCollection();

  /**
   * The ServerTransform to modify each document from each batch before it is
   * written to the database.
   *
   * @param transform The ServerTransform to run on each document from each batch.
   *
   * @return this instance for method chaining
   */
  WriteBatcher withTransform(ServerTransform transform);

  /**
   * The ServerTransform configured to modify each document from each batch
   * before it is written to the database.
   *
   * @return the ServerTransform configured for this batcher (see
   *         {@link #withTransform})
   */
  ServerTransform getTransform();

  /**
   * If the server forest configuration changes mid-job, it can be re-fetched
   * with {@link DataMovementManager#readForestConfig} then set via
   * withForestConfig.
   *
   * @param forestConfig the updated ForestConfiguration
   *
   * @return this instance for method chaining
   */
  @Override
  WriteBatcher withForestConfig(ForestConfiguration forestConfig);

  /**
   * Sets the job name.  Eventually, this may become useful for seeing named
   * jobs in ops director.
   *
   * @param jobName the name to assign to this job
   * @return this instance for method chaining
   */
  @Override
  WriteBatcher withJobName(String jobName);

  /**
   * Sets the unique id of the job to help with managing multiple concurrent jobs and
   * start the job with the specified job id.
   *
   * @param jobId the unique id you would like to assign to this job
   * @return this instance (for method chaining)
   */
  WriteBatcher withJobId(String jobId);

  /**
   * Sets the number of documents to send per batch. Since documents are large
   * relative to uris, this number should be much lower than the batch size for
   * QueryBatcher. The default batch size is 100.
   *
   * @param batchSize the number of documents to send per batch
   * @return this instance for method chaining
   */
  @Override
  WriteBatcher withBatchSize(int batchSize);

  /**
   * Sets the number of threads added to the internal thread pool for this
   * instance to use for writing or reporting on batches of uris.  Each time
   * enough documents are added to fill a batch, a batch is created and a task
   * is queued to write the batch.  As a thread becomes available it grabs a
   * task from the queue and performs the task (usually writing the batch to
   * the server then reporting on the batch to listeners registered with
   * onBatchSuccess and onBatchFailure).  By default the number of threads is
   * the number of hosts containing applicable forests.  More threads should
   * accommodate more throughput.
   *
   * @param threadCount the number of threads to add to the internal thread pool
   * @return this instance for method chaining
   */
  @Override
  WriteBatcher withThreadCount(int threadCount);

  /**
   * Create a batch from any unbatched documents and write that batch
   * asynchronously.
   */
  void flushAsync();

  /**
   * Create a batch from any unbatched documents and write that batch, then
   * wait for all batches to complete (the same as awaitCompletion()).
   */
  void flushAndWait();

  /**
   * Blocks until the job has finished or cancelled all queued tasks.
   *
   * @return true if the queue completed without InterruptedException, false if
   *         we hit the time limit or InterruptedException was thrown while waiting
   */
  boolean awaitCompletion();

  /**
   * Blocks until the job has finished or cancelled all queued tasks.
   *
   * @param timeout the maximum time to wait
   * @param unit the time unit of the timeout argument
   *
   * @return true if the queue completed without timing out, false if we hit the time limit
   * @throws InterruptedException if interrupted while waiting
   */
  boolean awaitCompletion(long timeout, TimeUnit unit) throws InterruptedException;

  /**
   * After the job has been started, returns the JobTicket generated when the
   * job was started.
   *
   * @return the JobTicket generated when this job was started
   *
   * @throws IllegalStateException if this job has not yet been started
   */
  JobTicket getJobTicket();

  /**
   * Retry in the same thread to send a batch that failed. If it fails again,
   * all the failure listeners associated with the batcher using onBatchFailure
   * method would be processed.
   *
   * <p>Note : Use this method with caution as there is a possibility of infinite
   * loops. If a batch fails and one of the failure listeners calls this method
   * to retry with failure listeners and if the batch again fails, this would go
   * on as an infinite loop until the batch succeeds.</p>
   *
   * @param writeBatch the information about the batch that failed
   */
  void retryWithFailureListeners(WriteBatch writeBatch);
}