
/*
* Copyright © 2025 MarkLogic Corporation. All Rights Reserved.
*/
package com.marklogic.spark.writer;
import com.marklogic.client.DatabaseClient;
import com.marklogic.client.datamovement.DataMovementManager;
import com.marklogic.client.datamovement.WriteBatch;
import com.marklogic.client.datamovement.WriteBatcher;
import com.marklogic.client.datamovement.WriteEvent;
import com.marklogic.client.document.GenericDocumentManager;
import com.marklogic.client.document.ServerTransform;
import com.marklogic.client.impl.GenericDocumentImpl;
import com.marklogic.client.io.Format;
import com.marklogic.spark.*;
import com.marklogic.spark.core.splitter.ChunkAssemblerFactory;
import com.marklogic.spark.reader.document.DocumentRowSchema;
import com.marklogic.spark.reader.file.TripleRowSchema;
import org.apache.spark.sql.types.StructType;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.stream.Stream;
public class WriteContext extends ContextSupport {
    static final long serialVersionUID = 1;

    private final StructType schema;
    private final boolean usingFileSchema;
    private final int batchSize;

    private int fileSchemaContentPosition;
    private int fileSchemaPathPosition;

    // This unfortunately is not final, as we don't know it when this object is created.
    private int numPartitions;

    public WriteContext(StructType schema, Map<String, String> properties) {
        super(properties);
        this.schema = schema;
        this.batchSize = getIntOption(Options.WRITE_BATCH_SIZE, 100, 1);

        // We support the Spark binaryFile schema - https://spark.apache.org/docs/latest/sql-data-sources-binaryFile.html -
        // so that the binaryFile reader can be reused for loading files as-is.
        final List<String> names = Arrays.asList(this.schema.fieldNames());
        this.usingFileSchema = names.size() == 4 && names.contains("path") && names.contains("content")
            && names.contains("modificationTime") && names.contains("length");
        if (this.usingFileSchema) {
            // Per the Spark docs, we expect positions 0 and 3 here, but looking them up just to be safe.
            this.fileSchemaPathPosition = names.indexOf("path");
            this.fileSchemaContentPosition = names.indexOf("content");
        }
    }
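    // Minimal usage sketch (an assumption for illustration, not part of this class): rows matching the
    // Spark binaryFile schema can be produced by Spark's built-in binaryFile data source and then written
    // through this connector, e.g.:
    //   Dataset<Row> files = spark.read().format("binaryFile").load("/path/to/files");
    //   files.write().format("marklogic").options(connectionOptions).mode("append").save();
    // The "marklogic" format name and the connectionOptions map are assumptions here.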
    public StructType getSchema() {
        return schema;
    }
    /**
     * @return the total number of threads to use across all partitions. This is typically how a user thinks about
     * threading, as they are not likely to know how many partitions will be created. But they will typically know
     * how many hosts are in their MarkLogic cluster and how many threads are available to an app server on each host.
     */
    int getTotalThreadCount() {
        return getIntOption(Options.WRITE_THREAD_COUNT, 4, 1);
    }

    /**
     * @return the thread count to use per partition where a user has specified the total thread count across all
     * partitions.
     */
    int getCalculatedThreadCountPerPartition() {
        int threadCount = getTotalThreadCount();
        if (this.numPartitions > 0) {
            return (int) Math.ceil((double) threadCount / (double) numPartitions);
        }
        return threadCount;
    }
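    // Worked example of the calculation above: with a total thread count of 16 and 4 partitions, each partition
    // gets ceil(16 / 4) = 4 threads. With 16 threads and 5 partitions, each partition gets ceil(16 / 5) = 4, so
    // slightly more threads may be used in total than were requested.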
    /**
     * @return the thread count to use per partition where a user has used an option to explicitly define how many
     * threads should be used by a partition.
     */
    int getUserDefinedThreadCountPerPartition() {
        return getIntOption(Options.WRITE_THREAD_COUNT_PER_PARTITION, 0, 1);
    }
    WriteBatcher newWriteBatcher(DataMovementManager dataMovementManager) {
        // If the user told us how many threads they want per partition (we expect this to be rare), then use that.
        // Otherwise, use the calculated number of threads per partition, based on a total thread count that is
        // either the value the user configured or the default value for that option.
        final int threadCount = getUserDefinedThreadCountPerPartition() > 0 ?
            getUserDefinedThreadCountPerPartition() : getCalculatedThreadCountPerPartition();

        if (Util.MAIN_LOGGER.isDebugEnabled()) {
            Util.MAIN_LOGGER.debug("Creating new batcher with thread count of {} and batch size of {}.", threadCount, batchSize);
        }

        WriteBatcher writeBatcher = dataMovementManager
            .newWriteBatcher()
            .withBatchSize(batchSize)
            .withThreadCount(threadCount)
            .withTemporalCollection(getStringOption(Options.WRITE_TEMPORAL_COLLECTION))
            .onBatchSuccess(this::logBatchOnSuccess);

        Optional<ServerTransform> transform = makeRestTransform();
        if (transform.isPresent()) {
            writeBatcher.withTransform(transform.get());
        }
        return writeBatcher;
    }
    /**
     * @param client
     * @return a {@code GenericDocumentImpl}, which exposes the methods that accept a temporal collection as an input.
     * It has the same configuration as the {@code WriteBatcher} created by this class, thus allowing documents
     * in a failed batch to be retried via this document manager.
     */
    GenericDocumentImpl newDocumentManager(DatabaseClient client) {
        GenericDocumentManager mgr = client.newDocumentManager();
        Optional<ServerTransform> transform = makeRestTransform();
        if (transform.isPresent()) {
            mgr.setWriteTransform(transform.get());
        }
        return (GenericDocumentImpl) mgr;
    }
    DocBuilder newDocBuilder() {
        final String permissions = getStringOption(Options.WRITE_PERMISSIONS);

        DocBuilderFactory factory = new DocBuilderFactory()
            .withCollections(getStringOption(Options.WRITE_COLLECTIONS))
            .withPermissions(permissions)
            .withExtractedTextDocumentType(getStringOption(Options.WRITE_EXTRACTED_TEXT_DOCUMENT_TYPE, "json"))
            .withExtractedTextCollections(getStringOption(Options.WRITE_EXTRACTED_TEXT_COLLECTIONS))
            .withExtractedTextPermissions(getStringOption(Options.WRITE_EXTRACTED_TEXT_PERMISSIONS, permissions))
            .withExtractedTextDropSource(getBooleanOption(Options.WRITE_EXTRACTED_TEXT_DROP_SOURCE, false))
            .withChunkAssembler(ChunkAssemblerFactory.makeChunkAssembler(this));

        if (hasOption(Options.WRITE_URI_TEMPLATE)) {
            configureTemplateUriMaker(factory);
        } else {
            configureStandardUriMaker(factory);
        }

        forEachOptionStartingWith(Options.WRITE_METADATA_VALUES_PREFIX, factory::withMetadataValue);
        forEachOptionStartingWith(Options.WRITE_DOCUMENT_PROPERTIES_PREFIX, factory::withDocumentProperty);
        return factory.newDocBuilder();
    }
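    // The permissions option above is expected to be a comma-delimited list of role,capability pairs,
    // e.g. "rest-reader,read,rest-writer,update". The exact format is defined by the Options documentation
    // rather than by this class, so treat the example as an assumption.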
    /**
     * Convenience for finding and processing dynamic options that start with a certain prefix.
     *
     * @param prefix
     * @param consumer processes the name (the option minus the prefix) and the option value
     */
    private void forEachOptionStartingWith(final String prefix, BiConsumer<String, String> consumer) {
        getProperties().entrySet().stream()
            .filter(entry -> entry.getKey().startsWith(prefix))
            .forEach(entry -> {
                String name = entry.getKey().substring(prefix.length());
                consumer.accept(name, entry.getValue());
            });
    }
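    // For example (assuming the metadata prefix resolves to "spark.marklogic.write.metadataValues."), an option
    // named "spark.marklogic.write.metadataValues.origin" with value "etl-job" would be passed to the consumer
    // as name "origin" and value "etl-job".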
    public Format getDocumentFormat() {
        if (hasOption(Options.WRITE_DOCUMENT_TYPE)) {
            String value = getStringOption(Options.WRITE_DOCUMENT_TYPE);
            Objects.requireNonNull(value);
            try {
                return Format.valueOf(value.toUpperCase());
            } catch (IllegalArgumentException e) {
                String message = "Invalid value for %s: %s; must be one of 'JSON', 'XML', or 'TEXT'.";
                String optionAlias = getOptionNameForMessage(Options.WRITE_DOCUMENT_TYPE);
                if (optionAlias == null) {
                    optionAlias = Options.WRITE_DOCUMENT_TYPE;
                }
                throw new ConnectorException(String.format(message, optionAlias, value));
            }
        }
        return null;
    }
    /**
     * @deprecated since 2.3.0; users should use getDocumentFormat instead.
     */
    @Deprecated(since = "2.3.0")
    // We don't need Sonar to remind us of this deprecation.
    @SuppressWarnings({"java:S1133", "removal"})
    Format getDeprecatedFileRowsDocumentFormat() {
        final String deprecatedOption = Options.WRITE_FILE_ROWS_DOCUMENT_TYPE;
        if (hasOption(deprecatedOption)) {
            String value = getStringOption(deprecatedOption);
            Objects.requireNonNull(value);
            try {
                return Format.valueOf(value.toUpperCase());
            } catch (IllegalArgumentException e) {
                String message = "Invalid value for %s: %s; must be one of 'JSON', 'XML', or 'TEXT'.";
                String optionAlias = getOptionNameForMessage(deprecatedOption);
                throw new ConnectorException(String.format(message, optionAlias, value));
            }
        }
        return null;
    }
    /**
     * The URI template approach will typically be used with rows that have an "arbitrary" schema, where each column
     * value may be useful in constructing a URI.
     *
     * @param factory
     */
    private void configureTemplateUriMaker(DocBuilderFactory factory) {
        String uriTemplate = getProperties().get(Options.WRITE_URI_TEMPLATE);
        String optionAlias = getOptionNameForMessage(Options.WRITE_URI_TEMPLATE);
        factory.withUriMaker(new SparkRowUriMaker(uriTemplate, optionAlias));

        Stream.of(Options.WRITE_URI_PREFIX, Options.WRITE_URI_SUFFIX, Options.WRITE_URI_REPLACE).forEach(option -> {
            String value = getProperties().get(option);
            if (value != null && !value.trim().isEmpty()) {
                Util.MAIN_LOGGER.warn("Option {} will be ignored since option {} was specified.", option, Options.WRITE_URI_TEMPLATE);
            }
        });
    }
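    // Illustrative sketch (the exact template syntax is defined by SparkRowUriMaker, so treat this as an
    // assumption): a template such as "/example/{id}.json" would substitute each row's "id" column value,
    // producing URIs like "/example/42.json". When a template is used, the prefix/suffix/replace options are
    // ignored, as warned about above.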
    /**
     * For rows with an "arbitrary" schema, the URI suffix defaults to ".json" or ".xml" as we know there won't be an
     * initial URI for these rows.
     *
     * @param factory
     */
    private void configureStandardUriMaker(DocBuilderFactory factory) {
        String uriSuffix = null;
        if (hasOption(Options.WRITE_URI_SUFFIX)) {
            uriSuffix = getProperties().get(Options.WRITE_URI_SUFFIX);
        } else if (!isUsingFileSchema() && !DocumentRowSchema.hasDocumentFields(this.schema) && !TripleRowSchema.SCHEMA.equals(this.schema)) {
            String xmlRootName = getStringOption(Options.WRITE_XML_ROOT_NAME);
            if (xmlRootName != null && getStringOption(Options.WRITE_JSON_ROOT_NAME) != null) {
                throw new ConnectorException(String.format("Cannot specify both %s and %s",
                    getOptionNameForMessage(Options.WRITE_JSON_ROOT_NAME), getOptionNameForMessage(Options.WRITE_XML_ROOT_NAME)));
            }
            uriSuffix = xmlRootName != null ? ".xml" : ".json";
        }

        factory.withUriMaker(new StandardUriMaker(
            getProperties().get(Options.WRITE_URI_PREFIX), uriSuffix,
            getProperties().get(Options.WRITE_URI_REPLACE)
        ));
    }
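    // Hedged sketch of the standard URI behavior (the generated portion of the URI is determined by
    // StandardUriMaker, not by this class): with a prefix of "/example/" and a derived suffix of ".json",
    // URIs are expected to look like "/example/<generated-identifier>.json".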
    private Optional<ServerTransform> makeRestTransform() {
        String transformName = getProperties().get(Options.WRITE_TRANSFORM_NAME);
        if (transformName != null && !transformName.trim().isEmpty()) {
            ServerTransform transform = new ServerTransform(transformName);
            String paramsValue = getProperties().get(Options.WRITE_TRANSFORM_PARAMS);
            if (paramsValue != null && !paramsValue.trim().isEmpty()) {
                addRestTransformParams(transform, paramsValue);
            }
            return Optional.of(transform);
        }
        return Optional.empty();
    }
    private void addRestTransformParams(ServerTransform transform, String paramsValue) {
        String delimiterValue = getProperties().get(Options.WRITE_TRANSFORM_PARAMS_DELIMITER);
        String delimiter = delimiterValue != null && !delimiterValue.trim().isEmpty() ? delimiterValue : ",";
        String[] params = paramsValue.split(delimiter);
        if (params.length % 2 != 0) {
            throw new ConnectorException(
                String.format("The %s option must contain an equal number of parameter names and values; received: %s",
                    getOptionNameForMessage(Options.WRITE_TRANSFORM_PARAMS), paramsValue)
            );
        }
        for (int i = 0; i < params.length; i += 2) {
            transform.add(params[i], params[i + 1]);
        }
    }
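    // For example, with the default "," delimiter, a params value of "param1,value1,param2,value2" adds
    // param1=value1 and param2=value2 to the transform, while "param1,value1,param2" fails the even-length
    // check above and raises a ConnectorException.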
    private void logBatchOnSuccess(WriteBatch batch) {
        int documentCount = batch.getItems().length;
        if (documentCount > 0) {
            WriteEvent firstEvent = batch.getItems()[0];
            // If the first event is the item added by DMSDK for the default metadata object, ignore it when showing
            // the count of documents in the batch.
            if (firstEvent.getTargetUri() == null && firstEvent.getMetadata() != null) {
                documentCount--;
            }
        }
        logBatchOnSuccess(documentCount, batch.getJobBatchNumber());
    }

    public void logBatchOnSuccess(int documentCount, long optionalJobBatchNumber) {
        WriteProgressLogger.logProgressIfNecessary(documentCount);
        if (Util.MAIN_LOGGER.isTraceEnabled() && optionalJobBatchNumber > 0) {
            Util.MAIN_LOGGER.trace("Wrote batch; length: {}; job batch number: {}", documentCount, optionalJobBatchNumber);
        }
    }
    boolean isUsingFileSchema() {
        return this.usingFileSchema;
    }

    int getFileSchemaContentPosition() {
        return fileSchemaContentPosition;
    }

    int getFileSchemaPathPosition() {
        return fileSchemaPathPosition;
    }

    public void setNumPartitions(int numPartitions) {
        this.numPartitions = numPartitions;
    }
}