// com.marklogic.spark.writer.file.ContentWriter Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of marklogic-spark-connector Show documentation
// Spark 3 connector for MarkLogic
// There is a newer version: 2.4.2
// Show newest version
/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.spark.writer.file;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.marklogic.client.document.GenericDocumentManager;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.InputStreamHandle;
import com.marklogic.spark.ConnectorException;
import com.marklogic.spark.ContextSupport;
import com.marklogic.spark.Options;
import com.marklogic.spark.Util;
import com.marklogic.spark.reader.document.DocumentRowSchema;
import org.apache.commons.io.IOUtils;
import org.apache.spark.sql.catalyst.InternalRow;

import javax.xml.XMLConstants;
import javax.xml.transform.*;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Map;

/**
 * Knows how to write the value in the "content" column of a row conforming to our {@code DocumentRowSchema}. Supports
 * pretty-printing as well. This keeps an instance of a JAXP Transformer, which is safe for one thread to use
 * multiple times.
 * <p>
 * Content bytes read from a row are treated as UTF-8. When the user specifies an output encoding via
 * {@code Options.WRITE_FILES_ENCODING}, text is converted to that encoding before being written; otherwise UTF-8 is
 * used explicitly so that output never depends on the JVM's platform default charset.
 */
class ContentWriter {

    private final Transformer transformer;
    private final ObjectMapper objectMapper;
    private final boolean prettyPrint;
    // Null when the user did not specify an output encoding.
    private final Charset encoding;

    private final boolean isStreamingFiles;
    // Only used when streaming.
    private final GenericDocumentManager documentManager;

    /**
     * @param properties the connector options; used to determine the output encoding, whether to pretty-print, and
     *                   whether documents are being streamed
     * @throws ConnectorException if the configured encoding is not supported, or if pretty-printing is enabled and
     *                            the XML transformer cannot be created
     */
    ContentWriter(Map properties) {
        ContextSupport context = new ContextSupport(properties);
        this.encoding = determineEncoding(context);
        this.prettyPrint = "true".equalsIgnoreCase(context.getStringOption(Options.WRITE_FILES_PRETTY_PRINT));
        if (this.prettyPrint) {
            this.objectMapper = new ObjectMapper();
            // Must run after this.encoding is assigned, as the transformer's output encoding depends on it.
            this.transformer = newTransformer();
        } else {
            this.transformer = null;
            this.objectMapper = null;
        }

        this.isStreamingFiles = context.isStreamingFiles();
        if (this.isStreamingFiles) {
            this.documentManager = context.connectToMarkLogic().newDocumentManager();
            if (context.hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
                this.documentManager.setMetadataCategories(Util.getRequestedMetadata(context));
            }
        } else {
            this.documentManager = null;
        }
    }

    /**
     * Writes the content for the given row to the output stream. When streaming, the document is read using the
     * URI in column 0 and copied to the stream; otherwise the bytes in column 1 are written, optionally
     * pretty-printed and/or converted to the user's encoding.
     *
     * @param row          the row whose content should be written
     * @param outputStream the stream to write to; it is not closed by this method
     * @throws IOException if writing to the stream fails
     */
    void writeContent(InternalRow row, OutputStream outputStream) throws IOException {
        if (this.isStreamingFiles) {
            streamDocumentToFile(row, outputStream);
        } else if (this.prettyPrint) {
            prettyPrintContent(row, outputStream);
        } else {
            writeUtf8Content(row.getBinary(1), outputStream);
        }
    }

    /**
     * Writes the document metadata built from the given row to the output stream.
     *
     * @param row          the row to build metadata from
     * @param outputStream the stream to write to; it is not closed by this method
     * @throws IOException if writing to the stream fails
     */
    void writeMetadata(InternalRow row, OutputStream outputStream) throws IOException {
        String metadataXml = DocumentRowSchema.makeDocumentMetadata(row).toString();
        writeMetadata(metadataXml, outputStream);
    }

    /**
     * When streaming documents to an archive, the metadata unfortunately has to be retrieved in a separate request
     * per document. This is due to the Java Client hardcoding "content" as a category in a POST to v1/search. A
     * future fix to the Java Client to not hardcode this will allow for the metadata to be retrieved during the
     * reader phase.
     *
     * @param documentUri  the URI of the document whose metadata should be read
     * @param outputStream the stream to write to; it is not closed by this method
     * @throws IOException if writing to the stream fails
     */
    void writeMetadataWhileStreaming(String documentUri, OutputStream outputStream) throws IOException {
        DocumentMetadataHandle metadata = this.documentManager.readMetadata(documentUri, new DocumentMetadataHandle());
        writeMetadata(metadata.toString(), outputStream);
    }

    private void writeMetadata(String metadataXml, OutputStream outputStream) throws IOException {
        // Must honor the encoding here as well, as a user could easily have values that require encoding in metadata
        // values or in a properties fragment.
        outputStream.write(encodeText(metadataXml));
    }

    /**
     * Encodes text using the user's encoding if one was specified, else UTF-8. An explicit charset is always used
     * so that the result does not vary with the platform default charset.
     */
    private byte[] encodeText(String text) {
        return text.getBytes(this.encoding != null ? this.encoding : StandardCharsets.UTF_8);
    }

    /**
     * Writes bytes that are known to be UTF-8. If the user specified an encoding, the bytes are decoded as UTF-8 and
     * re-encoded; otherwise they are written as-is.
     */
    private void writeUtf8Content(byte[] utf8Content, OutputStream outputStream) throws IOException {
        if (this.encoding != null) {
            // We know the string from MarkLogic is UTF-8, so we decode it as UTF-8 and use getBytes to convert it
            // to the user's specified encoding (as opposed to new String(bytes, encoding)).
            outputStream.write(new String(utf8Content, StandardCharsets.UTF_8).getBytes(this.encoding));
        } else {
            outputStream.write(utf8Content);
        }
    }

    /**
     * @return the charset named by {@code Options.WRITE_FILES_ENCODING}, or null if that option is not set
     * @throws ConnectorException if the option value does not name a supported charset
     */
    private Charset determineEncoding(ContextSupport context) {
        if (context.hasOption(Options.WRITE_FILES_ENCODING)) {
            String encodingValue = context.getStringOption(Options.WRITE_FILES_ENCODING);
            try {
                return Charset.forName(encodingValue);
            } catch (IllegalArgumentException ex) {
                // Covers a null name as well as IllegalCharsetNameException and UnsupportedCharsetException, both
                // of which extend IllegalArgumentException.
                throw new ConnectorException(String.format("Unsupported encoding value: %s", encodingValue), ex);
            }
        }
        return null;
    }

    /**
     * Builds the transformer used for pretty-printing XML. Output is indented, omits the XML declaration, and uses
     * the user's encoding if specified, else UTF-8.
     */
    private Transformer newTransformer() {
        try {
            TransformerFactory factory = TransformerFactory.newInstance();
            // Disables certain features as recommended by Sonar to prevent security vulnerabilities.
            // Also see https://stackoverflow.com/questions/32178558/how-to-prevent-xml-external-entity-injection-on-transformerfactory .
            factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
            factory.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, "");
            factory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
            final Transformer t = factory.newTransformer();
            if (this.encoding != null) {
                t.setOutputProperty(OutputKeys.ENCODING, this.encoding.name());
            } else {
                t.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            }
            t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
            t.setOutputProperty(OutputKeys.INDENT, "yes");
            return t;
        } catch (TransformerConfigurationException e) {
            throw new ConnectorException(
                String.format("Unable to instantiate transformer for pretty-printing XML; cause: %s", e.getMessage()), e
            );
        }
    }

    /**
     * Pretty-prints the content in column 1 based on the format in column 2. Content whose format is neither JSON
     * nor XML (or is null) is written without pretty-printing, though the user's encoding is still honored.
     */
    private void prettyPrintContent(InternalRow row, OutputStream outputStream) throws IOException {
        final byte[] content = row.getBinary(1);
        final String format = row.isNullAt(2) ? null : row.getString(2);
        if ("JSON".equalsIgnoreCase(format)) {
            prettyPrintJson(content, outputStream);
        } else if ("XML".equalsIgnoreCase(format)) {
            prettyPrintXml(content, outputStream);
        } else {
            writeUtf8Content(content, outputStream);
        }
    }

    private void prettyPrintJson(byte[] content, OutputStream outputStream) throws IOException {
        JsonNode node = this.objectMapper.readTree(content);
        outputStream.write(encodeText(node.toPrettyString()));
    }

    private void prettyPrintXml(byte[] content, OutputStream outputStream) {
        Result result = new StreamResult(outputStream);
        Source source = new StreamSource(new ByteArrayInputStream(content));
        try {
            // The transformer was configured with the output encoding when it was created.
            this.transformer.transform(source, result);
        } catch (TransformerException e) {
            throw new ConnectorException(String.format("Unable to pretty print XML; cause: %s", e.getMessage()), e);
        }
    }

    /**
     * Reads the document identified by the URI in column 0 and copies it to the output stream. The stream obtained
     * from the document manager is closed once the copy finishes or fails; the output stream is left open.
     */
    private void streamDocumentToFile(InternalRow row, OutputStream outputStream) throws IOException {
        String uri = row.getString(0);
        try (InputStream inputStream = documentManager.read(uri, new InputStreamHandle()).get()) {
            // commons-io is a dependency of Spark and a common utility for copying between two streams.
            IOUtils.copy(inputStream, outputStream);
        }
    }
}