com.marklogic.spark.reader.file.xml.AggregateXmlSplitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of marklogic-spark-connector Show documentation
Spark 3 connector for MarkLogic
There is a newer version: 2.4.2
Show newest version
/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.spark.reader.file.xml;

import com.marklogic.client.datamovement.XMLSplitter;
import com.marklogic.client.io.StringHandle;
import com.marklogic.spark.ConnectorException;
import com.marklogic.spark.Options;
import com.marklogic.spark.reader.file.FileContext;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.unsafe.types.ByteArray;
import org.apache.spark.unsafe.types.UTF8String;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;

/**
 * Knows how to split an aggregate XML document and return a row for each user-defined child element. Each row has
 * a schema matching that of {@code DocumentRowSchema}.
 */
class AggregateXmlSplitter {

    // Shared by all instances; configured exactly once in the static initializer below.
    private static final XMLInputFactory xmlInputFactory;

    static {
        xmlInputFactory = XMLInputFactory.newFactory();
        // The following prevents XXE attacks, per Sonar java:S2755 rule.
        // Note that setting XMLConstants.ACCESS_EXTERNAL_DTD and XMLConstants.ACCESS_EXTERNAL_SCHEMA to empty
        // strings is also suggested by the Sonar S2755 docs and will work fine in this connector project - but it
        // will result in warnings in the Flux application that oddly cause no data to be read. So do not set those
        // to empty strings here. The below config satisfies Sonar in terms of preventing XXE attacks and does not
        // impact functionality.
        xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
    }

    // Yields one handle per aggregate child element found in the input.
    private final Iterator<StringHandle> contentStream;
    private final String identifierForErrors;

    // Optional, both can be null.
    private final String uriElement;
    private final String uriNamespace;

    // 1-based index of the next row to be returned; used in generated URIs and in error messages.
    private int rowCounter = 1;

    /**
     * @param identifierForErrors allows the caller of this class to provide a useful description to be included in
     *                            any errors to help users with debugging.
     * @param inputStream         the stream of aggregate XML data
     * @param fileContext         source of the user-defined options: the aggregate element and namespace, the
     *                            optional URI element and namespace, and the file encoding
     * @throws ConnectorException if the XML stream cannot be opened or split
     */
    AggregateXmlSplitter(String identifierForErrors, InputStream inputStream, FileContext fileContext) {
        this.identifierForErrors = identifierForErrors;
        this.uriElement = fileContext.getStringOption(Options.READ_AGGREGATES_XML_URI_ELEMENT);
        this.uriNamespace = fileContext.getStringOption(Options.READ_AGGREGATES_XML_URI_NAMESPACE);
        final String namespace = fileContext.getStringOption(Options.READ_AGGREGATES_XML_NAMESPACE);
        final String element = fileContext.getStringOption(Options.READ_AGGREGATES_XML_ELEMENT);
        final String encoding = fileContext.getStringOption(Options.READ_FILES_ENCODING);

        // A custom visitor is only needed when the URI must be extracted from within each aggregate element.
        final XMLSplitter<StringHandle> splitter = this.uriElement != null ?
            new XMLSplitter<>(new UriElementExtractingVisitor(namespace, element, uriNamespace, uriElement)) :
            XMLSplitter.makeSplitter(namespace, element);

        try {
            XMLStreamReader reader = xmlInputFactory.createXMLStreamReader(inputStream, encoding);
            this.contentStream = splitter.split(reader).iterator();
        } catch (IOException | XMLStreamException e) {
            throw new ConnectorException(
                String.format("Unable to read XML at %s; cause: %s", this.identifierForErrors, e.getMessage()), e
            );
        }
    }

    /**
     * @return true if another aggregate element is available
     * @throws ConnectorException if the underlying XML cannot be read
     */
    boolean hasNext() {
        try {
            return this.contentStream.hasNext();
        } catch (Exception e) {
            throw newReadException(e);
        }
    }

    /**
     * @param uriPrefix used to construct a URI if no uriElement was specified
     * @return a row corresponding to the {@code DocumentRowSchema}
     * @throws ConnectorException if the next element cannot be read, or if a URI element was configured but is
     *                            missing from the aggregate element
     */
    InternalRow nextRow(String uriPrefix) {
        StringHandle stringHandle;
        try {
            stringHandle = this.contentStream.next();
        } catch (RuntimeException ex) {
            throw newReadException(ex);
        }

        final String initialUri = determineInitialUri(stringHandle, uriPrefix);
        rowCounter++;
        return new GenericInternalRow(new Object[]{
            UTF8String.fromString(initialUri),
            // Encode explicitly as UTF-8 so the document bytes do not depend on the JVM's platform default charset.
            ByteArray.concat(stringHandle.get().getBytes(StandardCharsets.UTF_8)),
            UTF8String.fromString("xml"),
            null, null, null, null, null
        });
    }

    /**
     * Uses the URI captured from the aggregate element when a URI element was configured; otherwise builds a URI
     * from the given prefix and the current row number.
     */
    private String determineInitialUri(StringHandle stringHandle, String uriPrefix) {
        if (stringHandle instanceof StringHandleWithUriValue) {
            String uriValue = ((StringHandleWithUriValue) stringHandle).getUriValue();
            if (uriValue == null) {
                String message = String.format("No occurrence of URI element '%s' found in aggregate element %d in %s",
                    this.uriElement, rowCounter, this.identifierForErrors);
                throw new ConnectorException(message);
            }
            return uriValue;
        }
        return String.format("%s-%d.xml", uriPrefix, rowCounter);
    }

    /**
     * Builds the exception thrown when iterating over the split content fails, preserving the original cause.
     */
    private ConnectorException newReadException(Exception cause) {
        String message = String.format("Unable to read XML from %s; cause: %s",
            this.identifierForErrors, cause.getMessage());
        return new ConnectorException(message, cause);
    }
}