All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.marklogic.spark.writer.ArbitraryRowConverter Maven / Gradle / Ivy

There is a newer version: 2.4.2
Show newest version
/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.spark.writer;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.dataformat.xml.XmlMapper;
import com.marklogic.client.io.Format;
import com.marklogic.client.io.JacksonHandle;
import com.marklogic.client.io.StringHandle;
import com.marklogic.client.io.marker.AbstractWriteHandle;
import com.marklogic.spark.ConnectorException;
import com.marklogic.spark.JsonRowSerializer;
import com.marklogic.spark.Options;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import java.util.stream.Stream;

/**
 * Handles building a document from an "arbitrary" row - i.e. one with an unknown schema, where the row will be
 * serialized by Spark to a JSON object.
 */
class ArbitraryRowConverter implements RowConverter {

    private static final String MARKLOGIC_SPARK_FILE_PATH_COLUMN_NAME = "marklogic_spark_file_path";

    private final ObjectMapper objectMapper;
    private final XmlMapper xmlMapper;
    private final JsonRowSerializer jsonRowSerializer;
    private final String uriTemplate;
    private final String jsonRootName;
    private final String xmlRootName;
    private final String xmlNamespace;
    private final int filePathIndex;

    ArbitraryRowConverter(WriteContext writeContext) {
        this.filePathIndex = determineFilePathIndex(writeContext.getSchema());
        this.uriTemplate = writeContext.getStringOption(Options.WRITE_URI_TEMPLATE);
        this.jsonRootName = writeContext.getStringOption(Options.WRITE_JSON_ROOT_NAME);
        this.xmlRootName = writeContext.getStringOption(Options.WRITE_XML_ROOT_NAME);
        this.xmlNamespace = writeContext.getStringOption(Options.WRITE_XML_NAMESPACE);
        this.objectMapper = new ObjectMapper();
        this.xmlMapper = this.xmlRootName != null ? new XmlMapper() : null;
        this.jsonRowSerializer = new JsonRowSerializer(writeContext.getSchema(), writeContext.getProperties());
    }

    @Override
    public Iterator convertRow(InternalRow row) {
        String initialUri = null;
        if (this.filePathIndex > -1) {
            initialUri = row.getString(this.filePathIndex) + "/" + UUID.randomUUID();
            row.setNullAt(this.filePathIndex);
        }

        final String json = this.jsonRowSerializer.serializeRowToJson(row);

        AbstractWriteHandle contentHandle = null;
        ObjectNode deserializedJson = null;
        ObjectNode uriTemplateValues = null;
        final boolean mustRemoveFilePathField = this.filePathIndex > 1 && jsonRowSerializer.isIncludeNullFields();

        if (this.jsonRootName != null || this.xmlRootName != null || this.uriTemplate != null || mustRemoveFilePathField) {
            deserializedJson = readTree(json);
            if (mustRemoveFilePathField) {
                deserializedJson.remove(MARKLOGIC_SPARK_FILE_PATH_COLUMN_NAME);
            }
        }

        if (this.uriTemplate != null) {
            uriTemplateValues = deserializedJson;
        }

        if (this.jsonRootName != null) {
            ObjectNode jsonObjectWithRootName = objectMapper.createObjectNode();
            jsonObjectWithRootName.set(jsonRootName, deserializedJson);
            contentHandle = new JacksonHandle(jsonObjectWithRootName);
            if (this.uriTemplate != null) {
                uriTemplateValues = jsonObjectWithRootName;
            }
        }

        if (contentHandle == null) {
            // If the user wants XML, then we've definitely deserialized the JSON and removed the file path if
            // needed. So use that JsonNode to produce an XML string.
            if (xmlRootName != null) {
                contentHandle = new StringHandle(convertJsonToXml(deserializedJson)).withFormat(Format.XML);
            }
            // If we've already gone to the effort of creating deserializedJson, use it for the content.
            else if (deserializedJson != null) {
                contentHandle = new JacksonHandle(deserializedJson);
            } else {
                // Simplest scenario where we never have a reason to incur the expense of deserializing the JSON string,
                // so we can just use StringHandle.
                contentHandle = new StringHandle(json).withFormat(Format.JSON);
            }
        }

        return Stream.of(new DocBuilder.DocumentInputs(initialUri, contentHandle, uriTemplateValues, null)).iterator();
    }

    @Override
    public List getRemainingDocumentInputs() {
        return new ArrayList<>();
    }

    /**
     * A Spark user can add a column via:
     * withColumn("marklogic_spark_file_path", new Column("_metadata.file_path"))
     * 

* This allows access to the file path when using a Spark data source - e.g. CSV, Parquet - to read a file. * The column will be used to generate an initial URI for the corresponding document, and the column will then * be removed after that so that it's not included in the document. * * @return */ private int determineFilePathIndex(StructType schema) { StructField[] fields = schema.fields(); for (int i = 0; i < fields.length; i++) { if (MARKLOGIC_SPARK_FILE_PATH_COLUMN_NAME.equals(fields[i].name())) { return i; } } return -1; } private ObjectNode readTree(String json) { // We don't ever expect this to fail, as the JSON is produced by Spark's JacksonGenerator and should always // be valid JSON. But Jackson throws a checked exception, so gotta handle it. try { return (ObjectNode) objectMapper.readTree(json); } catch (JsonProcessingException e) { throw new ConnectorException(String.format("Unable to read JSON row: %s", e.getMessage()), e); } } /** * jackson-xml-mapper unfortunately does not yet support a root namespace. Nor does it allow for the root element * to be omitted. So we always end up with "ObjectNode" as a root element. See * https://github.com/FasterXML/jackson-dataformat-xml/issues/541 for more information. So this method does some * work to replace that root element with one based on user inputs. * * @param doc * @return */ private String convertJsonToXml(JsonNode doc) { try { String xml = xmlMapper.writer().writeValueAsString(doc); String startTag = this.xmlNamespace != null ? String.format("<%s xmlns='%s'>", this.xmlRootName, this.xmlNamespace) : String.format("<%s>", this.xmlRootName); return new StringBuilder(startTag) .append(xml.substring("".length(), xml.length() - "".length())) .append(String.format("", this.xmlRootName)) .toString(); } catch (JsonProcessingException e) { // We don't expect this occur; Jackson should be able to convert any JSON object that it created into // a valid XML document. throw new ConnectorException(String.format("Unable to convert JSON to XML for doc: %s", doc), e); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy