com.marklogic.spark.JsonRowSerializer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of marklogic-spark-connector Show documentation
Spark 3 connector for MarkLogic
There is a newer version: 2.4.2
Show newest version
/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.spark;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.json.JSONOptions;
import org.apache.spark.sql.catalyst.json.JacksonGenerator;
import org.apache.spark.sql.types.StructType;
import scala.Predef;
import scala.collection.JavaConverters;

import java.io.StringWriter;
import java.util.HashMap;
import java.util.Map;

/**
 * Handles serializing a Spark row into a JSON string. Includes support for all the options defined in Spark's
 * JSONOptions.scala class.
 * <p>
 * Options are taken from the connector properties whose keys start with
 * {@code Options.WRITE_JSON_SERIALIZATION_OPTION_PREFIX}; the prefix is stripped before the option is handed
 * to Spark. Unless the user overrides it, "ignoreNullFields" is set to "false" so that null fields are written.
 */
public class JsonRowSerializer {

    private static final String IGNORE_NULL_FIELDS_OPTION = "ignoreNullFields";

    private final StructType schema;
    private final JSONOptions jsonOptions;
    private final boolean includeNullFields;

    /**
     * @param schema              the schema of the rows that will be passed to {@link #serializeRowToJson(InternalRow)}
     * @param connectorProperties the connector properties; only entries whose key starts with
     *                            {@code Options.WRITE_JSON_SERIALIZATION_OPTION_PREFIX} are used. May be null, in
     *                            which case only the defaults apply.
     */
    public JsonRowSerializer(StructType schema, Map<String, String> connectorProperties) {
        this.schema = schema;

        final Map<String, String> options = buildOptionsForJsonOptions(connectorProperties);
        this.includeNullFields = "false".equalsIgnoreCase(options.get(IGNORE_NULL_FIELDS_OPTION));

        this.jsonOptions = new JSONOptions(
            // Funky code to convert a Java map into a Scala immutable Map.
            JavaConverters.mapAsScalaMapConverter(options).asScala().toMap(Predef.$conforms()),

            // As verified via tests, this default timezone ID is overridden by a user via
            // the spark.sql.session.timeZone option.
            "Z",

            // We don't expect corrupted records - i.e. corrupted values - to be present in the index. But Spark
            // requires this to be set. See
            // https://medium.com/@sasidharan-r/how-to-handle-corrupt-or-bad-record-in-apache-spark-custom-logic-pyspark-aws-430ddec9bb41
            // for more information.
            "_corrupt_record"
        );
    }

    /**
     * Serializes the given row to JSON using this serializer's schema and JSON options.
     *
     * @param row the row to serialize
     * @return the JSON text written by Spark's JacksonGenerator for the row
     */
    public String serializeRowToJson(InternalRow row) {
        StringWriter writer = new StringWriter();
        JacksonGenerator jacksonGenerator = new JacksonGenerator(this.schema, writer, this.jsonOptions);
        try {
            jacksonGenerator.write(row);
            // Flush before reading the writer so that all buffered content is present in the result.
            jacksonGenerator.flush();
            return writer.toString();
        } finally {
            // A generator is created per row, so close it to release the underlying Jackson generator.
            jacksonGenerator.close();
        }
    }

    /**
     * A user can specify any of the options found in the JSONOptions.scala class - though it's not yet clear where
     * a user finds out about these except via the Spark source code. "ignoreNullFields" however is expected to be the
     * primary one that is configured.
     *
     * @param connectorProperties the connector properties; may be null
     * @return a new mutable map containing the "ignoreNullFields" default plus every prefixed property, keyed by
     * the property name with the prefix removed
     */
    private Map<String, String> buildOptionsForJsonOptions(Map<String, String> connectorProperties) {
        Map<String, String> options = new HashMap<>();
        // Default to include null fields, as they are easily queried in MarkLogic.
        options.put(IGNORE_NULL_FIELDS_OPTION, "false");
        if (connectorProperties == null) {
            return options;
        }
        connectorProperties.forEach((key, value) -> {
            if (key.startsWith(Options.WRITE_JSON_SERIALIZATION_OPTION_PREFIX)) {
                String optionName = key.substring(Options.WRITE_JSON_SERIALIZATION_OPTION_PREFIX.length());
                options.put(optionName, value);
            }
        });
        return options;
    }

    /**
     * @return the Spark JSON options built from the connector properties
     */
    public JSONOptions getJsonOptions() {
        return jsonOptions;
    }

    /**
     * @return true if the effective "ignoreNullFields" option is "false" (compared case-insensitively)
     */
    public boolean isIncludeNullFields() {
        return this.includeNullFields;
    }
}