/*
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
*/
package com.marklogic.spark;
import com.marklogic.client.io.StringHandle;
import com.marklogic.client.row.RawQueryDSLPlan;
import com.marklogic.client.row.RowManager;
import com.marklogic.spark.reader.document.DocumentRowSchema;
import com.marklogic.spark.reader.document.DocumentTable;
import com.marklogic.spark.reader.file.TripleRowSchema;
import com.marklogic.spark.reader.optic.SchemaInferrer;
import com.marklogic.spark.writer.WriteContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.connector.catalog.Table;
import org.apache.spark.sql.connector.catalog.TableProvider;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.sources.DataSourceRegister;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.collection.JavaConverters;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
* The name "DefaultSource" is used here so that this connector can be loaded using the Spark V2 approach, where the
* user specifies a package name and the class name is assumed to be "DefaultSource".
*/
public class DefaultSource implements TableProvider, DataSourceRegister {
private static final Logger logger = LoggerFactory.getLogger("com.marklogic.spark");
@Override
public String shortName() {
// Allows for "marklogic" to be used instead of "com.marklogic.spark".
return "marklogic";
}
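// A hedged usage sketch: either the full package name or the short name registered above can be passed
// to format(). The option names follow the connector's "spark.marklogic.*" convention (see Options);
// the connection URI, port, and query values are illustrative only.
//
//   Dataset<Row> rows = sparkSession.read()
//       .format("marklogic") // or .format("com.marklogic.spark")
//       .option("spark.marklogic.client.uri", "user:password@localhost:8003")
//       .option("spark.marklogic.read.opticQuery", "op.fromView('example', 'persons')")
//       .load();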
/**
 * If no schema is provided when reading data, Spark invokes this method before invoking getTable.
 *
 * This is not invoked during a write operation, as the schema of the Dataset being written is used
 * instead.
 */
@Override
public StructType inferSchema(CaseInsensitiveStringMap options) {
final Map<String, String> properties = options.asCaseSensitiveMap();
if (isFileOperation(properties)) {
final String type = properties.get(Options.READ_FILES_TYPE);
return "rdf".equalsIgnoreCase(type) ? TripleRowSchema.SCHEMA : DocumentRowSchema.SCHEMA;
}
if (isReadDocumentsOperation(properties)) {
return DocumentRowSchema.SCHEMA;
} else if (isReadTriplesOperation(properties)) {
return TripleRowSchema.SCHEMA;
} else if (Util.isReadWithCustomCodeOperation(properties)) {
return new StructType().add("URI", DataTypes.StringType);
}
return inferSchemaFromOpticQuery(properties);
}
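// Hedged sketch of bypassing inference: if the caller supplies a schema explicitly, Spark skips
// inferSchema and passes that schema to getTable (see supportsExternalMetadata below). The column
// names and query shown here are illustrative only.
//
//   StructType mySchema = new StructType()
//       .add("personId", DataTypes.IntegerType)
//       .add("lastName", DataTypes.StringType);
//   Dataset<Row> rows = sparkSession.read()
//       .format("marklogic")
//       .schema(mySchema)
//       .option("spark.marklogic.read.opticQuery", "op.fromView('example', 'persons')")
//       .load();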
@Override
public Table getTable(StructType schema, Transform[] partitioning, Map<String, String> properties) {
if (isFileOperation(properties)) {
// Not yet supporting progress logging for file operations.
return new MarkLogicFileTable(SparkSession.active(),
new CaseInsensitiveStringMap(properties),
JavaConverters.asScalaBuffer(getPaths(properties)), schema
);
}
final ContextSupport tempContext = new ContextSupport(properties);
// The appropriate progress logger is reset here so that when the connector is used repeatedly in an
// environment like PySpark, the counts start with zero on each new Spark job.
final long readProgressInterval = tempContext.getNumericOption(Options.READ_LOG_PROGRESS, 0, 0);
if (isReadDocumentsOperation(properties)) {
ReadProgressLogger.initialize(readProgressInterval, "Documents read: {}");
return new DocumentTable(DocumentRowSchema.SCHEMA);
} else if (isReadTriplesOperation(properties)) {
ReadProgressLogger.initialize(readProgressInterval, "Triples read: {}");
return new DocumentTable(TripleRowSchema.SCHEMA);
} else if (properties.get(Options.READ_OPTIC_QUERY) != null) {
ReadProgressLogger.initialize(readProgressInterval, "Rows read: {}");
return new MarkLogicTable(schema, properties);
} else if (Util.isReadWithCustomCodeOperation(properties)) {
ReadProgressLogger.initialize(readProgressInterval, "Items read: {}");
return new MarkLogicTable(schema, properties);
}
final long writeProgressInterval = tempContext.getNumericOption(Options.WRITE_LOG_PROGRESS, 0, 0);
String message = Util.isWriteWithCustomCodeOperation(properties) ? "Items processed: {}" : "Documents written: {}";
WriteProgressLogger.initialize(writeProgressInterval, message);
return new MarkLogicTable(new WriteContext(schema, properties));
}
/**
 * Per https://spark.apache.org/docs/3.2.4/api/java/org/apache/spark/sql/connector/catalog/TableProvider.html#supportsExternalMetadata-- ,
 * this returns true so that a user can provide their own schema instead of relying on schema inference.
 *
 * @return true, as a user-supplied schema is supported
 */
@Override
public boolean supportsExternalMetadata() {
return true;
}
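// A file operation is indicated by Spark's standard "path"/"paths" options, which Spark typically
// populates from the path arguments given to load() or save().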
private boolean isFileOperation(Map<String, String> properties) {
return properties.containsKey("path") || properties.containsKey("paths");
}
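// A "read documents" operation is indicated by any of the document-query options below; such reads
// produce rows conforming to DocumentRowSchema.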
private boolean isReadDocumentsOperation(Map<String, String> properties) {
return properties.containsKey(Options.READ_DOCUMENTS_QUERY) ||
properties.containsKey(Options.READ_DOCUMENTS_STRING_QUERY) ||
properties.containsKey(Options.READ_DOCUMENTS_COLLECTIONS) ||
properties.containsKey(Options.READ_DOCUMENTS_DIRECTORY) ||
properties.containsKey(Options.READ_DOCUMENTS_OPTIONS) ||
properties.containsKey(Options.READ_DOCUMENTS_URIS);
}
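// A "read triples" operation is indicated by any of the triples options below; such reads produce rows
// conforming to TripleRowSchema.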
private boolean isReadTriplesOperation(Map<String, String> properties) {
return Util.hasOption(properties,
Options.READ_TRIPLES_GRAPHS,
Options.READ_TRIPLES_COLLECTIONS,
Options.READ_TRIPLES_QUERY,
Options.READ_TRIPLES_STRING_QUERY,
Options.READ_TRIPLES_URIS,
Options.READ_TRIPLES_DIRECTORY
);
}
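// Connects to MarkLogic, submits the user's Optic DSL query to retrieve column info, and maps the
// returned column metadata into a Spark StructType.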
private StructType inferSchemaFromOpticQuery(Map<String, String> caseSensitiveOptions) {
final String query = caseSensitiveOptions.get(Options.READ_OPTIC_QUERY);
if (query == null || query.trim().isEmpty()) {
throw new ConnectorException(Util.getOptionNameForErrorMessage("spark.marklogic.read.noOpticQuery"));
}
RowManager rowManager = new ContextSupport(caseSensitiveOptions).connectToMarkLogic().newRowManager();
RawQueryDSLPlan dslPlan = rowManager.newRawQueryDSLPlan(new StringHandle(query));
try {
// The columnInfo call is what requires a minimum MarkLogic version of 10.0-9.
StringHandle columnInfoHandle = rowManager.columnInfo(dslPlan, new StringHandle());
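// The column info response is expected to describe each column returned by the plan (name and
// MarkLogic type); SchemaInferrer maps those types to Spark types. The exact payload format is an
// implementation detail of the MarkLogic rows API.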
StructType schema = SchemaInferrer.inferSchema(columnInfoHandle.get());
if (logger.isDebugEnabled()) {
logger.debug("Inferred schema from Optic columnInfo: {}", schema);
}
return schema;
} catch (Exception ex) {
throw new ConnectorException(String.format("Unable to run Optic query %s; cause: %s", query, ex.getMessage()), ex);
}
}
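// Spark passes a single "path" option when one path is given; multiple paths arrive via the "paths"
// option, which Util.parsePaths is assumed here to decode.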
private List<String> getPaths(Map<String, String> properties) {
return properties.containsKey("path") ?
Arrays.asList(properties.get("path")) :
Util.parsePaths(properties.get("paths"));
}
}