com.marklogic.spark.reader.document.ForestReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of marklogic-spark-connector Show documentation
Show all versions of marklogic-spark-connector Show documentation
Spark 3 connector for MarkLogic
/*
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
*/
package com.marklogic.spark.reader.document;
import com.marklogic.client.DatabaseClient;
import com.marklogic.client.document.DocumentManager;
import com.marklogic.client.document.DocumentPage;
import com.marklogic.client.document.DocumentRecord;
import com.marklogic.client.document.GenericDocumentManager;
import com.marklogic.client.io.BytesHandle;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.Format;
import com.marklogic.client.query.QueryDefinition;
import com.marklogic.client.query.SearchQueryDefinition;
import com.marklogic.client.query.StructuredQueryBuilder;
import com.marklogic.spark.Options;
import com.marklogic.spark.ReadProgressLogger;
import com.marklogic.spark.Util;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.read.PartitionReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Set;
/**
* This uses the same technique as QueryBatcher in getting back an ordered list of URIs without having to paginate.
* It does involve 2x calls for each batch - one to get the URIs, and then one to get the documents for those URIs.
* Will performance test this later to determine if just using documentManager.search with pagination is generally
* faster. That's just 1 call, but it incurs the cost of finding page N.
*/
class ForestReader implements PartitionReader {
private static final Logger logger = LoggerFactory.getLogger(ForestReader.class);
private final UriBatcher uriBatcher;
private final GenericDocumentManager documentManager;
private final StructuredQueryBuilder queryBuilder;
private final Set requestedMetadata;
private final boolean contentWasRequested;
private final Integer limit;
private final boolean isStreamingFiles;
// Only used for logging.
private final ForestPartition forestPartition;
private long startTime;
private DocumentPage currentDocumentPage;
// Used for logging and for ensuring a non-null limit is not exceeded.
private int docCount;
ForestReader(ForestPartition forestPartition, DocumentContext context) {
this.forestPartition = forestPartition;
this.limit = context.getLimit();
this.isStreamingFiles = context.isStreamingFiles();
DatabaseClient client = context.isDirectConnection() ?
context.connectToMarkLogic(forestPartition.getHost()) :
context.connectToMarkLogic();
if (logger.isDebugEnabled()) {
logger.debug("Will read from host {} for partition: {}", client.getHost(), forestPartition);
}
SearchQueryDefinition query = context.buildSearchQuery(client);
boolean filtered = false;
if (context.hasOption(Options.READ_DOCUMENTS_FILTERED)) {
filtered = Boolean.parseBoolean(context.getProperties().get(Options.READ_DOCUMENTS_FILTERED));
}
this.uriBatcher = new UriBatcher(client, query, forestPartition, context.getBatchSize(), filtered);
this.documentManager = client.newDocumentManager();
this.documentManager.setReadTransform(query.getResponseTransform());
this.contentWasRequested = context.contentWasRequested();
this.requestedMetadata = Util.getRequestedMetadata(context);
this.documentManager.setMetadataCategories(this.requestedMetadata);
this.queryBuilder = client.newQueryManager().newStructuredQueryBuilder();
}
@Override
public boolean next() {
if (startTime == 0) {
startTime = System.currentTimeMillis();
}
if (limit != null && docCount >= limit) {
// No logging here as this block may never be hit, depending on whether Spark first detects that the limit
// has been reached.
return false;
}
if (currentDocumentPage == null || !currentDocumentPage.hasNext()) {
closeCurrentDocumentPage();
List uris = getNextBatchOfUris();
if (uris.isEmpty()) {
if (logger.isDebugEnabled()) {
long duration = System.currentTimeMillis() - startTime;
logger.debug("Read {} documents from partition {} in {}ms", docCount, forestPartition, duration);
}
return false;
}
// When streaming, we don't want to retrieve the documents yet - they'll be retrieved in the writer phase.
this.currentDocumentPage = this.isStreamingFiles ? new UrisPage(uris.iterator()) : readPage(uris);
}
return currentDocumentPage.hasNext();
}
@Override
public InternalRow get() {
DocumentRecord document = this.currentDocumentPage.next();
DocumentRowBuilder builder = new DocumentRowBuilder(requestedMetadata).withUri(document.getUri());
if (this.contentWasRequested) {
builder.withContent(document.getContent(new BytesHandle()).get());
builder.withFormat(document.getFormat() != null ? document.getFormat().toString() : Format.UNKNOWN.toString());
}
if (!requestedMetadata.isEmpty()) {
builder.withMetadata(document.getMetadata(new DocumentMetadataHandle()));
}
docCount++;
return builder.buildRow();
}
@Override
public void close() {
closeCurrentDocumentPage();
}
private List getNextBatchOfUris() {
long start = System.currentTimeMillis();
List uris = uriBatcher.nextBatchOfUris();
if (logger.isTraceEnabled()) {
logger.trace("Retrieved {} URIs in {}ms from partition {}", uris.size(),
(System.currentTimeMillis() - start), this.forestPartition);
}
return uris;
}
private DocumentPage readPage(List uris) {
long start = System.currentTimeMillis();
String[] uriArray = uris.toArray(new String[]{});
QueryDefinition queryDefinition = this.queryBuilder.document(uriArray);
this.documentManager.setPageLength(uriArray.length);
// Must do a search so that a POST is sent instead of a GET. A GET can fail with a Request-URI error if too
// many URIs are included in the querystring. However, content is always retrieved with a search, so there's
// some inefficiency if the caller only wants metadata and no content.
DocumentPage page = this.documentManager.search(queryDefinition, 0);
if (logger.isTraceEnabled()) {
logger.trace("Retrieved page of documents in {}ms from partition {}", (System.currentTimeMillis() - start), this.forestPartition);
}
ReadProgressLogger.logProgressIfNecessary(page.getPageSize());
return page;
}
private void closeCurrentDocumentPage() {
if (currentDocumentPage != null) {
currentDocumentPage.close();
currentDocumentPage = null;
}
}
}