
com.marklogic.spark.reader.optic.OpticMicroBatchStream Maven / Gradle / Ivy

/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.spark.reader.optic;

import com.marklogic.spark.Util;
import org.apache.spark.sql.connector.read.InputPartition;
import org.apache.spark.sql.connector.read.PartitionReaderFactory;
import org.apache.spark.sql.connector.read.streaming.MicroBatchStream;
import org.apache.spark.sql.connector.read.streaming.Offset;
import org.apache.spark.sql.execution.streaming.LongOffset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;

/**
 * Interprets a "micro batch" as a bucket. This gives the user control over how many micro batches will be created, as
 * the user can adjust the number of partitions and the batch size to affect how many buckets are created.
 * <p>
 * Within the scope of this class, an offset is equivalent to an index in the list of all buckets across all partitions
 * present in the {@code PlanAnalysis}. Each bucket is defined by lower/upper row ID bounds. So to refer to a bucket,
 * we simply need to know the index of the bucket in the list of all buckets. And thus, an offset is simply the index
 * of a bucket in that list.
 */
class OpticMicroBatchStream implements MicroBatchStream {

    private static final Logger logger = LoggerFactory.getLogger(OpticMicroBatchStream.class);

    private OpticReadContext opticReadContext;
    private List<PlanAnalysis.Bucket> allBuckets;
    private int bucketIndex;

    OpticMicroBatchStream(OpticReadContext opticReadContext) {
        this.opticReadContext = opticReadContext;
        this.allBuckets = this.opticReadContext.getPlanAnalysis().getAllBuckets();
    }

    @Override
    public Offset latestOffset() {
        if (bucketIndex >= this.allBuckets.size()) {
            return null;
        }
        if (logger.isTraceEnabled()) {
            logger.trace("Returning latest offset: {}", bucketIndex);
        }
        return new LongOffset(bucketIndex++);
    }

    /**
     * The offset is treated as the index of a bucket in the list of buckets. Thus, the concept of "start" and "end"
     * isn't relevant here - just the "end" offset is needed, which identifies the next bucket to process.
     *
     * @param start
     * @param end
     * @return
     */
    @Override
    public InputPartition[] planInputPartitions(Offset start, Offset end) {
        int index = (int) ((LongOffset) end).offset();
        return index >= allBuckets.size() ?
            null :
            new InputPartition[]{new PlanAnalysis.Partition(index + "", allBuckets.get(index))};
    }

    @Override
    public PartitionReaderFactory createReaderFactory() {
        return new OpticPartitionReaderFactory(this.opticReadContext);
    }

    @Override
    public Offset initialOffset() {
        return new LongOffset(0);
    }

    @Override
    public Offset deserializeOffset(String json) {
        return new LongOffset(Long.parseLong(json));
    }

    @Override
    public void commit(Offset end) {
        if (logger.isDebugEnabled()) {
            logger.debug("Committing offset: {}", end);
        }
    }

    @Override
    public void stop() {
        Util.MAIN_LOGGER.info("Stopping");
    }
}
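
Two illustrative sketches follow. First, a hypothetical configuration of a streaming read that would exercise this class. The format name and option keys here are assumptions modeled on the connector's "spark.marklogic.*" naming convention and may differ across versions; as the class javadoc notes, the number of partitions and the batch size together determine how many buckets, and therefore how many micro-batches, are created.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class StreamingReadSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
        // Option keys below are assumptions, not verified against a specific release.
        Dataset<Row> rows = spark.readStream()
            .format("marklogic")
            .option("spark.marklogic.client.uri", "user:password@localhost:8000")
            .option("spark.marklogic.read.opticQuery", "op.fromView('Example', 'Employees')")
            // More partitions and/or a smaller batch size produce more buckets,
            // i.e. more micro-batches.
            .option("spark.marklogic.read.numPartitions", "2")
            .option("spark.marklogic.read.batchSize", "10000")
            .load();
        rows.printSchema();
    }
}

Second, a minimal sketch of the call sequence Spark's micro-batch engine performs against a MicroBatchStream, included only to make the offset-as-bucket-index contract concrete. Spark itself issues these calls; the drive method is purely illustrative and simplifies the real engine.

import org.apache.spark.sql.connector.read.InputPartition;
import org.apache.spark.sql.connector.read.PartitionReaderFactory;
import org.apache.spark.sql.connector.read.streaming.MicroBatchStream;
import org.apache.spark.sql.connector.read.streaming.Offset;

public class MicroBatchLoopSketch {

    // Roughly how the engine consumes OpticMicroBatchStream: each call to
    // latestOffset() advances to the next bucket index, and planInputPartitions()
    // turns that index into a single-bucket partition.
    static void drive(MicroBatchStream stream) {
        Offset current = stream.initialOffset(); // LongOffset(0): the first bucket
        Offset latest;
        while ((latest = stream.latestOffset()) != null) { // null once all buckets are consumed
            InputPartition[] partitions = stream.planInputPartitions(current, latest);
            if (partitions == null) {
                break;
            }
            PartitionReaderFactory factory = stream.createReaderFactory();
            // ... executors would use the factory to read the rows in this one bucket ...
            stream.commit(latest);
            current = latest;
        }
        stream.stop();
    }
}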




