package org.broadinstitute.hellbender.engine.spark;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.spark.datasources.ReferenceMultiSparkSource;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
/**
 * A Spark version of {@link IntervalWalker}. Subclasses should implement {@link #processIntervals(JavaRDD, JavaSparkContext)}
 * and operate on the passed-in RDD.
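 *
 * <p>For illustration only, a minimal subclass might look like the sketch below. The class name and the
 * output path are invented for this example, the accessors on {@link IntervalWalkerContext} are assumed to
 * be the usual getters, and a real tool would also carry the standard GATK tool annotations (omitted here):</p>
 * <pre>{@code
 * public final class CountReadsPerIntervalSpark extends IntervalWalkerSpark {
 *     private static final long serialVersionUID = 1L;
 *
 *     protected void processIntervals(JavaRDD<IntervalWalkerContext> rdd, JavaSparkContext ctx) {
 *         rdd.map(c -> {
 *             long n = 0;
 *             for (GATKRead read : c.getReadsContext()) { n++; }  // reads overlapping this interval
 *             return c.getInterval() + "\t" + n;
 *         }).saveAsTextFile("read_counts_per_interval");          // illustrative output location
 *     }
 * }
 * }</pre>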
 */
public abstract class IntervalWalkerSpark extends GATKSparkTool {
    private static final long serialVersionUID = 1L;

    @Override
    public boolean requiresIntervals() {
        return true;
    }

    @Argument(doc = "whether to use the shuffle implementation or not", shortName = "shuffle", fullName = "shuffle", optional = true)
    public boolean shuffle = false;

    private String referenceFileName;
    /**
     * Customize initialization of the Feature data source for this traversal type to disable query lookahead.
     */
    @Override
    void initializeFeatures() {
        // Disable query lookahead in our FeatureManager for this traversal type. Query lookahead helps
        // when our query intervals are overlapping and gradually increasing in position (as they are
        // with ReadWalkers, typically), but with IntervalWalkers our query intervals are guaranteed
        // to be non-overlapping, since our interval parsing code always merges overlapping intervals.
        features = new FeatureManager(this, 0);
        if ( features.isEmpty() ) {  // No available sources of Features for this tool
            features = null;
        }
    }
    /**
     * Loads intervals and the corresponding reads, reference and features into a {@link JavaRDD}.
     *
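     * For example, each element of the returned RDD bundles one interval with its overlapping reads,
     * reference bases and features; a sketch of direct use (this method is normally invoked for you by
     * {@link #runTool(JavaSparkContext)}, and the {@code getInterval()} accessor is assumed) might be:
     * <pre>{@code
     * JavaRDD<IntervalWalkerContext> contexts = getIntervals(ctx);
     * JavaRDD<String> intervalNames = contexts.map(c -> c.getInterval().toString());
     * }</pre>
     *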
     * @return all intervals as a {@link JavaRDD} of {@link IntervalWalkerContext}.
     */
    public JavaRDD<IntervalWalkerContext> getIntervals(JavaSparkContext ctx) {
        SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
        // don't shard the intervals themselves, since we want each interval to be processed by a single task
        final List<ShardBoundary> intervalShardBoundaries = getIntervals().stream()
                .map(i -> new ShardBoundary(i, i)).collect(Collectors.toList());
        JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, getReads(), GATKRead.class, sequenceDictionary, intervalShardBoundaries, Integer.MAX_VALUE, shuffle);
        Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
        return shardedReads.map(getIntervalsFunction(referenceFileName, bFeatureManager));
    }
    private static org.apache.spark.api.java.function.Function<Shard<GATKRead>, IntervalWalkerContext> getIntervalsFunction(
            String referenceFileName, Broadcast<FeatureManager> bFeatureManager) {
        return (org.apache.spark.api.java.function.Function<Shard<GATKRead>, IntervalWalkerContext>) shard -> {
            // the query interval for this shard (interval walkers add no padding)
            SimpleInterval interval = shard.getInterval();
            ReadsContext readsContext = new ReadsContext(new GATKDataSource<GATKRead>() {
                @Override
                public Iterator<GATKRead> iterator() {
                    return shard.iterator();
                }

                @Override
                public Iterator<GATKRead> query(SimpleInterval interval) {
                    return StreamSupport.stream(shard.spliterator(), false).filter(
                            r -> IntervalUtils.overlaps(r, interval)).iterator();
                }
            }, shard.getInterval());
            ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
            FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
            return new IntervalWalkerContext(interval, readsContext, new ReferenceContext(reference, interval), new FeatureContext(features, interval));
        };
    }
    @Override
    protected void runTool(JavaSparkContext ctx) {
        referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
        processIntervals(getIntervals(ctx), ctx);
    }
    /**
     * Process the intervals and write output. Must be implemented by subclasses.
     *
     * @param rdd a distributed collection of {@link IntervalWalkerContext}
     * @param ctx our Spark context
     */
    protected abstract void processIntervals(JavaRDD<IntervalWalkerContext> rdd, JavaSparkContext ctx);
}