package org.broadinstitute.hellbender.engine.spark;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.spark.datasources.ReferenceMultiSparkSource;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
/**
 * A Spark version of {@link IntervalWalker}. Subclasses should implement {@link #processIntervals(JavaRDD, JavaSparkContext)}
 * and operate on the passed-in RDD.
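 *
 * <p>For illustration only, a minimal subclass might look like the sketch below. The class name and the
 * output path are invented for this example, the accessors on {@link IntervalWalkerContext} are assumed to
 * be the usual getters, and a real tool would also carry the standard GATK tool annotations (omitted here):</p>
 * <pre>{@code
 * public final class CountReadsPerIntervalSpark extends IntervalWalkerSpark {
 *     private static final long serialVersionUID = 1L;
 *
 *     protected void processIntervals(JavaRDD<IntervalWalkerContext> rdd, JavaSparkContext ctx) {
 *         rdd.map(c -> {
 *             long n = 0;
 *             for (GATKRead read : c.getReadsContext()) { n++; }  // reads overlapping this interval
 *             return c.getInterval() + "\t" + n;
 *         }).saveAsTextFile("read_counts_per_interval");          // illustrative output location
 *     }
 * }
 * }</pre>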
 */
public abstract class IntervalWalkerSpark extends GATKSparkTool {
    private static final long serialVersionUID = 1L;

    @Override
    public boolean requiresIntervals() {
        return true;
    }

    @Argument(doc = "whether to use the shuffle implementation or not", shortName = "shuffle", fullName = "shuffle", optional = true)
    public boolean shuffle = false;

    private String referenceFileName;
    /**
     * Customize initialization of the Feature data source for this traversal type to disable query lookahead.
     */
    @Override
    void initializeFeatures() {
        // Disable query lookahead in our FeatureManager for this traversal type. Query lookahead helps
        // when our query intervals are overlapping and gradually increasing in position (as they are
        // with ReadWalkers, typically), but with IntervalWalkers our query intervals are guaranteed
        // to be non-overlapping, since our interval parsing code always merges overlapping intervals.
        features = new FeatureManager(this, 0);
        if ( features.isEmpty() ) {  // No available sources of Features for this tool
            features = null;
        }
    }
    /**
     * Loads intervals and the corresponding reads, reference and features into a {@link JavaRDD}.
     *
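     * For example, each element of the returned RDD bundles one interval with its overlapping reads,
     * reference bases and features; a sketch of direct use (this method is normally invoked for you by
     * {@link #runTool(JavaSparkContext)}, and the {@code getInterval()} accessor is assumed) might be:
     * <pre>{@code
     * JavaRDD<IntervalWalkerContext> contexts = getIntervals(ctx);
     * JavaRDD<String> intervalNames = contexts.map(c -> c.getInterval().toString());
     * }</pre>
     *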
     * @return all intervals as a {@link JavaRDD} of {@link IntervalWalkerContext}.
     */
    public JavaRDD<IntervalWalkerContext> getIntervals(JavaSparkContext ctx) {
        SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
        // don't shard the intervals themselves, since we want each interval to be processed by a single task
        final List<ShardBoundary> intervalShardBoundaries = getIntervals().stream()
                .map(i -> new ShardBoundary(i, i)).collect(Collectors.toList());
        JavaRDD<Shard<GATKRead>> shardedReads = SparkSharder.shard(ctx, getReads(), GATKRead.class, sequenceDictionary, intervalShardBoundaries, Integer.MAX_VALUE, shuffle);
        Broadcast<FeatureManager> bFeatureManager = features == null ? null : ctx.broadcast(features);
        return shardedReads.map(getIntervalsFunction(referenceFileName, bFeatureManager));
    }
    private static org.apache.spark.api.java.function.Function<Shard<GATKRead>, IntervalWalkerContext> getIntervalsFunction(
            String referenceFileName, Broadcast<FeatureManager> bFeatureManager) {
        return (org.apache.spark.api.java.function.Function<Shard<GATKRead>, IntervalWalkerContext>) shard -> {
            // the query interval for this shard (interval walkers add no padding)
            SimpleInterval interval = shard.getInterval();
            ReadsContext readsContext = new ReadsContext(new GATKDataSource<GATKRead>() {
                @Override
                public Iterator<GATKRead> iterator() {
                    return shard.iterator();
                }

                @Override
                public Iterator<GATKRead> query(SimpleInterval interval) {
                    return StreamSupport.stream(shard.spliterator(), false).filter(
                            r -> IntervalUtils.overlaps(r, interval)).iterator();
                }
            }, shard.getInterval());
            ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
            FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
            return new IntervalWalkerContext(interval, readsContext, new ReferenceContext(reference, interval), new FeatureContext(features, interval));
        };
    }
    @Override
    protected void runTool(JavaSparkContext ctx) {
        referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
        processIntervals(getIntervals(ctx), ctx);
    }
    /**
     * Process the intervals and write output. Must be implemented by subclasses.
     *
     * @param rdd a distributed collection of {@link IntervalWalkerContext}
     * @param ctx our Spark context
     */
    protected abstract void processIntervals(JavaRDD<IntervalWalkerContext> rdd, JavaSparkContext ctx);
}