org.broadinstitute.hellbender.tools.spark.pathseq.PathSeqBwaSpark Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.spark.pathseq;

import htsjdk.samtools.SAMFileHeader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.MetagenomicsProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadsWriteFormat;
import scala.Tuple2;

import java.io.IOException;

/**
 * Align reads to a microbe reference using BWA-MEM and Spark. Second step in the PathSeq pipeline.
 *
 * See PathSeqPipelineSpark for an overview of the PathSeq pipeline.
 *
 * This is a specialized version of BwaSpark designed for the PathSeq pipeline. The main difference is that alignments
 * with SAM bit flag 0x100 or 0x800 (indicating secondary or supplementary alignment) are omitted in the output.
 *
 * Inputs
 *
 * 
 *     Unaligned queryname-sorted BAM file containing only paired reads (paired-end reads with mates)
 *     Unaligned BAM file containing only unpaired reads (paired-end reads without mates and/or single-end reads)
 *     *Microbe reference BWA-MEM index image generated using BwaMemIndexImageCreator
 *     *Indexed microbe reference dictionary (fasta file NOT required)
 * 
 *
 * *A standard microbe reference is available in the GATK Resource Bundle.
 *
 * Output
 *
 * 
 *     Aligned BAM file containing the paired reads (paired-end reads with mates)
 *     Aligned BAM file containing the unpaired reads (paired-end reads without mates and/or single-end reads)
 * 
 *
 * Usage example
 *
 * This tool can be run without explicitly specifying Spark options. That is to say, the given example command
 * without Spark options will run locally. See
 * Tutorial#10060 for an example
 * of how to set up and run a Spark tool on a cloud Spark cluster.
 *
 * Local mode:
 *
 *  * gatk PathSeqBwaSpark  \
 *   --paired-input input_reads_paired.bam \
 *   --unpaired-input input_reads_unpaired.bam \
 *   --paired-output output_reads_paired.bam \
 *   --unpaired-output output_reads_unpaired.bam \
 *   --microbe-bwa-image reference.img \
 *   --microbe-dict reference.dict
 * 
 *
 * Spark cluster on Google Cloud DataProc with 6 32-core / 208GB memory worker nodes:
 *
 *  * gatk PathSeqBwaSpark  \
 *   --paired-input gs://my-gcs-bucket/input_reads_paired.bam \
 *   --unpaired-input gs://my-gcs-bucket/input_reads_unpaired.bam \
 *   --paired-output gs://my-gcs-bucket/output_reads_paired.bam \
 *   --unpaired-output gs://my-gcs-bucket/output_reads_unpaired.bam \
 *   --microbe-bwa-image /references/reference.img \
 *   --microbe-dict hdfs://my-cluster-m:8020//references/reference.dict \
 *   --bam-partition-size 4000000 \
 *   -- \
 *   --sparkRunner GCS \
 *   --cluster my_cluster \
 *   --driver-memory 8G \
 *   --executor-memory 32G \
 *   --num-executors 4 \
 *   --executor-cores 30 \
 *   --conf spark.executor.memoryOverhead=132000
 * 
 *
 * Note that the microbe BWA image must be copied to the same path on every worker node. The microbe FASTA
 * may also be copied to a single path on every worker node or to HDFS.
 *
 * Notes
 *
 * For small input BAMs, it is recommended that the user reduce the BAM partition size in order to increase parallelism. Note
 * that insert size is estimated separately for each Spark partition. Consequently partition size and other Spark
 * parameters can affect the output for paired-end alignment.
 *
 * To minimize output file size, header lines are included only for sequences with at least one alignment.
 *
 * @author Mark Walker <[email protected]>
 */

@CommandLineProgramProperties(summary = "Align reads to a microbe reference using BWA-MEM and Spark. Second step in the PathSeq pipeline.",
        oneLineSummary = "Step 2: Aligns reads to the microbe reference",
        programGroup = MetagenomicsProgramGroup.class)
@DocumentedFeature
public final class PathSeqBwaSpark extends GATKSparkTool {

    private static final long serialVersionUID = 1L;

    public static final String PAIRED_INPUT_LONG_NAME = "paired-input";
    public static final String UNPAIRED_INPUT_LONG_NAME = "unpaired-input";
    public static final String PAIRED_OUTPUT_LONG_NAME = "paired-output";
    public static final String UNPAIRED_OUTPUT_LONG_NAME = "unpaired-output";

    @Argument(doc = "Input queryname-sorted BAM containing only paired reads",
            fullName = PAIRED_INPUT_LONG_NAME,
            optional = true)
    public String inputPaired = null;

    @Argument(doc = "Input BAM containing only unpaired reads",
            fullName = UNPAIRED_INPUT_LONG_NAME,
            optional = true)
    public String inputUnpaired = null;

    @Argument(doc = "Output BAM containing only paired reads",
            fullName = PAIRED_OUTPUT_LONG_NAME,
            optional = true)
    public String outputPaired = null;

    @Argument(doc = "Output BAM containing only unpaired reads",
            fullName = UNPAIRED_OUTPUT_LONG_NAME,
            optional = true)
    public String outputUnpaired = null;

    @ArgumentCollection
    public PSBwaArgumentCollection bwaArgs = new PSBwaArgumentCollection();

    /**
     * Reads bam from path and returns tuple of the header and reads RDD
     */
    private Tuple2> loadBam(final String path,
                                                             final ReadsSparkSource readsSource) {
        if (path == null) return null;
        if (BucketUtils.fileExists(path)) {
            final SAMFileHeader header = readsSource.getHeader(new GATKPath(path), null);
            if (header.getSequenceDictionary() != null && !header.getSequenceDictionary().isEmpty()) {
                throw new UserException.BadInput("Input BAM should be unaligned, but found one or more sequences in the header.");
            }
            PSBwaUtils.addReferenceSequencesToHeader(header, bwaArgs.microbeDictionary);
            final JavaRDD reads = readsSource.getParallelReads(new GATKPath(path), null, null, bamPartitionSplitSize);
            return new Tuple2<>(header, reads);
        }
        logger.warn("Could not find file " + path + ". Skipping...");
        return null;
    }

    /**
     * Writes RDD of reads to path. Note writeReads() is not used because there are separate paired/unpaired outputs.
     * Header sequence dictionary is reduced to only those that were aligned to.
     */
    private void writeBam(final JavaRDD reads, final String inputBamPath, final boolean isPaired,
                          final JavaSparkContext ctx, SAMFileHeader header) {

        //Only retain header sequences that were aligned to.
        //This invokes an action and therefore the reads must be cached.
        reads.persist(StorageLevel.MEMORY_AND_DISK_SER());
        header = PSBwaUtils.removeUnmappedHeaderSequences(header, reads, logger);

        final String outputPath = isPaired ? outputPaired : outputUnpaired;
        try {
            ReadsSparkSink.writeReads(ctx, outputPath, null, reads, header,
                    shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE,
                    PSUtils.pathseqGetRecommendedNumReducers(inputBamPath, numReducers, getTargetPartitionSize()), shardedPartsDir, true, splittingIndexGranularity);
        } catch (final IOException e) {
            throw new UserException.CouldNotCreateOutputFile(outputPath, "Writing failed", e);
        }
    }

    /**
     * Loads a bam, aligns using the given aligner, and writes to a new bam. Returns false if the input bam could not
     * be read.
     */
    private boolean alignBam(final String inputBamPath, final PSBwaAlignerSpark aligner, final boolean isPaired,
                             final JavaSparkContext ctx, final ReadsSparkSource readsSource) {

        final Tuple2> loadedBam = loadBam(inputBamPath, readsSource);
        if (loadedBam == null) return false;
        final SAMFileHeader header = loadedBam._1;
        final JavaRDD reads = loadedBam._2;
        Utils.nonNull(header);
        Utils.nonNull(reads);
        if (isPaired && !header.getSortOrder().equals(SAMFileHeader.SortOrder.queryname)) {
            throw new UserException.BadInput("Paired input BAM must be sorted by queryname");
        }

        final JavaRDD alignedReads = aligner.doBwaAlignment(reads, isPaired, ctx.broadcast(header));
        writeBam(alignedReads, inputBamPath, isPaired, ctx, header);
        return true;
    }

    @Override
    protected void runTool(final JavaSparkContext ctx) {

        if (!readArguments.getReadPathSpecifiers().isEmpty()) {
            throw new UserException.BadInput("Please use --paired-input or --unpaired-input instead of --input");
        }
        Utils.validateArg((outputPaired == null || new GATKPath(outputPaired).isBam()) &&
                        (outputUnpaired == null || new GATKPath(outputUnpaired).isBam()),
                        "Only BAM output is supported");
        final ReadsSparkSource readsSource = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency());

        final PSBwaAlignerSpark aligner = new PSBwaAlignerSpark(ctx, bwaArgs);
        boolean bPairedSuccess = alignBam(inputPaired, aligner, true, ctx, readsSource);
        boolean bUnpairedSuccess = alignBam(inputUnpaired, aligner, false, ctx, readsSource);
        if (!bPairedSuccess && !bUnpairedSuccess) {
            throw new UserException.BadInput("No reads were loaded. Ensure --paired-input and/or --unpaired-input are set and valid.");
        }
        aligner.close();
    }

}