package org.broadinstitute.hellbender.tools.spark.pathseq;

import htsjdk.samtools.SAMFileHeader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.MetagenomicsProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadsWriteFormat;
import scala.Tuple2;

import java.io.IOException;

/**
 * Align reads to a microbe reference using BWA-MEM and Spark. Second step in the PathSeq pipeline.
 *
 * <p>See PathSeqPipelineSpark for an overview of the PathSeq pipeline.</p>
 *
 * <p>This is a specialized version of BwaSpark designed for the PathSeq pipeline. The main difference is that alignments
 * with SAM bit flag 0x100 or 0x800 (indicating secondary or supplementary alignment) are omitted in the output.</p>
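 *
 * <p>As an informal illustration only (this snippet is not taken from the tool itself; {@code reads} here stands
 * for an assumed {@code JavaRDD<GATKRead>} of alignments), records carrying those flags could be dropped with the
 * standard GATKRead accessors:</p>
 *
 * <pre>{@code
 * JavaRDD<GATKRead> primaryOnly = reads.filter(read ->
 *         !read.isSecondaryAlignment() && !read.isSupplementaryAlignment());
 * }</pre>
 *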

 * <h3>Inputs</h3>
 *
 * <ul>
 *     <li>Unaligned queryname-sorted BAM file containing only paired reads (paired-end reads with mates)</li>
 *     <li>Unaligned BAM file containing only unpaired reads (paired-end reads without mates and/or single-end reads)</li>
 *     <li>Microbe reference BWA-MEM index image generated using BwaMemIndexImageCreator</li>
 *     <li>Indexed microbe reference dictionary (fasta file NOT required)</li>
 * </ul>
 *
 * <p>A standard microbe reference is available in the GATK Resource Bundle.</p>

 *
 * <h3>Output</h3>
 *
 * <ul>
 *     <li>Aligned BAM file containing the paired reads (paired-end reads with mates)</li>
 *     <li>Aligned BAM file containing the unpaired reads (paired-end reads without mates and/or single-end reads)</li>
 * </ul>
 *
 * <h3>Usage example</h3>
 *

 * <p>This tool can be run without explicitly specifying Spark options. That is to say, the given example command
 * without Spark options will run locally. See Tutorial#10060 for an example
 * of how to set up and run a Spark tool on a cloud Spark cluster.</p>
 *
 * <p>Local mode:</p>
 *
 * <pre>
 * gatk PathSeqBwaSpark  \
 *   --paired-input input_reads_paired.bam \
 *   --unpaired-input input_reads_unpaired.bam \
 *   --paired-output output_reads_paired.bam \
 *   --unpaired-output output_reads_unpaired.bam \
 *   --microbe-bwa-image reference.img \
 *   --microbe-dict reference.dict
 * </pre>
 *
 * <p>Spark cluster on Google Cloud DataProc with 6 32-core / 208GB memory worker nodes:</p>
 *
 * <pre>
 * gatk PathSeqBwaSpark  \
 *   --paired-input gs://my-gcs-bucket/input_reads_paired.bam \
 *   --unpaired-input gs://my-gcs-bucket/input_reads_unpaired.bam \
 *   --paired-output gs://my-gcs-bucket/output_reads_paired.bam \
 *   --unpaired-output gs://my-gcs-bucket/output_reads_unpaired.bam \
 *   --microbe-bwa-image /references/reference.img \
 *   --microbe-dict hdfs://my-cluster-m:8020//references/reference.dict \
 *   --bam-partition-size 4000000 \
 *   -- \
 *   --sparkRunner GCS \
 *   --cluster my_cluster \
 *   --driver-memory 8G \
 *   --executor-memory 32G \
 *   --num-executors 4 \
 *   --executor-cores 30 \
 *   --conf spark.executor.memoryOverhead=132000
 * </pre>
 *
 * <p>Note that the microbe BWA image must be copied to the same path on every worker node. The microbe dictionary
 * may also be copied to a single path on every worker node or to HDFS.</p>
 *
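 * <p>Purely as an illustration (the paths below are hypothetical), the dictionary could be staged to HDFS with the
 * standard Hadoop client, while the image is placed at an identical local path on each worker by whatever mechanism
 * the cluster provides:</p>
 *
 * <pre>
 * hdfs dfs -mkdir -p /references
 * hdfs dfs -put reference.dict /references/reference.dict
 * </pre>
 *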

 * <h3>Notes</h3>
 *
 * <p>For small input BAMs, it is recommended that the user reduce the BAM partition size in order to increase
 * parallelism. Note that insert size is estimated separately for each Spark partition. Consequently, partition size
 * and other Spark parameters can affect the output for paired-end alignment.</p>
 *
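 * <p>For example (the value below is illustrative only, not a recommendation), a smaller partition size can be
 * requested by adding the following to either of the commands above:</p>
 *
 * <pre>
 *   --bam-partition-size 1000000
 * </pre>
 *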

 * <p>To minimize output file size, header lines are included only for sequences with at least one alignment.</p>
 *
 * @author Mark Walker <[email protected]>
 */
@CommandLineProgramProperties(summary = "Align reads to a microbe reference using BWA-MEM and Spark. Second step in the PathSeq pipeline.",
        oneLineSummary = "Step 2: Aligns reads to the microbe reference",
        programGroup = MetagenomicsProgramGroup.class)
@DocumentedFeature
public final class PathSeqBwaSpark extends GATKSparkTool {

    private static final long serialVersionUID = 1L;

    public static final String PAIRED_INPUT_LONG_NAME = "paired-input";
    public static final String UNPAIRED_INPUT_LONG_NAME = "unpaired-input";
    public static final String PAIRED_OUTPUT_LONG_NAME = "paired-output";
    public static final String UNPAIRED_OUTPUT_LONG_NAME = "unpaired-output";

    @Argument(doc = "Input queryname-sorted BAM containing only paired reads",
            fullName = PAIRED_INPUT_LONG_NAME,
            optional = true)
    public String inputPaired = null;

    @Argument(doc = "Input BAM containing only unpaired reads",
            fullName = UNPAIRED_INPUT_LONG_NAME,
            optional = true)
    public String inputUnpaired = null;

    @Argument(doc = "Output BAM containing only paired reads",
            fullName = PAIRED_OUTPUT_LONG_NAME,
            optional = true)
    public String outputPaired = null;

    @Argument(doc = "Output BAM containing only unpaired reads",
            fullName = UNPAIRED_OUTPUT_LONG_NAME,
            optional = true)
    public String outputUnpaired = null;

    @ArgumentCollection
    public PSBwaArgumentCollection bwaArgs = new PSBwaArgumentCollection();

    /**
     * Reads bam from path and returns tuple of the header and reads RDD
     */
    private Tuple2<SAMFileHeader, JavaRDD<GATKRead>> loadBam(final String path, final ReadsSparkSource readsSource) {
        if (path == null) return null;
        if (BucketUtils.fileExists(path)) {
            final SAMFileHeader header = readsSource.getHeader(new GATKPath(path), null);
            if (header.getSequenceDictionary() != null && !header.getSequenceDictionary().isEmpty()) {
                throw new UserException.BadInput("Input BAM should be unaligned, but found one or more sequences in the header.");
            }
            PSBwaUtils.addReferenceSequencesToHeader(header, bwaArgs.microbeDictionary);
            final JavaRDD<GATKRead> reads = readsSource.getParallelReads(new GATKPath(path), null, null, bamPartitionSplitSize);
            return new Tuple2<>(header, reads);
        }
        logger.warn("Could not find file " + path + ". Skipping...");
        return null;
    }

    /**
     * Writes RDD of reads to path. Note writeReads() is not used because there are separate paired/unpaired outputs.
     * Header sequence dictionary is reduced to only those that were aligned to.
     */
    private void writeBam(final JavaRDD<GATKRead> reads, final String inputBamPath, final boolean isPaired,
                          final JavaSparkContext ctx, SAMFileHeader header) {
        //Only retain header sequences that were aligned to.
        //This invokes an action and therefore the reads must be cached.
        reads.persist(StorageLevel.MEMORY_AND_DISK_SER());
        header = PSBwaUtils.removeUnmappedHeaderSequences(header, reads, logger);

        final String outputPath = isPaired ? outputPaired : outputUnpaired;
        try {
            ReadsSparkSink.writeReads(ctx, outputPath, null, reads, header,
                    shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE,
                    PSUtils.pathseqGetRecommendedNumReducers(inputBamPath, numReducers, getTargetPartitionSize()),
                    shardedPartsDir, true, splittingIndexGranularity);
        } catch (final IOException e) {
            throw new UserException.CouldNotCreateOutputFile(outputPath, "Writing failed", e);
        }
    }

    /**
     * Loads a bam, aligns using the given aligner, and writes to a new bam. Returns false if the input bam could not
     * be read.
     */
    private boolean alignBam(final String inputBamPath, final PSBwaAlignerSpark aligner, final boolean isPaired,
                             final JavaSparkContext ctx, final ReadsSparkSource readsSource) {
        final Tuple2<SAMFileHeader, JavaRDD<GATKRead>> loadedBam = loadBam(inputBamPath, readsSource);
        if (loadedBam == null) return false;
        final SAMFileHeader header = loadedBam._1;
        final JavaRDD<GATKRead> reads = loadedBam._2;
        Utils.nonNull(header);
        Utils.nonNull(reads);
        if (isPaired && !header.getSortOrder().equals(SAMFileHeader.SortOrder.queryname)) {
            throw new UserException.BadInput("Paired input BAM must be sorted by queryname");
        }
        final JavaRDD<GATKRead> alignedReads = aligner.doBwaAlignment(reads, isPaired, ctx.broadcast(header));
        writeBam(alignedReads, inputBamPath, isPaired, ctx, header);
        return true;
    }

    @Override
    protected void runTool(final JavaSparkContext ctx) {
        if (!readArguments.getReadPathSpecifiers().isEmpty()) {
            throw new UserException.BadInput("Please use --paired-input or --unpaired-input instead of --input");
        }
        Utils.validateArg((outputPaired == null || new GATKPath(outputPaired).isBam())
                        && (outputUnpaired == null || new GATKPath(outputUnpaired).isBam()),
                "Only BAM output is supported");
        final ReadsSparkSource readsSource = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency());
        final PSBwaAlignerSpark aligner = new PSBwaAlignerSpark(ctx, bwaArgs);
        boolean bPairedSuccess = alignBam(inputPaired, aligner, true, ctx, readsSource);
        boolean bUnpairedSuccess = alignBam(inputUnpaired, aligner, false, ctx, readsSource);
        if (!bPairedSuccess && !bUnpairedSuccess) {
            throw new UserException.BadInput("No reads were loaded. Ensure --paired-input and/or --unpaired-input are set and valid.");
        }
        aligner.close();
    }
}



