package org.broadinstitute.hellbender.tools.spark.pathseq;
import htsjdk.samtools.SAMFileHeader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.storage.StorageLevel;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.MetagenomicsProgramGroup;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.pathseq.loggers.*;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadsWriteFormat;
import scala.Tuple2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* Combined tool that performs all PathSeq steps: read filtering, microbe reference alignment and abundance scoring
*
* PathSeq is a suite of tools for detecting microbial organisms in deeply sequenced biological samples. It is capable of
* (1) quantifying microbial abundances in metagenomic samples containing a mixture of organisms, (2) detecting extremely
* low-abundance (<0.001%) organisms, and (3) identifying unknown sequences that may belong to novel organisms. The pipeline is based
* on a previously published tool of the same name (Kostic et al. 2011), which has been
* used in a wide range of studies to investigate novel associations between pathogens and human disease.
*
* The pipeline consists of three phases: (1) removing reads that are low quality, low complexity, or match a given
* host (e.g. human) reference, (2) aligning the remaining reads to a microorganism reference, and (3) determining
* the taxonomic classification of each read and estimating microbe abundances. These steps can be performed individually using
* PathSeqFilterSpark, PathSeqBwaSpark, and PathSeqScoreSpark. To simplify using the pipeline, this tool combines the
* three steps into one. Further details can be found in the individual tools' documentation.
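*
* For example, the phased workflow can be run as three separate commands, passing the filtered reads from
* PathSeqFilterSpark to PathSeqBwaSpark and the aligned reads to PathSeqScoreSpark. This sketch is illustrative
* only; the paired/unpaired input and output argument names are assumptions, so consult each tool's documentation
* for the exact options.
*
* gatk PathSeqFilterSpark \
* --input input_reads.bam \
* --kmer-file host_kmers.bfi \
* --filter-bwa-image host_reference.img \
* --paired-output filtered_paired.bam \
* --unpaired-output filtered_unpaired.bam
*
* gatk PathSeqBwaSpark \
* --paired-input filtered_paired.bam \
* --unpaired-input filtered_unpaired.bam \
* --microbe-bwa-image microbe_reference.img \
* --microbe-dict reference.dict \
* --paired-output aligned_paired.bam \
* --unpaired-output aligned_unpaired.bam
*
* gatk PathSeqScoreSpark \
* --paired-input aligned_paired.bam \
* --unpaired-input aligned_unpaired.bam \
* --taxonomy-file taxonomy.db \
* --scores-output scores.txt \
* --output output_reads.bam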
*
* The filtering phase ensures that only high-fidelity, non-host reads are classified, thus reducing computational costs
* and false positives. Note that while generally applicable to any type of biological sample (e.g. saliva, stool), PathSeq
* is particularly efficient for samples containing a high percentage of host reads (e.g. blood, tissue, CSF). PathSeq
* is able to detect evidence of low-abundance organisms and scales to use comprehensive genomic database references
* (e.g. > 100 Gbp). Lastly, because PathSeq works by identifying both host and known microbial sequences, it can also
* be used to discover novel pathogens by reducing the sample to sequences of unknown origin, which may be followed
* by de novo assembly.
*
* Because sequence alignment is computationally burdensome, PathSeq is integrated with Apache Spark,
* enabling parallelization of all steps in the pipeline on multi-core workstations and cluster environments. This
* overcomes the high computational cost and permits rapid turnaround times (minutes to hours) for deeply sequenced samples.
*
* Reference files
*
* Before running the PathSeq pipeline, the host and microbe references must be built. Prebuilt references
* for a standard microbial set are available in the
* GATK Resource Bundle.
*
* To build custom references, users must provide FASTA files of the host and pathogen sequences. Tools are included to
* generate the necessary files: the host k-mer database (PathSeqBuildKmers), BWA-MEM index image files of the host and
* pathogen references (BwaMemIndexImageCreator), and a taxonomic tree of the pathogen reference (PathSeqBuildReferenceTaxonomy).
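*
* For example, the reference files might be generated along these lines (argument names are assumptions for
* illustration; consult each tool's documentation for the exact options, and note that the microbe sequence
* dictionary can be produced with Picard CreateSequenceDictionary):
*
* gatk PathSeqBuildKmers \
* --reference host_reference.fasta \
* --output host_kmers.bfi
*
* gatk BwaMemIndexImageCreator \
* --input host_reference.fasta \
* --output host_reference.img
*
* gatk BwaMemIndexImageCreator \
* --input microbe_reference.fasta \
* --output microbe_reference.img
*
* gatk PathSeqBuildReferenceTaxonomy \
* --reference microbe_reference.fasta \
* --refseq-catalog RefSeq-release-catalog.gz \
* --output taxonomy.db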
*
* Input
*
*
* - BAM containing input reads (either unaligned or aligned to a host reference)
* - Host k-mer file generated using PathSeqBuildKmers
* - Host BWA-MEM index image generated using BwaMemIndexImageCreator
* - Microbe BWA-MEM index image generated using BwaMemIndexImageCreator
* - Indexed microbe reference dictionary (fasta file NOT required)
* - Taxonomy file generated using PathSeqBuildReferenceTaxonomy
*
*
* Output
*
*
* - Taxonomic scores table
* - Annotated BAM aligned to the microbe reference
* - Filter metrics file (optional)
* - Score metrics file (optional)
*
*
* Usage example
*
* This tool can be run without explicitly specifying Spark options; if the Spark options are omitted, as in the
* local-mode example below, the tool runs locally. See
* Tutorial#10060 for an example
* of how to set up and run a Spark tool on a cloud Spark cluster.
*
* Local mode:
*
*
* gatk PathSeqPipelineSpark \
* --input input_reads.bam \
* --kmer-file host_kmers.bfi \
* --filter-bwa-image host_reference.img \
* --microbe-bwa-image microbe_reference.img \
* --microbe-dict reference.dict \
* --taxonomy-file taxonomy.db \
* --min-clipped-read-length 60 \
* --min-score-identity 0.90 \
* --identity-margin 0.02 \
* --scores-output scores.txt \
* --output output_reads.bam \
* --filter-metrics filter_metrics.txt \
* --score-metrics score_metrics.txt
*
*
* Spark cluster on Google Cloud Dataproc with six 16-core / 208GB memory worker nodes:
*
*
* gatk PathSeqPipelineSpark \
* --input gs://my-gcs-bucket/input_reads.bam \
* --kmer-file hdfs://my-cluster-m:8020//host_kmers.bfi \
* --filter-bwa-image /references/host_reference.img \
* --microbe-bwa-image /references/microbe_reference.img \
* --microbe-dict hdfs://my-cluster-m:8020//reference.dict \
* --taxonomy-file hdfs://my-cluster-m:8020//taxonomy.db \
* --min-clipped-read-length 60 \
* --min-score-identity 0.90 \
* --identity-margin 0.02 \
* --scores-output gs://my-gcs-bucket/scores.txt \
* --output gs://my-gcs-bucket/output_reads.bam \
* --filter-metrics gs://my-gcs-bucket/filter_metrics.txt \
* --score-metrics gs://my-gcs-bucket/score_metrics.txt \
* -- \
* --sparkRunner GCS \
* --cluster my_cluster \
* --driver-memory 8G \
* --executor-memory 32G \
* --num-executors 4 \
* --executor-cores 30 \
* --conf spark.executor.memoryOverhead=132000
*
*
* Note that the host and microbe BWA images must be copied to the same paths on every worker node. The microbe dictionary,
* host k-mer file, and taxonomy file may also be copied to a single path on every worker node or to HDFS.
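*
* For example, assuming a standard Hadoop installation on the cluster, the smaller reference files could be staged
* on HDFS with commands along these lines (paths are illustrative):
*
* hdfs dfs -put host_kmers.bfi hdfs://my-cluster-m:8020//host_kmers.bfi
* hdfs dfs -put reference.dict hdfs://my-cluster-m:8020//reference.dict
* hdfs dfs -put taxonomy.db hdfs://my-cluster-m:8020//taxonomy.db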
*
* References
*
* - Walker, M. A., Pedamallu, C. S. et al. (2018). GATK PathSeq: a customizable computational tool for the discovery and identification of microbial sequences in libraries from eukaryotic hosts. Bioinformatics. 34, 4287-4289.
* - Kostic, A. D. et al. (2011). PathSeq: software to identify or discover microbes by deep sequencing of human tissue. Nat. Biotechnol. 29, 393-396.
*
*
* @author Mark Walker <[email protected]>
*/
@CommandLineProgramProperties(summary = "Combined tool that performs all PathSeq steps: read filtering, microbe reference alignment and abundance scoring",
oneLineSummary = "Combined tool that performs all steps: read filtering, microbe reference alignment, and abundance scoring",
programGroup = MetagenomicsProgramGroup.class)
@DocumentedFeature
public class PathSeqPipelineSpark extends GATKSparkTool {
private static final long serialVersionUID = 1L;
public static final String READS_PER_PARTITION_LONG_NAME = "pipeline-reads-per-partition";
@ArgumentCollection
public PSFilterArgumentCollection filterArgs = new PSFilterArgumentCollection();
@ArgumentCollection
public PSBwaArgumentCollection bwaArgs = new PSBwaArgumentCollection();
@ArgumentCollection
public PSScoreArgumentCollection scoreArgs = new PSScoreArgumentCollection();
@Argument(doc = "Output BAM",
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
optional = true)
public String outputPath = null;
@Argument(doc = "Number of reads per partition to use for alignment and scoring.",
fullName = READS_PER_PARTITION_LONG_NAME,
optional = true,
minValue = 100)
public int readsPerPartition = 5000;
/**
* Because numReducers is based on the input size, it produces too many partitions when the output is much smaller than the input.
*/
@Argument(doc = "Number of reads per partition for output. Use this to control the number of sharded BAMs (not --num-reducers).",
fullName = "readsPerPartitionOutput",
optional = true,
minValue = 100,
minRecommendedValue = 100000)
public int readsPerPartitionOutput = 1000000;
/**
* Reduces the number of partitions of paired reads, keeping pairs together.
*/
private static JavaRDD<GATKRead> repartitionPairedReads(final JavaRDD<GATKRead> pairedReads, final int alignmentPartitions, final long numReads) {
final int readsPerPartition = 1 + (int) (numReads / alignmentPartitions);
return pairedReads.mapPartitions(iter -> PathSeqPipelineSpark.pairPartitionReads(iter, readsPerPartition))
.repartition(alignmentPartitions)
.flatMap(List::iterator);
}
/**
* Maps a partition of paired reads to a partition of Lists, each containing one pair. Assumes pairs are adjacent.
*/
private static Iterator<List<GATKRead>> pairPartitionReads(final Iterator<GATKRead> iter, final int readsPerPartition) {
final ArrayList<List<GATKRead>> readPairs = new ArrayList<>(readsPerPartition / 2);
while (iter.hasNext()) {
final List<GATKRead> list = new ArrayList<>(2);
list.add(iter.next());
if (!iter.hasNext()) throw new GATKException("Odd number of reads in paired reads partition");
list.add(iter.next());
if (!list.get(0).getName().equals(list.get(1).getName())) throw new GATKException("Pair did not have the same name in a paired reads partition");
readPairs.add(list);
}
return readPairs.iterator();
}
@Override
public boolean requiresReads() {
return true;
}
@Override
protected void runTool(final JavaSparkContext ctx) {
filterArgs.doReadFilterArgumentWarnings(getCommandLineParser().getPluginDescriptor(GATKReadFilterPluginDescriptor.class), logger);
SAMFileHeader header = PSUtils.checkAndClearHeaderSequences(getHeaderForReads(), filterArgs, logger);
//Do not allow use of numReducers
if (numReducers > 0) {
throw new UserException.BadInput("Use --readsPerPartitionOutput instead of --num-reducers.");
}
//Filter
final Tuple2<JavaRDD<GATKRead>, JavaRDD<GATKRead>> filterResult;
final PSFilter filter = new PSFilter(ctx, filterArgs, header);
try (final PSFilterLogger filterLogger = filterArgs.filterMetricsFileUri != null ? new PSFilterFileLogger(getMetricsFile(), filterArgs.filterMetricsFileUri) : new PSFilterEmptyLogger()) {
final JavaRDD<GATKRead> inputReads = getReads();
filterResult = filter.doFilter(inputReads, filterLogger);
}
JavaRDD<GATKRead> pairedReads = filterResult._1;
JavaRDD<GATKRead> unpairedReads = filterResult._2;
//Counting forces an action on the RDDs to guarantee we're done with the Bwa image and kmer filter
final long numPairedReads = pairedReads.count();
final long numUnpairedReads = unpairedReads.count();
final long numTotalReads = numPairedReads + numUnpairedReads;
//Closes Bwa image, kmer filter, and metrics file if used
//Note the host Bwa image must be unloaded before trying to load the pathogen image
filter.close();
//Rebalance partitions using the counts
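//The +1 guarantees at least one partition and keeps the average partition size at or below the target reads-per-partition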
final int numPairedPartitions = 1 + (int) (numPairedReads / readsPerPartition);
final int numUnpairedPartitions = 1 + (int) (numUnpairedReads / readsPerPartition);
pairedReads = repartitionPairedReads(pairedReads, numPairedPartitions, numPairedReads);
unpairedReads = unpairedReads.repartition(numUnpairedPartitions);
//Bwa pathogen alignment
final PSBwaAlignerSpark aligner = new PSBwaAlignerSpark(ctx, bwaArgs);
PSBwaUtils.addReferenceSequencesToHeader(header, bwaArgs.microbeDictionary);
final Broadcast<SAMFileHeader> headerBroadcast = ctx.broadcast(header);
JavaRDD<GATKRead> alignedPairedReads = aligner.doBwaAlignment(pairedReads, true, headerBroadcast);
JavaRDD<GATKRead> alignedUnpairedReads = aligner.doBwaAlignment(unpairedReads, false, headerBroadcast);
//Cache this expensive result. Note serialization significantly reduces memory consumption.
alignedPairedReads.persist(StorageLevel.MEMORY_AND_DISK_SER());
alignedUnpairedReads.persist(StorageLevel.MEMORY_AND_DISK_SER());
//Score pathogens
final PSScorer scorer = new PSScorer(scoreArgs);
final JavaRDD<GATKRead> readsFinal = scorer.scoreReads(ctx, alignedPairedReads, alignedUnpairedReads, header);
//Clean up header
header = PSBwaUtils.removeUnmappedHeaderSequences(header, readsFinal, logger);
//Log read counts
if (scoreArgs.scoreMetricsFileUri != null) {
try (final PSScoreLogger scoreLogger = new PSScoreFileLogger(getMetricsFile(), scoreArgs.scoreMetricsFileUri)) {
scoreLogger.logReadCounts(readsFinal);
}
}
//Write reads to BAM, if specified
if (outputPath != null) {
try {
//Reduce number of partitions since we previously went to ~5K reads per partition, which
// is far too small for sharded output.
final int numPartitions = Math.max(1, (int) (numTotalReads / readsPerPartitionOutput));
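//Coalescing without shuffle merges existing partitions in place rather than triggering a full repartition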
final JavaRDD<GATKRead> readsFinalRepartitioned = readsFinal.coalesce(numPartitions, false);
ReadsSparkSink.writeReads(ctx, outputPath, null, readsFinalRepartitioned, header,
shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE, numPartitions, shardedPartsDir, true, splittingIndexGranularity);
} catch (final IOException e) {
throw new UserException.CouldNotCreateOutputFile(outputPath, "writing failed", e);
}
}
aligner.close();
}
}