// Source artifact: org.broadinstitute.hellbender.tools.spark.pipelines.BQSRPipelineSpark (Maven / Gradle / Ivy — newest version)
package org.broadinstitute.hellbender.tools.spark.pipelines;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.utils.spark.JoinReadsWithVariants;
import org.broadinstitute.hellbender.tools.ApplyBQSRUniqueArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.transforms.ApplyBQSRSparkFn;
import org.broadinstitute.hellbender.tools.spark.transforms.BaseRecalibratorSparkFn;
import org.broadinstitute.hellbender.tools.walkers.bqsr.BaseRecalibrator;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport;
import org.broadinstitute.hellbender.utils.variant.GATKVariant;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import java.util.List;
/**
* The full BQSR pipeline in one tool to run on Spark.
* The final result is analysis-ready reads.
* This runs BaseRecalibrator and then ApplyBQSR to give a BAM with recalibrated base qualities.
*
*
* Input
*
* - A BAM or CRAM file containing input read data
* - A database of known polymorphic sites to skip over.
*
*
* Output
* A BAM or CRAM file containing the recalibrated read data
*
* Usage example
*
* gatk BQSRPipelineSpark \
* -R gs://my-gcs-bucket/reference.fasta \
* -I gs://my-gcs-bucket/input.bam \
* --known-sites gs://my-gcs-bucket/sites_of_variation.vcf \
* --known-sites gs://my-gcs-bucket/another/optional/setOfSitesToMask.vcf \
* -O gs://my-gcs-bucket/output.bam \
* -- \
* --sparkRunner GCS \
* --cluster my-dataproc-cluster
*
*/
@CommandLineProgramProperties(
        summary = BQSRPipelineSpark.USAGE_SUMMARY,
        oneLineSummary = BQSRPipelineSpark.USAGE_ONE_LINE_SUMMARY,
        usageExample = "BQSRPipelineSpark -I in.bam --known-sites in.vcf -O out.bam",
        programGroup = ReadDataManipulationProgramGroup.class
)
@DocumentedFeature
@BetaFeature
public final class BQSRPipelineSpark extends GATKSparkTool {
    private static final long serialVersionUID = 1L;

    static final String USAGE_ONE_LINE_SUMMARY = "Both steps of BQSR (BaseRecalibrator and ApplyBQSR) on Spark";
    static final String USAGE_SUMMARY = "This tool performs 2 steps of BQSR - " +
            "creation of recalibration tables and rewriting of the bam, " +
            "without writing the tables to disk. ";

    @Override
    public boolean requiresReads() { return true; }

    @Override
    public boolean requiresReference() { return true; }

    /** Paths to VCFs of known polymorphic sites; reads overlapping these sites are skipped when building recalibration tables. */
    @Argument(doc = "the known variants", fullName = BaseRecalibrator.KNOWN_SITES_ARG_FULL_NAME, optional = false)
    protected List<String> knownVariants;

    /** Destination for the recalibrated reads. */
    @Argument(doc = "the output bam", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = false)
    protected String output;

    /**
     * all the command line arguments for BQSR and its covariates
     */
    @ArgumentCollection(doc = "all the command line arguments for BQSR and its covariates")
    private final RecalibrationArgumentCollection bqsrArgs = new RecalibrationArgumentCollection();

    /**
     * command-line arguments to fine tune the apply BQSR step.
     */
    @ArgumentCollection
    public ApplyBQSRUniqueArgumentCollection applyBqsrArgs = new ApplyBQSRUniqueArgumentCollection();

    /**
     * Runs the two-step BQSR pipeline: builds recalibration tables from reads joined with known
     * variant sites, broadcasts the resulting report, applies it to the (unfiltered) input reads,
     * and writes the recalibrated reads to {@link #output}.
     *
     * @param ctx the Spark context for this tool invocation
     */
    @Override
    protected void runTool(final JavaSparkContext ctx) {
        final String referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
        final List<String> localKnownSitesFilePaths = addVCFsForSpark(ctx, knownVariants);

        //Should this get the getUnfilteredReads? getReads will merge default and command line filters.
        //but the code below uses other filters for other parts of the pipeline that do not honor
        //the commandline.
        final JavaRDD<GATKRead> initialReads = getReads();

        // The initial reads have already had the WellformedReadFilter applied to them, which
        // is all the filtering that ApplyBQSR wants. BQSR itself wants additional filtering
        // performed, so we do that here.
        //NOTE: this filter doesn't honor enabled/disabled commandline filters
        final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
        final JavaRDD<GATKRead> filteredReadsForBQSR = initialReads.filter(read -> bqsrReadFilter.test(read));

        final JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants = JoinReadsWithVariants.join(filteredReadsForBQSR, localKnownSitesFilePaths);

        //note: we use the reference dictionary from the reads themselves.
        final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(readsWithVariants, getHeaderForReads(), referenceFileName, bqsrArgs);

        // Broadcast the (relatively small) report once rather than shipping it with every task.
        final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
        // ApplyBQSR runs on the *initial* reads (not the BQSR-filtered subset) so no reads are dropped from the output.
        final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(initialReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs));
        writeReads(ctx, output, finalReads);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy