org.broadinstitute.hellbender.tools.spark.ApplyBQSRSpark Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.spark;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.tools.ApplyBQSRArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.transforms.ApplyBQSRSparkFn;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

/**
 * Apply base quality score recalibration with Spark.
 * 
 * 
 * <p>This tool performs the second pass in a two-stage process called Base Quality Score Recalibration (BQSR).
 * Specifically, it recalibrates the base qualities of the input reads based on the recalibration table produced by
 * the BaseRecalibrator tool, and outputs a recalibrated BAM or CRAM file.</p>
 * See Tutorial#10060
 * for an example of how to set up and run a Spark tool on a cloud Spark cluster.
 * 
 * <h3>Usage examples</h3>
 * <pre>
 * gatk ApplyBQSRSpark \
 * -I gs://my-gcs-bucket/input.bam \
 * -bqsr gs://my-gcs-bucket/recalibration.table \
 * -O gs://my-gcs-bucket/output.bam \
 * -- \
 * --sparkRunner GCS \
 * --cluster my-dataproc-cluster
 * </pre>
 * 
 * <p>To additionally bin base qualities:</p>
 * <pre>
 * gatk ApplyBQSRSpark \
 * -I gs://my-gcs-bucket/input.bam \
 * -bqsr gs://my-gcs-bucket/recalibration.table \
 * --static-quantized-quals 10 --static-quantized-quals 20 \
 * --static-quantized-quals 30 --static-quantized-quals 40 \
 * -O gs://my-gcs-bucket/output.bam \
 * -- \
 * --sparkRunner GCS \
 * --cluster my-dataproc-cluster
 * </pre>
 */

@CommandLineProgramProperties(
        summary = ApplyBQSRSpark.USAGE_SUMMARY,
        oneLineSummary = ApplyBQSRSpark.USAGE_ONE_LINE_SUMMARY,
        programGroup = ReadDataManipulationProgramGroup.class
)
@DocumentedFeature
@BetaFeature
public final class ApplyBQSRSpark extends GATKSparkTool {
    private static final long serialVersionUID = 0L;

    /** One-line description referenced by the {@code @CommandLineProgramProperties} annotation on this class. */
    static final String USAGE_ONE_LINE_SUMMARY = "Apply base quality score recalibration on Spark";

    /** Longer description referenced by the {@code @CommandLineProgramProperties} annotation on this class. */
    static final String USAGE_SUMMARY = "Apply a linear base quality recalibration model trained with the BaseRecalibrator tool on Spark.";

    /**
     * Declares that this tool needs read input.
     *
     * @return always {@code true}
     */
    @Override
    public boolean requiresReads() {
        return true;
    }

    /** Destination handed to {@code writeReads} for the recalibrated reads; required argument. */
    @Argument(doc = "the output bam", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = false)
    private String output;

    /**
     * Enables recalibration of base qualities.
     * The covariates tables are produced by the BaseRecalibrator tool.
     * Please be aware that you should only run recalibration with the covariates file created on the same input bam(s).
     */
    @Argument(fullName = StandardArgumentDefinitions.BQSR_TABLE_LONG_NAME,
            shortName = StandardArgumentDefinitions.BQSR_TABLE_SHORT_NAME,
            doc = "Input covariates table file for base quality score recalibration")
    private String bqsrRecalFile;

    /** Additional ApplyBQSR options, passed unchanged to {@link ApplyBQSRSparkFn#apply}. */
    @ArgumentCollection
    private ApplyBQSRArgumentCollection applyBQSRArgs = new ApplyBQSRArgumentCollection();

    /**
     * Loads the recalibration report named by {@code bqsrRecalFile}, broadcasts it through the
     * Spark context, applies it to the input reads with {@link ApplyBQSRSparkFn#apply}, and writes
     * the resulting reads to {@code output}.
     *
     * @param ctx the Spark context used to broadcast the report and to write the reads
     */
    @Override
    protected void runTool(final JavaSparkContext ctx) {
        // Parameterized types (instead of raw JavaRDD/Broadcast) keep element types checked at compile time.
        final JavaRDD<GATKRead> initialReads = getReads();

        // The report is opened via BucketUtils.openFile and parsed here, before being broadcast.
        final RecalibrationReport recalibrationReport = new RecalibrationReport(BucketUtils.openFile(bqsrRecalFile));
        final Broadcast<RecalibrationReport> recalibrationReportBroadCast = ctx.broadcast(recalibrationReport);

        final JavaRDD<GATKRead> recalibratedReads =
                ApplyBQSRSparkFn.apply(initialReads, recalibrationReportBroadCast, getHeaderForReads(), applyBQSRArgs);
        writeReads(ctx, output, recalibratedReads);
    }
}