/*
 * org.broadinstitute.hellbender.tools.spark.BaseRecalibratorSpark Maven / Gradle / Ivy
 *
 * Show more of this group Show more artifacts with this name
 * Show all versions of gatk Show documentation
 * Development on GATK 4
 * There is a newer version: 4.6.0.0
 */
package org.broadinstitute.hellbender.tools.spark;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.utils.spark.JoinReadsWithVariants;
import org.broadinstitute.hellbender.tools.spark.transforms.BaseRecalibratorSparkFn;
import org.broadinstitute.hellbender.tools.walkers.bqsr.BaseRecalibrator;
import org.broadinstitute.hellbender.utils.SerializableFunction;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.recalibration.BaseRecalibrationEngine;
import org.broadinstitute.hellbender.utils.recalibration.RecalUtils;
import org.broadinstitute.hellbender.utils.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport;
import org.broadinstitute.hellbender.utils.variant.GATKVariant;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

import java.io.PrintStream;
import java.util.List;

/**
 * Spark version of the first pass of the base quality score recalibration.
 * Generates a recalibration table based on various covariates.
 * The default covariates are read group, reported quality score, machine cycle, and nucleotide context.
 *
 * <p>
 * This walker generates tables based on specified covariates.
 * It does a by-locus traversal operating only at sites that are not in the known-sites resource.
 * ExAC, gnomAD, or dbSNP resources can be used as known sites of variation.
 * We assume that all reference mismatches we see are therefore errors and indicative of poor base quality.
 * Since there is a large amount of data one can then calculate an empirical
 * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
 * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score).
 * </p>
 *
 * <h3>Input</h3>
 * <ul>
 *  <li>The input read data whose base quality scores need to be assessed.</li>
 *  <li>A database of known polymorphic sites to skip over.</li>
 * </ul>
 *
 * <h3>Output</h3>
 * <p>
 * A GATK Report file with many tables:
 * </p>
 * <ul>
 *     <li>The list of arguments</li>
 *     <li>The quantized qualities table</li>
 *     <li>The recalibration table by read group</li>
 *     <li>The recalibration table by quality score</li>
 *     <li>The recalibration table for all the optional covariates</li>
 * </ul>
 * <p>
 * The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table.
 * </p>
 *
 * <h3>Examples</h3>
 * <pre>
 * gatk BaseRecalibratorSpark \
 *   -I gs://my-gcs-bucket/my_reads.bam \
 *   -R gs://my-gcs-bucket/reference.fasta \
 *   --known-sites gs://my-gcs-bucket/sites_of_variation.vcf \
 *   --known-sites gs://my-gcs-bucket/another/optional/setOfSitesToMask.vcf \
 *   -O gs://my-gcs-bucket/recal_data.table \
 *   -- \
 *   --sparkRunner GCS \
 *   --cluster my-dataproc-cluster
 * </pre>
 */

@CommandLineProgramProperties(
        summary = BaseRecalibratorSpark.USAGE_SUMMARY,
        oneLineSummary = BaseRecalibratorSpark.USAGE_ONE_LINE_SUMMARY,
        programGroup = ReadDataManipulationProgramGroup.class
)
@DocumentedFeature
@BetaFeature
public class BaseRecalibratorSpark extends GATKSparkTool {
    private static final long serialVersionUID = 1L;

    /** One-line tool description, referenced by the {@code @CommandLineProgramProperties} annotation. */
    static final String USAGE_ONE_LINE_SUMMARY = "Generate recalibration table for Base Quality Score Recalibration (BQSR) on Spark";

    /** Longer tool description, referenced by the {@code @CommandLineProgramProperties} annotation. */
    static final String USAGE_SUMMARY = "First pass of the Base Quality Score Recalibration (BQSR) on Spark." +
            " Generate a recalibration table based on various user-specified covariates " +
            "(such as read group, reported quality score, machine cycle, and nucleotide context).";

    /**
     * @return {@code true}; this tool always requires read input
     */
    @Override
    public boolean requiresReads() { return true; }

    /**
     * @return {@code true}; this tool always requires a reference
     */
    @Override
    public boolean requiresReference() { return true; }

    /**
     * @return the shared {@link BaseRecalibrationEngine#BQSR_REFERENCE_WINDOW_FUNCTION}
     */
    // NOTE(review): type arguments restored from the file's imports (GATKRead, SimpleInterval);
    // confirm against the declaration in GATKSparkTool.
    @Override
    public SerializableFunction<GATKRead, SimpleInterval> getReferenceWindowFunction() {
        return BaseRecalibrationEngine.BQSR_REFERENCE_WINDOW_FUNCTION;
    }

    /**
     * @return the read filter list supplied by {@link BaseRecalibrator#getStandardBQSRReadFilterList()}
     */
    @Override
    public List<ReadFilter> getDefaultReadFilters() {
        return BaseRecalibrator.getStandardBQSRReadFilterList();
    }

    /**
     * Known-sites inputs given on the command line (required argument).
     */
    // NOTE(review): element type String is inferred from how the list is passed to
    // addVCFsForSpark; confirm against that method's signature.
    @Argument(doc = "the known variants", fullName = BaseRecalibrator.KNOWN_SITES_ARG_FULL_NAME, optional = false)
    private List<String> knownVariants;

    /**
     * Destination for the recalibration report (required argument); opened through
     * {@link BucketUtils#createFile(String)} in {@link #runTool(JavaSparkContext)}.
     */
    @Argument(doc = "Path to save the final recalibration tables to.",
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = false)
    private String outputTablesPath = null;

    /**
     * all the command line arguments for BQSR and its covariates
     */
    @ArgumentCollection(doc = "all the command line arguments for BQSR and its covariates")
    private final RecalibrationArgumentCollection bqsrArgs = new RecalibrationArgumentCollection();

    /**
     * Builds the recalibration report from the input reads and the known-sites variants,
     * then writes it to {@code outputTablesPath}.
     *
     * <p>Steps, in order: pass the reference and the known-sites inputs through
     * {@code addReferenceFilesForSpark} / {@code addVCFsForSpark}; join the reads with the
     * known-sites variants; hand the joined data to {@link BaseRecalibratorSparkFn#apply};
     * write the resulting report.</p>
     *
     * @param ctx the Spark context supplied by the framework
     */
    @Override
    protected void runTool( JavaSparkContext ctx ) {
        final String referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
        final List<String> localKnownSitesFilePaths = addVCFsForSpark(ctx, knownVariants);

        // NOTE(review): the value type Iterable<GATKVariant> is reconstructed from the GATKVariant
        // import and the stray '>' left in the stripped source; confirm against JoinReadsWithVariants.join.
        final JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants =
                JoinReadsWithVariants.join(getReads(), localKnownSitesFilePaths);

        final RecalibrationReport bqsrReport =
                BaseRecalibratorSparkFn.apply(readsWithVariants, getHeaderForReads(), referenceFileName, bqsrArgs);

        // try-with-resources closes the report stream even if writing the report throws.
        try ( final PrintStream reportStream = new PrintStream(BucketUtils.createFile(outputTablesPath)) ) {
            RecalUtils.outputRecalibrationReport(reportStream, bqsrArgs, bqsrReport.getQuantizationInfo(), bqsrReport.getRecalibrationTables(), bqsrReport.getCovariates());
        }
    }
}