// Source artifact: org.broadinstitute.hellbender.tools.spark.pipelines.BQSRPipelineSpark (Maven / Gradle / Ivy — newest version)
package org.broadinstitute.hellbender.tools.spark.pipelines;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.utils.spark.JoinReadsWithVariants;
import org.broadinstitute.hellbender.tools.ApplyBQSRUniqueArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.transforms.ApplyBQSRSparkFn;
import org.broadinstitute.hellbender.tools.spark.transforms.BaseRecalibratorSparkFn;
import org.broadinstitute.hellbender.tools.walkers.bqsr.BaseRecalibrator;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport;
import org.broadinstitute.hellbender.utils.variant.GATKVariant;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import java.util.List;
/**
* The full BQSR pipeline in one tool to run on Spark.
* The final result is analysis-ready reads.
* This runs BaseRecalibrator and then ApplyBQSR to give a BAM with recalibrated base qualities.
*
*
* Input
*
* - A BAM or CRAM file containing input read data
* - A database of known polymorphic sites to skip over.
*
*
* Output
* A BAM or CRAM file containing the recalibrated read data
*
* Usage example
*
* gatk BQSRPipelineSpark \
* -R gs://my-gcs-bucket/reference.fasta \
* -I gs://my-gcs-bucket/input.bam \
* --known-sites gs://my-gcs-bucket/sites_of_variation.vcf \
* --known-sites gs://my-gcs-bucket/another/optional/setOfSitesToMask.vcf \
* -O gs://my-gcs-bucket/output.bam \
* -- \
* --sparkRunner GCS \
* --cluster my-dataproc-cluster
*
*/
@CommandLineProgramProperties(
        summary = BQSRPipelineSpark.USAGE_SUMMARY,
        oneLineSummary = BQSRPipelineSpark.USAGE_ONE_LINE_SUMMARY,
        usageExample = "BQSRPipelineSpark -I in.bam --known-sites in.vcf -O out.bam",
        programGroup = ReadDataManipulationProgramGroup.class
)
@DocumentedFeature
@BetaFeature
public final class BQSRPipelineSpark extends GATKSparkTool {
    private static final long serialVersionUID = 1L;

    static final String USAGE_ONE_LINE_SUMMARY = "Both steps of BQSR (BaseRecalibrator and ApplyBQSR) on Spark";
    static final String USAGE_SUMMARY = "This tool performs 2 steps of BQSR - " +
            "creation of recalibration tables and rewriting of the bam, " +
            "without writing the tables to disk. ";

    @Override
    public boolean requiresReads() { return true; }

    @Override
    public boolean requiresReference() { return true; }

    /** Paths to VCFs of known polymorphic sites; reads overlapping these sites are skipped when building recalibration tables. */
    @Argument(doc = "the known variants", fullName = BaseRecalibrator.KNOWN_SITES_ARG_FULL_NAME, optional = false)
    protected List<String> knownVariants;

    /** Destination for the recalibrated reads. */
    @Argument(doc = "the output bam", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = false)
    protected String output;

    /**
     * all the command line arguments for BQSR and its covariates
     */
    @ArgumentCollection(doc = "all the command line arguments for BQSR and its covariates")
    private final RecalibrationArgumentCollection bqsrArgs = new RecalibrationArgumentCollection();

    /**
     * command-line arguments to fine tune the apply BQSR step.
     */
    @ArgumentCollection
    public ApplyBQSRUniqueArgumentCollection applyBqsrArgs = new ApplyBQSRUniqueArgumentCollection();

    /**
     * Runs the two-step BQSR pipeline: builds recalibration tables from reads joined with known
     * variant sites, broadcasts the resulting report, applies it to the (unfiltered) input reads,
     * and writes the recalibrated reads to {@link #output}.
     *
     * @param ctx the Spark context for this tool invocation
     */
    @Override
    protected void runTool(final JavaSparkContext ctx) {
        final String referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
        final List<String> localKnownSitesFilePaths = addVCFsForSpark(ctx, knownVariants);

        //Should this get the getUnfilteredReads? getReads will merge default and command line filters.
        //but the code below uses other filters for other parts of the pipeline that do not honor
        //the commandline.
        final JavaRDD<GATKRead> initialReads = getReads();

        // The initial reads have already had the WellformedReadFilter applied to them, which
        // is all the filtering that ApplyBQSR wants. BQSR itself wants additional filtering
        // performed, so we do that here.
        //NOTE: this filter doesn't honor enabled/disabled commandline filters
        final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
        final JavaRDD<GATKRead> filteredReadsForBQSR = initialReads.filter(read -> bqsrReadFilter.test(read));

        final JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants = JoinReadsWithVariants.join(filteredReadsForBQSR, localKnownSitesFilePaths);

        //note: we use the reference dictionary from the reads themselves.
        final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(readsWithVariants, getHeaderForReads(), referenceFileName, bqsrArgs);

        // Broadcast the (relatively small) report once rather than shipping it with every task.
        final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
        // ApplyBQSR runs on the *initial* reads (not the BQSR-filtered subset) so no reads are dropped from the output.
        final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(initialReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs));
        writeReads(ctx, output, finalReads);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy