All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.consensus.DownsampleByDuplicateSet Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.consensus;

import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.argparser.WorkflowOutput;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.DuplicateSetWalker;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

import java.util.Random;

/**
 * Given a bam grouped by the same unique molecular identifier (UMI), this tool drops a specified fraction of duplicate sets and returns a new bam.
 * A duplicate set refers to a group of reads whose fragments start and end at the same genomic coordinate _and_ share the same UMI.
 *
 * The input bam must first be sorted by UMI using FGBio GroupReadsByUmi (http://fulcrumgenomics.github.io/fgbio/tools/latest/GroupReadsByUmi.html).
 *
 * Use this tool to create, for instance, an insilico mixture of duplex-sequenced samples to simulate tumor subclones.
 *
 * Suppose you wish to simulate a tumor sample in which 5% cells share a common set of somatic mutations
 * in addition to ones common to the entire cell population.
 *
 * If you randomly drop 5% of reads in sample A and 95% of reads in sample B and merge the reduced bams,
 * the resulting mixture skews the family-size distribution to the left. Here the family size refers to the
 * number of sequenced duplicate reads that share the same UMI.
 *
 * To see this, take a cancer sample, in which 5% of cells share a unique set of somatic mutations,
 * that was processed with duplex-UMIs (i.e. UMIs on both adapters) and high rounds of PCR. Suppose we have the sequence-ready
 * libraries of this sample attached to and amplified on the flowcell. Now, sort the flowcell lawn such that the reads from the above
 * 5% of the cell population moves near the top of the flowcell. This population must have the same family-size distribution as
 * the rest of the flowcell, at about 5% of the library complexity compared to the entire flowcell.
 *
 * Now imagine replacing this population with 5% ramdonly chosen from the *entire* flowcell of another sample that was prepared and sequenced similarly.
 * The library complexity of these "graft" reads is higher than that of the original, and, consequently, with other parameters
 * such as the number of PCR cycles and sequencing depth fixed, its family distribution would be skewed left---that is, the family size
 * would be smaller than it should be.
 *
 * This tool will help address the above problem by dropping a set fraction of _molecules_, or duplicate sets, rather than reads, at random.
 *
 * Example Usage:
 *
 * Keep 95 percent of the molecules.
 *
 * gatk DownsampleByDuplicateSet \ \
 * -I umiGrouped.bam \
 * --fraction-to-keep 0.95 \
 * -O umiGrouped_0.95.bam
 **/
@CommandLineProgramProperties(
        summary = "Discard a set fraction of duplicate sets from a UMI-grouped bam",
        oneLineSummary = "Discard a set fraction of duplicate sets from a UMI-grouped bam",
        programGroup = ReadDataManipulationProgramGroup.class
)
@BetaFeature
@WorkflowProperties
public class DownsampleByDuplicateSet extends DuplicateSetWalker {
    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output BAM file")
    @WorkflowOutput(optionalCompanions = {StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION})
    public GATKPath outputBam;

    public static final String FRACTION_TO_KEEP_NAME = "fraction-to-keep";
    @Argument(fullName = FRACTION_TO_KEEP_NAME, doc = "This fraction of molecules in the input bam will be retained", minValue = 0.0, maxValue = 1.0)
    public double fractionToKeep;

    private static final int RANDOM_SEED = 142;
    private final Random rng = new Random(RANDOM_SEED);
    private int numDuplicateReadSets;
    private int numReads;
    private SAMFileGATKReadWriter outputWriter;

    @Override
    public void onTraversalStart() {
        outputWriter = createSAMWriter(outputBam, false);
    }

    @Override
    public void apply(ReadsWithSameUMI readsWithSameUMI, ReferenceContext referenceContext, FeatureContext featureContext) {
        if (rng.nextDouble() < fractionToKeep){
            readsWithSameUMI.getReads().forEach(r -> outputWriter.addRead(r));
            numReads += readsWithSameUMI.getReads().size();
            numDuplicateReadSets += 1;
        }
    }

    @Override
    public Object onTraversalSuccess(){
        outputWriter.close();
        logger.info(String.format("Wrote %d reads", numReads));
        logger.info(String.format("Wrote %d duplicate read sets", numDuplicateReadSets));
        return "SUCCESS";
    }

    @Override
    public void closeTool(){
        if (outputWriter != null) {
            outputWriter.close();
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy