All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection Maven / Gradle / Ivy

There is a newer version: 4.6.0.0
Show newest version
package org.broadinstitute.hellbender.engine.spark;

import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.Hidden;

import java.io.Serializable;

public class AssemblyRegionArgumentCollection implements Serializable {
    public static final String ASSEMBLY_REGION_OUT_LONG_NAME = "assembly-region-out";
    public static final String FORCE_ACTIVE_REGIONS_LONG_NAME = "force-active";
    private static final long serialVersionUID = 1L;

    public static final String MIN_ASSEMBLY_LONG_NAME = "min-assembly-region-size";
    public static final String MAX_ASSEMBLY_LONG_NAME = "max-assembly-region-size";
    public static final String ASSEMBLY_PADDING_LONG_NAME = "assembly-region-padding";
    public static final String MAX_STARTS_LONG_NAME = "max-reads-per-alignment-start";
    public static final String THRESHOLD_LONG_NAME = "active-probability-threshold";
    public static final String PROPAGATION_LONG_NAME = "max-prob-propagation-distance";

    public static final int DEFAULT_MIN_ASSEMBLY_REGION_SIZE = 50;
    public static final int DEFAULT_MAX_ASSEMBLY_REGION_SIZE = 300;
    public static final int DEFAULT_ASSEMBLY_REGION_PADDING = 100;
    public static final int DEFAULT_MAX_READS_PER_ALIGNMENT = 50;
    public static final double DEFAULT_ACTIVE_PROB_THRESHOLD = 0.002;
    public static final int DEFAULT_MAX_PROB_PROPAGATION_DISTANCE = 50;
    public static final String INDEL_PADDING_LONG_NAME = "padding-around-indels";
    public static final String SNP_PADDING_LONG_NAME = "padding-around-snps";
    public static final String STR_PADDING_LONG_NAME = "padding-around-strs";

    /**
     * The following parameters can be confusing due to the overlap between active regions, assembly regions, and genotyping regions,
     * all of which are implemented as the {@link org.broadinstitute.hellbender.engine.AssemblyRegion} class.
     *
     * An active region is a genomic interval in which we call variants.  In order to output phased calls we try to put nearby variants
     * in the same active region.  The size of active regions are determined by the variants present, {@link AssemblyRegionArgumentCollection.activeProbThreshold},
     * and {@link AssemblyRegionArgumentCollection.maxProbPropagationDistance}, and are bounded by the parameters {@link AssemblyRegionArgumentCollection.minAssemblyRegionSize}
     * and {@link AssemblyRegionArgumentCollection.maxAssemblyRegionSize}.  Importantly, active regions define responsibility for variant calling
     * and have nothing to do with the size of the span over which we perform local assembly and pair-HMM.  That is, a variant may belong to two
     * overlapping assembly or genotyping regions but it only belongs to a single active region.
     *
     * An assembly region is the padded interval surrounding an active region over which we perform local assembly.  Padding is useful to phase with
     * any variation that may be just outside the active region, to avoid dangling ends in the assembly region, and to resolve indels.  The
     * {@code assemblyRegionPadding} parameter determines the number of extra bases an assembly region contains on either side of the active region
     * it surrounds.
     *
     * A genotyping region is an interval surrounding an active region in which we perform pair-HMM and Smith-Waterman alignment.  Even though
     * we ultimately only genotype and call variants within the active region, we call entire haplotypes as an intermediate step, hence the
     * needed for an expanded genotyping region.  The parameters {@link AssemblyRegionArgumentCollection.snpPaddingForGenotyping},
     * {@link AssemblyRegionArgumentCollection.snpPaddingForGenotyping} and {@link AssemblyRegionArgumentCollection.indelPaddingForGenotyping} determine
     * the size of genotyping regions.
     *
     * The overall flow is as follows:
     *
     * 1) Active regions are found by triaging sites for activity and combining nearby potential variants into active regions.
     * 2) Active regions are padded for assembly and overlapping reads are hard-clipped to this padded assembly region.
     * 3) Reads are assembled into haplotypes.
     * 4) The active region is re-set to the span of all variants found in assembly that overlap the original active region.
     *    This shrinking does not lose variants but it does affect the size of the genotyping region.
     * 5) The genotyping region is determined by padding the shrunk active region.
     * 6) Assembled haplotypes are trimmed and reads are hard-clipped (again) to fit the padded genotyping region.
     */


    /**
     * Parameters that control active regions
     */

    @Argument(fullName = MIN_ASSEMBLY_LONG_NAME, doc = "Minimum size of an assembly region", optional = true)
    public int minAssemblyRegionSize = defaultMinAssemblyRegionSize();

    @Argument(fullName = MAX_ASSEMBLY_LONG_NAME, doc = "Maximum size of an assembly region", optional = true)
    public int maxAssemblyRegionSize = defaultMaxAssemblyRegionSize();

    @Advanced
    @Argument(fullName = THRESHOLD_LONG_NAME, doc="Minimum probability for a locus to be considered active.", optional = true)
    public double activeProbThreshold = defaultActiveProbThreshold();

    @Advanced
    @Argument(fullName = PROPAGATION_LONG_NAME, doc="Upper limit on how many bases away probability mass can be moved around when calculating the boundaries between active and inactive assembly regions", optional = true)
    public int maxProbPropagationDistance = defaultMaxProbPropagationDistance();

    @Advanced
    @Argument(fullName = FORCE_ACTIVE_REGIONS_LONG_NAME, doc = "If provided, all regions will be marked as active", optional = true)
    public boolean forceActive = false;

    /**
     * Parameters that control assembly regions
     */

    @Argument(fullName = ASSEMBLY_PADDING_LONG_NAME, doc = "Number of additional bases of context to include around each assembly region", optional = true)
    public int assemblyRegionPadding = defaultAssemblyRegionPadding();

    /**
     * Parameters that control genotyping regions
     */

    @Hidden
    @Argument(fullName= INDEL_PADDING_LONG_NAME, doc = "Include at least this many bases around an event for calling indels", optional = true)
    public int indelPaddingForGenotyping = 75;

    @Hidden
    @Argument(fullName= SNP_PADDING_LONG_NAME, doc = "Include at least this many bases around an event for calling snps", optional = true)
    public int snpPaddingForGenotyping = 20;

    @Hidden
    @Argument(fullName= STR_PADDING_LONG_NAME, doc = "Include at least this many bases around an event for calling STR indels", optional = true)
    public int strPaddingForGenotyping = 75;


    /**
     * Other parameters
     */

    @Argument(fullName = MAX_STARTS_LONG_NAME, doc = "Maximum number of reads to retain per alignment start position. Reads above this threshold will be downsampled. Set to 0 to disable.", optional = true)
    public int maxReadsPerAlignmentStart = defaultMaxReadsPerAlignmentStart();

    /**
     * @return Default value for the {@link #minAssemblyRegionSize} parameter, if none is provided on the command line
     */
    protected int defaultMinAssemblyRegionSize() { return DEFAULT_MIN_ASSEMBLY_REGION_SIZE; }

    /**
     * @return Default value for the {@link #maxAssemblyRegionSize} parameter, if none is provided on the command line
     */
    protected int defaultMaxAssemblyRegionSize() { return DEFAULT_MAX_ASSEMBLY_REGION_SIZE; }

    /**
     * @return Default value for the {@link #assemblyRegionPadding} parameter, if none is provided on the command line
     */
    protected int defaultAssemblyRegionPadding() { return DEFAULT_ASSEMBLY_REGION_PADDING; }

    /**
     * @return Default value for the {@link #maxReadsPerAlignmentStart} parameter, if none is provided on the command line
     */
    protected int defaultMaxReadsPerAlignmentStart() { return DEFAULT_MAX_READS_PER_ALIGNMENT; }

    /**
     * @return Default value for the {@link #activeProbThreshold} parameter, if none is provided on the command line
     */
    protected double defaultActiveProbThreshold() { return DEFAULT_ACTIVE_PROB_THRESHOLD; }

    /**
     * @return Default value for the {@link #maxProbPropagationDistance} parameter, if none is provided on the command line
     */
    protected int defaultMaxProbPropagationDistance() { return DEFAULT_MAX_PROB_PROPAGATION_DISTANCE; }

    public void validate() {
        if ( minAssemblyRegionSize <= 0 || maxAssemblyRegionSize <= 0 ) {
            throw new CommandLineException.BadArgumentValue("min/max assembly region size must be > 0");
        }

        if ( minAssemblyRegionSize > maxAssemblyRegionSize ) {
            throw new CommandLineException.BadArgumentValue("minAssemblyRegionSize must be <= maxAssemblyRegionSize");
        }

        if ( assemblyRegionPadding < 0 ) {
            throw new CommandLineException.BadArgumentValue("assemblyRegionPadding must be >= 0");
        }

        if ( maxReadsPerAlignmentStart < 0 ) {
            throw new CommandLineException.BadArgumentValue("maxReadsPerAlignmentStart must be >= 0");
        }

        if ( snpPaddingForGenotyping < 0 ) {
            throw new CommandLineException.BadArgumentValue("paddingAroundSNPs", "" + snpPaddingForGenotyping + "< 0");
        }

        if ( indelPaddingForGenotyping < 0 ) {
            throw new CommandLineException.BadArgumentValue("paddingAroundIndels", "" + indelPaddingForGenotyping + "< 0");
        }
    }


}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy