org.broadinstitute.hellbender.tools.walkers.rnaseq.ASEReadCounter Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
There is a newer version: 4.6.0.0
package org.broadinstitute.hellbender.tools.walkers.rnaseq;


import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.samtools.SAMFileHeader;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.AlignmentContext;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.LocusWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;

/**
 * Calculate read counts per allele for allele-specific expression analysis of RNAseq data
 * 
 * 

 * This tool calculates allele counts at a set of positions after applying filters that are tuned for enabling
 * allele-specific expression (ASE) analysis of RNAseq data. The filters operate on mapping quality, base quality, depth of coverage,
 * overlapping paired reads and deletions overlapping the position. All thresholds and options are controlled by
 * command-line arguments.
 * 
 * 
 * 
Input
 * 
 * BAM files (with proper headers) to be analyzed for ASE
 * A VCF file with specific sites to process.
 * 
 * 
 * 
Output
 * 
 * A table of allele counts at the given sites. By default, it is formatted as a tab-delimited text file
 * that is readable by R and compatible with Mamba,
 * a downstream tool developed for allele-specific expression analysis.
 * 
 * Usage Example
 * 
 *     gatk ASEReadCounter \
 *     -R Homo_sapiens_assembly38.fasta \
 *     -I input.bam \
 *     -V sites.vcf.gz \
 *     -O output.table
 * 
 * 
 * 
Note
 * 
 * Like most GATK tools, this tools filters out duplicate reads by default. However, some ASE methods
 * recommend including duplicate reads in the analysis, so the DuplicateRead filter can be disabled using the
 * "-DF NotDuplicateReadFilter" flag in the command-line.
 * 
 * Caveat
 * 
 * This tool will only process biallelic het SNP sites. If your callset contains multiallelic sites, they will be ignored.
 * Optionally, you can subset your callset to just biallelic variants using e.g.
 * SelectVariants
 * with the option "-restrictAllelesTo BIALLELIC".
 * 
 * 
 * For more details see Castel, S. et al. Tools and Best Practices for allelic expression analysis.
 *
 * @author Ami Levy Moonshine
 */
@DocumentedFeature
@CommandLineProgramProperties(
        summary = "Counts filtered reads at het sites for allele specific expression estimate",
        oneLineSummary = "Generates table of filtered base counts at het sites for allele specific expression",
        programGroup = CoverageAnalysisProgramGroup.class
)
public class ASEReadCounter extends LocusWalker {

    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output file (if not provided, defaults to STDOUT)", optional = true)
    private File outputFile = null;

    @Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME, shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME, doc = "One or more VCF files")
    private List> variants;

    /**
     * If this argument is enabled, loci with total depth lower than this threshold after all filters have been applied
     * will be skipped. This can be set to -1 by default to disable the evaluation and ignore this threshold.
     */
    @Argument(fullName = "min-depth-of-non-filtered-base", shortName = "min-depth", doc = "Minimum number of bases that pass filters", optional = true)
    public int minDepthOfNonFilteredBases = -1;

    /**
     * If this argument is enabled, reads with mapping quality values lower than this threshold will not be counted.
     * This can be set to -1 by default to disable the evaluation and ignore this threshold.
     */
    @Argument(fullName = "min-mapping-quality", shortName = "mmq", doc = "Minimum read mapping quality", optional = true)
    public int minMappingQuality = 0;

    /**
     * If this argument is enabled, bases with quality scores lower than this threshold will not be counted.
     * This can be set to -1 by default to disable the evaluation and ignore this threshold.
     */
    @Argument(fullName = "min-base-quality", shortName = "mbq", doc = "Minimum base quality", optional = true)
    public byte minBaseQuality = 0;

    /**
     * These options modify how the tool deals with overlapping read pairs. The default value is COUNT_FRAGMENTS_REQUIRE_SAME_BASE.
     */
    @Argument(fullName = "count-overlap-reads-handling", shortName = "overlap", doc = "Handling of overlapping reads from the same fragment", optional = true)
    public CountPileupType countType = CountPileupType.COUNT_FRAGMENTS_REQUIRE_SAME_BASE;

    /**
     * Available options are csv, table, rtable. By default, the format is rtable (an r-readable table).
     */
    @Argument(fullName = "output-format", doc = "Format of the output file", optional = true)
    public OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.RTABLE;

    public String separator = "\t";

    public enum OUTPUT_FORMAT {
        TABLE,
        RTABLE,
        CSV
    }

    public enum CountPileupType {
        /**
         * Count all reads independently (even if from the same fragment).
         */
        COUNT_READS,
        /**
         * Count all fragments (even if the reads that compose the fragment are not consistent at that base).
         */
        COUNT_FRAGMENTS,
        /**
         * Count all fragments (but only if the reads that compose the fragment are consistent at that base).
         */
        COUNT_FRAGMENTS_REQUIRE_SAME_BASE
    }

    private PrintStream outputStream = null;

    @Override
    public List getDefaultReadFilters() {
        final List defaultFilters = new ArrayList<>();
        defaultFilters.add(ReadFilterLibrary.VALID_ALIGNMENT_START);
        defaultFilters.add(ReadFilterLibrary.VALID_ALIGNMENT_END);
        defaultFilters.add(ReadFilterLibrary.HAS_READ_GROUP);
        defaultFilters.add(ReadFilterLibrary.HAS_MATCHING_BASES_AND_QUALS);
        defaultFilters.add(ReadFilterLibrary.SEQ_IS_STORED);
        defaultFilters.add(ReadFilterLibrary.NOT_DUPLICATE);
        defaultFilters.add(ReadFilterLibrary.NOT_SECONDARY_ALIGNMENT);
        defaultFilters.add(new ReadFilterLibrary.MappedReadFilter());
        return defaultFilters;
    }

    @Override
    public void onTraversalStart() {
        try {
            outputStream = outputFile != null ? new PrintStream(outputFile) : System.out;
        } catch (FileNotFoundException e) {
            throw new UserException.CouldNotCreateOutputFile(outputFile, e);
        }

        if (outputFormat.equals(OUTPUT_FORMAT.CSV)) {
            separator = ",";
        }
        final String header = "contig" + separator + "position" + separator + "variantID" + separator + "refAllele" + separator + "altAllele" + separator + "refCount" + separator + "altCount" + separator + "totalCount" + separator + "lowMAPQDepth" + separator + "lowBaseQDepth" + separator + "rawDepth" + separator + "otherBases" + separator + "improperPairs";
        outputStream.println(header);
    }

    @Override
    public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) {
        final String contig = alignmentContext.getContig();
        final long position = alignmentContext.getPosition();

        final char refAllele = (char) referenceContext.getBase();

        final List VCs = featureContext.getValues(variants);
        if (VCs != null && VCs.size() > 1) {
            throw new UserException("More then one variant context at position: " + contig + ":" + position);
        }
        if (VCs == null || VCs.isEmpty()) {
            return;
        }

        final VariantContext vc = VCs.get(0);
        if (!vc.isBiallelic()) {
            logger.warn("Ignoring site: cannot run ASE on non-biallelic sites: " + vc.toString());
            return;
        }

        if (vc.getHetCount() < 1) {
            logger.warn("Ignoring site: variant is not het at postion: " + contig + ":" + position);
            return;
        }

        if (vc.getNAlleles() == 1 || vc.getAlternateAllele(0).getBases().length == 0) {
            throw new UserException("The file of variant sites must contain heterozygous sites and cannot be a GVCF file containing  alleles.");
        }

        final char altAllele = (char) vc.getAlternateAllele(0).getBases()[0];

        final String siteID = vc.getID();
        final ReadPileup pileup = filterPileup(alignmentContext.getBasePileup(), countType);

        // count up the depths of all and QC+ bases
        final String line = calculateLineForSite(pileup, siteID, refAllele, altAllele);
        if (line != null) {
            outputStream.println(line);
        }
    }

    @Override
    public void closeTool() {
        if (outputStream != null)
            outputStream.close();
    }

    private ReadPileup filterPileup(final ReadPileup originalPileup, final CountPileupType countType) {
        SAMFileHeader header = getHeaderForReads();

        final ReadPileup pileupWithDeletions;
        switch (countType) {
            case COUNT_FRAGMENTS_REQUIRE_SAME_BASE: pileupWithDeletions = originalPileup.getOverlappingFragmentFilteredPileup(true, ReadPileup.baseQualTieBreaker, header);
                break;
            case COUNT_READS: pileupWithDeletions = originalPileup;
                break;
            case COUNT_FRAGMENTS: pileupWithDeletions = originalPileup.getOverlappingFragmentFilteredPileup(false, ReadPileup.baseQualTieBreaker, header);
                break;
            default: throw new UserException("Must use valid CountPileupType");
        }

        return pileupWithDeletions.makeFilteredPileup(p -> !p.isDeletion());

    }

    private String calculateLineForSite(final ReadPileup pileup, final String siteID, final char refAllele, final char altAllele) {

        int rawDepth = 0, lowBaseQDepth = 0, lowMAPQDepth = 0, refCount = 0, altCount = 0, totalNonFilteredCount = 0, otherBasesCount = 0, improperPairsCount = 0;

        for (final PileupElement base : pileup) {
            rawDepth++;

            if (base.getRead().isPaired() && (base.getRead().mateIsUnmapped() || !base.getRead().isProperlyPaired())) {
                improperPairsCount++;
                continue;
            }
            if (base.getMappingQual() < minMappingQuality) {
                lowMAPQDepth++;
                continue;
            }

            if (base.getQual() < minBaseQuality) {
                lowBaseQDepth++;
                continue;
            }

            if (base.getBase() == refAllele) {
                refCount++;
            }
            else if (base.getBase() == altAllele) {
                altCount++;
            }
            else {
                otherBasesCount++;
                continue;
            }
            totalNonFilteredCount++;
        }

        if (totalNonFilteredCount < minDepthOfNonFilteredBases) {
            return null;
        }

        final StringBuilder line = new StringBuilder();
        line.append(pileup.getLocation().getContig()).append(separator);
        line.append(pileup.getLocation().getStart()).append(separator);
        line.append(siteID).append(separator);
        line.append(refAllele).append(separator);
        line.append(altAllele).append(separator);
        line.append(refCount).append(separator);
        line.append(altCount).append(separator);
        line.append(totalNonFilteredCount).append(separator);
        line.append(lowMAPQDepth).append(separator);
        line.append(lowBaseQDepth).append(separator);
        line.append(rawDepth).append(separator);
        line.append(otherBasesCount).append(separator);
        line.append(improperPairsCount);

        return line.toString();
    }
}