All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.analysis.GcBiasMetricsCollector Maven / Gradle / Ivy

There is a newer version: 3.2.0
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2015 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package picard.analysis;

import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.QualityUtil;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.metrics.GcBiasMetrics;
import picard.metrics.MultiLevelCollector;
import picard.metrics.PerUnitMetricCollector;

import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.HashMap;

/** Calculates GC Bias Metrics on multiple levels
 *  Created by kbergin on 3/23/15.
 */
public class GcBiasMetricsCollector extends MultiLevelCollector {
    // Histograms to track the number of windows at each GC, and the number of read starts
    // at windows of each GC. Need 101 to get from 0-100.
    private final int scanWindowSize;
    private final boolean bisulfite;
    private int[] windowsByGc = new int[BINS];
    private static final int BINS = 101;
    //Use to calculate additional results without duplicates
    private boolean ignoreDuplicates;

    //will hold the relevant gc information per contig
    private byte [] gc = null;
    private int referenceIndex = -1;
    private byte [] refBases = null;
    private static final Log log = Log.getInstance(GcBiasMetricsCollector.class);

    public GcBiasMetricsCollector(final Set accumulationLevels, final int[] windowsByGc,
                                  final List samRgRecords, final int scanWindowSize,
                                  final boolean bisulfite) {
        this(accumulationLevels, windowsByGc, samRgRecords, scanWindowSize, bisulfite, false);
    }

    public GcBiasMetricsCollector(final Set accumulationLevels, final int[] windowsByGc,
                                  final List samRgRecords, final int scanWindowSize,
                                  final boolean bisulfite, final boolean ignoreDuplicates) {
        this.scanWindowSize = scanWindowSize;
        this.bisulfite = bisulfite;
        this.windowsByGc = windowsByGc;
        this.ignoreDuplicates = ignoreDuplicates;
        setup(accumulationLevels, samRgRecords);
    }

    /////////////////////////////////////////////////////////////////////////////
    // This method is called once Per samRecord
    /////////////////////////////////////////////////////////////////////////////
    @Override
    protected GcBiasCollectorArgs makeArg(final SAMRecord rec, final ReferenceSequence ref) {
        return new GcBiasCollectorArgs(rec, ref);
    }

    /////////////////////////////////////////////////////////////////////////////
    //Make a GcBiasCollector with the given arguments
    /////////////////////////////////////////////////////////////////////////////
    @Override
    protected PerUnitMetricCollector makeChildCollector(final String sample, final String library, final String readGroup) {
        return new PerUnitGcBiasMetricsCollector(sample, library, readGroup);
    }

    /////////////////////////////////////////////////////////////////////////////
    //A collector for individual GcBiasMetrics for a given SAMPLE or SAMPLE/LIBRARY
    //or SAMPLE/LIBRARY/READ_GROUP (depending on aggregation levels)
    /////////////////////////////////////////////////////////////////////////////
    public class PerUnitGcBiasMetricsCollector implements PerUnitMetricCollector {
        private final Map gcData;
        // Additional object to store data without duplicates (null if option ALSO_IGNORE_DUPLICATES is not specified)
        private Map gcDataNonDups;
        private final String sample;
        private final String library;
        private final String readGroup;
        private static final String allReads = "All_Reads";
        final static String ACCUMULATION_LEVEL_ALL_READS = "All Reads";
        final static String ACCUMULATION_LEVEL_LIBRARY = "Library";
        final static String ACCUMULATION_LEVEL_SAMPLE = "Sample";
        final static String ACCUMULATION_LEVEL_READ_GROUP = "Read Group";
        final static String READS_USED_ALL = "ALL";
        final static String READS_USED_UNIQUE = "UNIQUE";
        private int logCounter;

        /////////////////////////////////////////////////////////////////////////////
        //Records the accumulation level for each level of collection and initializes
        // a GcObject for this accumulation level
        /////////////////////////////////////////////////////////////////////////////
        public PerUnitGcBiasMetricsCollector(final String sample, final String library, final String readGroup) {
            this.sample = sample;
            this.library = library;
            this.readGroup = readGroup;
            this.gcData = prepareGcData();
            if (ignoreDuplicates) {
                this.gcDataNonDups = prepareGcData();
            }
        }

        /////////////////////////////////////////////////////////////////////////////
        //Takes each record and sends them to addRead to calculate gc metrics for
        // that read for each accumulation level
        /////////////////////////////////////////////////////////////////////////////
        @Override
        public void acceptRecord(final GcBiasCollectorArgs args) {
            final SAMRecord rec = args.getRec();
            if (logCounter < 100 && rec.getReadBases().length == 0) {
                log.warn("Omitting read " + rec.getReadName() + " with '*' in SEQ field.");
                if (++logCounter == 100) {
                    log.warn("There are more than 100 reads with '*' in SEQ field in file.");
                }
                return;
            }
            if (!rec.getReadUnmappedFlag()) {
                if (referenceIndex != rec.getReferenceIndex() || gc == null) {
                    final ReferenceSequence ref = args.getRef();
                    refBases = ref.getBases();
                    StringUtil.toUpperCase(refBases);
                    final int refLength = refBases.length;
                    final int lastWindowStart = refLength - scanWindowSize;
                    gc = GcBiasUtils.calculateAllGcs(refBases, lastWindowStart, scanWindowSize);
                    referenceIndex = rec.getReferenceIndex();
                }

                addReadToGcData(rec, this.gcData);
                if (ignoreDuplicates && !rec.getDuplicateReadFlag()) {
                    addReadToGcData(rec, this.gcDataNonDups);
                }
            } else {
                updateTotalClusters(rec, this.gcData);
                if (ignoreDuplicates && !rec.getDuplicateReadFlag()) {
                    updateTotalClusters(rec, this.gcDataNonDups);
                }
            }
        }

        @Override
        public void finish() {}

        /////////////////////////////////////////////////////////////////////////////
        //Called to add metrics to the output file for each level of collection
        // these metrics are used for graphing gc bias in R script
        /////////////////////////////////////////////////////////////////////////////
        @Override
        public void addMetricsToFile(final MetricsFile file) {
            addGcDataToFile(file, this.gcData, true);
            if (ignoreDuplicates) {
                addGcDataToFile(file, this.gcDataNonDups, false);
            }
        }

        private void updateTotalClusters(final SAMRecord rec, final Map gcData) {
            for (final Map.Entry entry : gcData.entrySet()) {
                final GcObject gcCur = entry.getValue();
                if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++gcCur.totalClusters;
            }
        }

        private Map prepareGcData() {
            final String prefix;
            final Map gcData = new HashMap<>();
            if (this.readGroup != null) {
                prefix = this.readGroup;
            } else if (this.library != null) {
                prefix = this.library;
            } else if (this.sample != null) {
                prefix = this.sample;
            } else {
                prefix = allReads;
            }
            gcData.put(prefix, new GcObject());
            return gcData;
        }

        private void addReadToGcData(final SAMRecord rec, final Map gcData) {
            final String type;
            String group;
            if (this.readGroup != null) {
                type = this.readGroup;
                group = ACCUMULATION_LEVEL_READ_GROUP;
            } else if (this.library != null) {
                type = this.library;
                group = ACCUMULATION_LEVEL_LIBRARY;
            } else if (this.sample != null) {
                type = this.sample;
                group = ACCUMULATION_LEVEL_SAMPLE;
            } else {
                type = allReads;
                group = ACCUMULATION_LEVEL_ALL_READS;
            }
            addRead(gcData.get(type), rec, group, gc, refBases);
        }

        /////////////////////////////////////////////////////////////////////////////
        // Sums the values in an int[].
        /////////////////////////////////////////////////////////////////////////////
        private double sum(final int[] values) {
            final int length = values.length;
            double total = 0;
            for (int i = 0; i < length; i++) {
                total += values[i];
            }

            return total;
        }

        private void addGcDataToFile(final MetricsFile file, final Map gcData,
                                     final boolean includeDuplicates) {
            for (final Map.Entry entry : gcData.entrySet()) {
                final GcObject gcCur = entry.getValue();
                final String gcType = entry.getKey();

                final int[] readsByGc = gcCur.readsByGc;
                final long[] errorsByGc = gcCur.errorsByGc;
                final long[] basesByGc = gcCur.basesByGc;
                final long totalClusters = gcCur.totalClusters;
                final long totalAlignedReads = gcCur.totalAlignedReads;
                final String group = gcCur.group;

                final GcBiasMetrics metrics = new GcBiasMetrics();

                final double totalWindows = sum(windowsByGc);
                final double totalReads = sum(readsByGc);
                final double meanReadsPerWindow = totalReads / totalWindows;

                if (totalAlignedReads > 0) {
                    for (int i = 0; i < windowsByGc.length; ++i) {
                        final GcBiasDetailMetrics detail = new GcBiasDetailMetrics();
                        detail.GC = i;
                        detail.WINDOWS = windowsByGc[i];
                        detail.READ_STARTS = readsByGc[i];
                        if (errorsByGc[i] > 0) {
                            detail.MEAN_BASE_QUALITY = QualityUtil.getPhredScoreFromObsAndErrors(basesByGc[i], errorsByGc[i]);
                        }
                        if (windowsByGc[i] != 0) {
                            detail.NORMALIZED_COVERAGE = (detail.READ_STARTS / (double) detail.WINDOWS) / meanReadsPerWindow;
                            detail.ERROR_BAR_WIDTH = (Math.sqrt(detail.READ_STARTS) / (double) detail.WINDOWS) / meanReadsPerWindow;
                        } else {
                            detail.NORMALIZED_COVERAGE = 0;
                            detail.ERROR_BAR_WIDTH = 0;
                        }
                        detail.ACCUMULATION_LEVEL = group;
                        if (group.equals(ACCUMULATION_LEVEL_READ_GROUP)) {detail.READ_GROUP = gcType;}
                        else if (group.equals(ACCUMULATION_LEVEL_SAMPLE)) {detail.SAMPLE = gcType;}
                        else if (group.equals(ACCUMULATION_LEVEL_LIBRARY)) {detail.LIBRARY = gcType;}

                        detail.READS_USED = includeDuplicates ? READS_USED_ALL : READS_USED_UNIQUE;

                        metrics.DETAILS.addMetric(detail);
                    }

                    // Synthesize the high level summary metrics
                    final GcBiasSummaryMetrics summary = new GcBiasSummaryMetrics();
                    if (group.equals(ACCUMULATION_LEVEL_READ_GROUP)) {summary.READ_GROUP = gcType;}
                    else if (group.equals(ACCUMULATION_LEVEL_SAMPLE)) {summary.SAMPLE = gcType;}
                    else if (group.equals(ACCUMULATION_LEVEL_LIBRARY)) {summary.LIBRARY = gcType;}

                    summary.READS_USED = includeDuplicates ? READS_USED_ALL : READS_USED_UNIQUE;

                    summary.ACCUMULATION_LEVEL = group;
                    summary.WINDOW_SIZE = scanWindowSize;
                    summary.TOTAL_CLUSTERS = totalClusters;
                    summary.ALIGNED_READS = totalAlignedReads;
                    summary.GC_NC_0_19 = calculateGcNormCoverage(meanReadsPerWindow, readsByGc, 0, 19);
                    summary.GC_NC_20_39 = calculateGcNormCoverage(meanReadsPerWindow, readsByGc, 20, 39);
                    summary.GC_NC_40_59 = calculateGcNormCoverage(meanReadsPerWindow, readsByGc, 40, 59);
                    summary.GC_NC_60_79 = calculateGcNormCoverage(meanReadsPerWindow, readsByGc, 60, 79);
                    summary.GC_NC_80_100 = calculateGcNormCoverage(meanReadsPerWindow, readsByGc, 80, 100);

                    calculateDropoutMetrics(metrics.DETAILS.getMetrics(), summary);

                    metrics.SUMMARY = summary;

                    file.addMetric(metrics);
                }
            }
        }
    }

    /////////////////////////////////////////////////////////////////////////////
    // Calculates the normalized coverage over a given gc content region
    /////////////////////////////////////////////////////////////////////////////
    private double calculateGcNormCoverage(final double meanReadsPerWindow, final int[] readsByGc,
                                           final int start, final int end) {
        int windowsTotal = 0;
        double sum = 0.0;
        for (int i = start; i <= end; i++) {
            if (windowsByGc[i] != 0) {
                sum += (double) readsByGc[i];
                windowsTotal += windowsByGc[i];
            }
        }

        if (windowsTotal == 0) {
            return 0.0;
        } else {
            return (sum / (windowsTotal * meanReadsPerWindow));
        }
    }

    /////////////////////////////////////////////////////////////////////////////
    // Calculates the Illumina style AT and GC dropout numbers
    /////////////////////////////////////////////////////////////////////////////
    private void calculateDropoutMetrics(final Collection details,
                                         final GcBiasSummaryMetrics summary) {
        // First calculate the totals
        double totalReads = 0;
        double totalWindows = 0;

        for (final GcBiasDetailMetrics detail : details) {
            totalReads += detail.READ_STARTS;
            totalWindows += detail.WINDOWS;
        }

        double atDropout = 0;
        double gcDropout = 0;

        for (final GcBiasDetailMetrics detail : details) {
            final double relativeReads = detail.READ_STARTS / totalReads;
            final double relativeWindows = detail.WINDOWS / totalWindows;
            final double dropout = (relativeWindows - relativeReads) * 100;

            if (dropout > 0) {
                if (detail.GC <= 50) atDropout += dropout;
                else{ gcDropout += dropout; }
            }
        }

        summary.AT_DROPOUT = atDropout;
        summary.GC_DROPOUT = gcDropout;
    }

    /////////////////////////////////////////////////////////////////////////////
    //Keeps track of each level of GcCalculation
    /////////////////////////////////////////////////////////////////////////////
    class GcObject {
        long totalClusters = 0;
        long totalAlignedReads = 0;
        int[] readsByGc = new int[BINS];
        long[] basesByGc = new long[BINS];
        long[] errorsByGc = new long[BINS];
        String group = null;
    }

    /////////////////////////////////////////////////////////////////////////////
    //Adds each read to the appropriate gcObj which is determined in acceptRecord above
    //Also calculates values for calculating GC Bias at each level
    /////////////////////////////////////////////////////////////////////////////
     private void addRead(final GcObject gcObj, final SAMRecord rec, final String group, final byte[] gc, final byte[] refBases) {
        if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++gcObj.totalClusters;
        final int pos = rec.getReadNegativeStrandFlag() ? rec.getAlignmentEnd() - scanWindowSize : rec.getAlignmentStart();
        ++gcObj.totalAlignedReads;
        if (pos > 0) {
            final int windowGc = gc[pos];
            if (windowGc >= 0) {
                ++gcObj.readsByGc[windowGc];
                gcObj.basesByGc[windowGc] += rec.getReadLength();
                gcObj.errorsByGc[windowGc] +=
                        SequenceUtil.countMismatches(rec, refBases, bisulfite) +
                                SequenceUtil.countInsertedBases(rec) + SequenceUtil.countDeletedBases(rec);
            }
        }
        if (gcObj.group == null) {
            gcObj.group = group;
        }
    }
}

/////////////////////////////////////////////////////////////////////////////
// Arguments that need to be passed to each PerUnitMetricCollector
// for the given record
/////////////////////////////////////////////////////////////////////////////
class GcBiasCollectorArgs {
    private final SAMRecord rec;
    private final ReferenceSequence ref;
    public SAMRecord getRec() {return rec;}
    public ReferenceSequence getRef() {return ref;}
    public GcBiasCollectorArgs(final SAMRecord rec, final ReferenceSequence ref) {
        this.rec = rec;
        this.ref = ref;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy