All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.analysis.RrbsMetricsCollector Maven / Gradle / Ivy

/*
 * The MIT License
 *
 * Copyright (c) 2013 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package picard.analysis;


import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.SequenceUtil;
import picard.metrics.PerUnitMetricCollector;
import picard.metrics.SAMRecordAndReference;
import picard.metrics.SAMRecordAndReferenceMultiLevelCollector;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;

public class RrbsMetricsCollector extends SAMRecordAndReferenceMultiLevelCollector> {
	private final int minReadLength;
	private final double maxMismatchRate;
	private final int cQualityThreshold;
	private final int nextBaseQualityThreshold;

	public RrbsMetricsCollector(final Set accumulationLevels, final List samRgRecords,
								final int cQualityThreshold, final int nextBaseQualityThreshold, final int minReadLength,
								final double maxMismatchRate) {
		this.cQualityThreshold = cQualityThreshold;
		this.nextBaseQualityThreshold = nextBaseQualityThreshold;
		this.minReadLength = minReadLength;
		this.maxMismatchRate = maxMismatchRate;
		setup(accumulationLevels, samRgRecords);
	}

	@Override
	protected PerUnitMetricCollector, SAMRecordAndReference> makeChildCollector(final String sample, final String library, final String readGroup) {
		return new PerUnitRrbsMetricsCollector(sample, library, readGroup);
	}

	private class PerUnitRrbsMetricsCollector implements PerUnitMetricCollector, SAMRecordAndReference> {
		final String sample;
		final String library;
		final String readGroup;

		// Counters for CpG & non-CpG seen/converted sites
		int nCytoConverted = 0;
		int nCytoTotal = 0;
		final Histogram cpgTotal = new Histogram();
		final Histogram cpgConverted = new Histogram();

		// Counters for QC filters used in the final metrics
		int mappedRecordCount = 0;
		int smallReadCount = 0;
		int mismatchCount = 0;
		int noCpgCount = 0;

		// Final metrics calculated once all the reads are done
		double cytoConversionRate;
		double cpgConversionRate;
		int nCpgSeen;
		int nCpgConverted;
		double coverageMean;
		int coverageMedian;

		public PerUnitRrbsMetricsCollector(final String sample, final String library, final String readGroup) {
			this.sample = sample;
			this.library = library;
			this.readGroup = readGroup;
		}

		public void acceptRecord(final SAMRecordAndReference args) {
			mappedRecordCount++;

			final SAMRecord samRecord = args.getSamRecord();
			final ReferenceSequence referenceSequence = args.getReferenceSequence();

			final byte[] readBases = samRecord.getReadBases();
			final byte[] readQualities = samRecord.getBaseQualities();
			final byte[] refBases = referenceSequence.getBases();

			if (samRecord.getReadLength() < minReadLength) {
				smallReadCount++;
				return;
			} else if (SequenceUtil.countMismatches(samRecord, refBases, true) > Math.round(samRecord.getReadLength() * maxMismatchRate)) {
				mismatchCount++;
				return;
			}

			// We only record non-CpG C sites if there was at least one CpG in the read, keep track of
			// the values for this record and then apply to the global totals if valid
			int recordCpgs = 0;

			for (final AlignmentBlock alignmentBlock : samRecord.getAlignmentBlocks()) {
				final int blockLength = alignmentBlock.getLength();
				final int refFragmentStart = alignmentBlock.getReferenceStart() - 1;
				final int readFragmentStart = alignmentBlock.getReadStart() - 1;

				final byte[] refFragment = getFragment(refBases, refFragmentStart, blockLength);
				final byte[] readFragment = getFragment(readBases, readFragmentStart, blockLength);
				final byte[] readQualityFragment = getFragment(readQualities, readFragmentStart, blockLength);

				if (samRecord.getReadNegativeStrandFlag()) {
					// In the case of a negative strand, reverse (and complement for base arrays) the reference,
					// reads & qualities so that it can be treated as a positive strand for the rest of the process
					SequenceUtil.reverseComplement(refFragment);
					SequenceUtil.reverseComplement(readFragment);
					SequenceUtil.reverseQualities(readQualityFragment);
				}

				for (int i=0; i < blockLength-1; i++) {
					final int curRefIndex = getCurRefIndex(refFragmentStart, blockLength, i, samRecord.getReadNegativeStrandFlag());

					// Look at a 2-base window to see if we're on a CpG site, and if so check for conversion
					// (CG -> TG). We do not consider ourselves to be on a CpG site if we're on the last base of a read
					if ((SequenceUtil.basesEqual(refFragment[i], SequenceUtil.C)) &&
							(SequenceUtil.basesEqual(refFragment[i+1], SequenceUtil.G))) {
						// We want to catch the case where there's a CpG in the reference, even if it is not valid
						// to prevent the C showing up as a non-CpG C down below. Otherwise this could have been all
						// in one if statement
						if (isValidCpg(refFragment, readFragment, readQualityFragment, i)) {
							recordCpgs++;
							final CpgLocation curLocation = new CpgLocation(samRecord.getReferenceName(), curRefIndex);
							cpgTotal.increment(curLocation);
							if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
								cpgConverted.increment(curLocation);
							}
						}
						i++;
					} else if (isC(refFragment[i], readFragment[i]) && isAboveCytoQcThreshold(readQualities, i) &&
							SequenceUtil.bisulfiteBasesEqual(false, readFragment[i+1], refFragment[i+1])) {
						// C base in the reference that's not associated with a CpG
						nCytoTotal++;
						if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
							nCytoConverted++;
						}
					}
				}
			}

			if (recordCpgs == 0) {
				noCpgCount++;
			}
		}

		public void finish() {
			cytoConversionRate = nCytoTotal == 0 ? 0 : nCytoConverted / (double)nCytoTotal;
			nCpgSeen = (int)cpgTotal.getSumOfValues();
			nCpgConverted = (int)cpgConverted.getSumOfValues();
			cpgConversionRate = nCpgSeen == 0 ? 0 : nCpgConverted / (double)nCpgSeen;
			coverageMean = cpgTotal.getMeanBinSize();
			coverageMedian = (int)cpgTotal.getMedianBinSize();
		}

		@Override
		public void addMetricsToFile(final MetricsFile> metricsFile) {
			// Create both the summary and detail metrics & add them to the RrbsMetrics container class for
			// the downstream code to use as desired
			final RrbsSummaryMetrics summaryMetrics = buildSummaryMetrics();
			final List detailMetrics = buildDetailMetrics();
			final RrbsMetrics rrbsMetrics = new RrbsMetrics(summaryMetrics, detailMetrics);
			metricsFile.addMetric(rrbsMetrics);
		}

		private RrbsSummaryMetrics buildSummaryMetrics() {
			final RrbsSummaryMetrics summaryMetrics = new RrbsSummaryMetrics();
			summaryMetrics.SAMPLE = sample;
			summaryMetrics.READ_GROUP = readGroup;
			summaryMetrics.LIBRARY = library;
			summaryMetrics.READS_ALIGNED = mappedRecordCount;
			summaryMetrics.NON_CPG_BASES = nCytoTotal;
			summaryMetrics.NON_CPG_CONVERTED_BASES = nCytoConverted;
			summaryMetrics.PCT_NON_CPG_BASES_CONVERTED = cytoConversionRate;
			summaryMetrics.CPG_BASES_SEEN = nCpgSeen;
			summaryMetrics.CPG_BASES_CONVERTED = nCpgConverted;
			summaryMetrics.PCT_CPG_BASES_CONVERTED = cpgConversionRate;
			summaryMetrics.MEAN_CPG_COVERAGE = coverageMean;
			summaryMetrics.MEDIAN_CPG_COVERAGE = coverageMedian;
			summaryMetrics.READS_IGNORED_SHORT = smallReadCount;
			summaryMetrics.READS_WITH_NO_CPG = noCpgCount;
			summaryMetrics.READS_IGNORED_MISMATCHES = mismatchCount;
			return summaryMetrics;
		}

		private List buildDetailMetrics() {
			final List detailMetrics = new ArrayList();
			for (final CpgLocation key : cpgTotal.keySet()) {
				final RrbsCpgDetailMetrics cpgMetric = new RrbsCpgDetailMetrics();
				cpgMetric.SAMPLE = sample;
				cpgMetric.READ_GROUP = readGroup;
				cpgMetric.LIBRARY = library;
				cpgMetric.SEQUENCE_NAME = key.getSequence();
				cpgMetric.POSITION = key.getPosition();
				cpgMetric.TOTAL_SITES = (int)cpgTotal.get(key).getValue();
				cpgMetric.CONVERTED_SITES = cpgConverted.containsKey(key) ? (int)cpgConverted.get(key).getValue() : 0;
				cpgMetric.PCT_CONVERTED = cpgMetric.CONVERTED_SITES == 0 ? 0 : cpgMetric.CONVERTED_SITES / (double)cpgMetric.TOTAL_SITES;
				detailMetrics.add(cpgMetric);
			}
			return detailMetrics;
		}
	}

	private byte[] getFragment(final byte[] fullArray, final int fragmentStart, final int length) {
		return Arrays.copyOfRange(fullArray, fragmentStart, fragmentStart + length);
	}

	/**
	 * True if there's a C in the reference as well as read (possibly bisulfite converted)
	 */
	private boolean isC(final byte refBase, final byte readBase) {
		return (SequenceUtil.basesEqual(refBase, SequenceUtil.C) && SequenceUtil.bisulfiteBasesEqual(readBase, refBase));
	}

	/**
	 * Checks a pair of bases for CpG status & quality thresholds
	 */
	private boolean isValidCpg(final byte[] refBases, final byte[] readBases, final byte[] readQualities, final int index) {
		return isC(refBases[index], readBases[index]) && SequenceUtil.basesEqual(refBases[index+1], readBases[index+1]) &&
				isAboveCytoQcThreshold(readQualities, index);
	}

	/**
	 * Any cyto base (CpG context or not) needs to be above a specified quality threshold. Similarly, the neighboring
	 * base on the 3' side must also be above another threshold unless the cyto base is the final base in the 3'
	 * direction.
	 */
	private boolean isAboveCytoQcThreshold(final byte[] readQualities, final int index) {
		return ((index < readQualities.length - 1) && (readQualities[index] >= cQualityThreshold) &&
				(readQualities[index+1] >= nextBaseQualityThreshold));
	}

	/**
	 * Accounts for the fact that negative strand counts have been reversed
	 */
	private int getCurRefIndex(final int refStart, final int blockLength, final int idx, final boolean isNegative) {
		return isNegative ? refStart + (blockLength - 1) - idx - 1 : refStart + idx;
	}
}

/**
 * Used to keep track of the location of CpG sites
 */
class CpgLocation implements Comparable {
	private final String sequence;
	private final Integer position;

	public CpgLocation(final String sequence, final int position) {
		this.sequence = sequence;
		this.position = position;
	}

	@Override
	public int compareTo(final CpgLocation other) {
		final int seqComp = sequence.compareTo(other.sequence);
		return seqComp == 0 ? position.compareTo(other.position) : seqComp;
	}

	@Override
	public boolean equals(final Object other) {
		if (this == other) {
			return true;
		}

		if (other == null || getClass() != other.getClass()) {
			return false;
		}

		final CpgLocation that = (CpgLocation)other;

		return (sequence.equals(that.sequence)) && (position.equals(that.position));
	}

	@Override
	public int hashCode() {
		int result = sequence.hashCode();
		result = 31 * result + position;
		return result;
	}

	public String getSequence() {
		return sequence;
	}

	public Integer getPosition() {
		return position;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy