All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.ClipReads Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.StringUtil;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.WorkflowOutput;
import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.clipping.ClippingOp;
import org.broadinstitute.hellbender.utils.clipping.ClippingRepresentation;
import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

import java.io.PrintStream;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Read clipping based on quality, position or sequence matching. This tool provides simple, powerful read clipping
 * capabilities that allow you to remove low quality strings of bases, sections of reads, and reads containing
 * user-provided sequences.
 *
 * 

There are three arguments for clipping (quality, position and sequence), which can be used alone or in * combination. In addition, you can also specify a clipping representation, which determines exactly how ClipReads * applies clips to the reads (soft clips, writing Q0 base quality scores, etc.). Please note that you MUST specify at * least one of the three clipping arguments, and specifying a clipping representation is not sufficient. If you do not * specify a clipping argument, the program will run but it will not do anything to your reads.

* *

Quality score based clipping

* *

Clip bases from the read in clipper from

* *
argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)
* *

to the end of the read. This is copied from BWA.

* *

Walk through the read from the end (in machine cycle order) to the beginning, calculating the * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the * clipping index in the read (from the end).

* *

Cycle based clipping

* *

Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions). * For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12.

* *

Sequence matching

* *

Clips bases that exactly match one of a number of base sequences. This employs an exact match algorithm, * filtering only bases whose sequence exactly matches SEQ.

* * *

Adapter locations

*

Only on uBAM: if adapter on five prime XF or adapter in three prime XT is given - clip it.

*

Input

*

Any number of SAM/BAM/CRAM files.

* *

Output

*

A new SAM/BAM/CRAM file containing all of the reads from the input SAM/BAM/CRAMs with the user-specified clipping * operation applied to each read.

* *

Summary output (console)

*
 * Number of examined reads              13
 * Number of clipped reads               13
 * Percent of clipped reads              100.00
 * Number of examined bases              988
 * Number of clipped bases               126
 * Percent of clipped bases              12.75
 * Number of quality-score clipped bases 126
 * Number of range clipped bases         0
 * Number of sequence clipped bases      0
 *
 * if --clip-adapter is provided, an additional line will be appended:
 * Number of adapter clipped bases       18228
 * 
* *

Example usage

*
gatk ClipReads \
 *   -I input_reads.bam \
 *   -O output_reads.bam \
 *   -XF sequences.fasta \
 *   -X CCCCC \
 *   -CT "1-5,11-15" \
 *   -QT 10
* *

The command line shown above will apply all three arguments in combination. See the detailed examples below to see how the choice of clipping representation affects the output.

* *

Detailed clipping examples

*

Suppose we are given this read:

*
 *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
 *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
 *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
 *     
* *

If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:

* *
 *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
 *          NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
 *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
 *     
* *

Whereas with -QT 10 -CR WRITE_Q0S:

*
 *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
 *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
 *          !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
 *     
* *

Or -QT 10 -CR SOFTCLIP_BASES:

*
 *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3133    29      17S59M  *       *       *
 *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
 *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
 *     
* */ @CommandLineProgramProperties( summary = "Read clipping based on quality, position or sequence matching. This tool provides simple, powerful read clipping " + "capabilities that allow you to remove low quality strings of bases, sections of reads, and reads containing " + "user-provided sequences.", oneLineSummary = "Clip reads in a SAM/BAM/CRAM file", programGroup = ReadDataManipulationProgramGroup.class ) @DocumentedFeature @WorkflowProperties public final class ClipReads extends ReadWalker { private final Logger logger = LogManager.getLogger(ClipReads.class); private static final OneShotLogger tooShortOneShotLogger = new OneShotLogger(ClipReads.class); private static final OneShotLogger noAttrOneShotLogger = new OneShotLogger(ClipReads.class); public static final String OUTPUT_STATISTICS_LONG_NAME = "output-statistics"; public static final String OUTPUT_STATISTICS_SHORT_NAME = "os"; public static final String Q_TRIMMING_THRESHOLD_LONG_NAME = "q-trimming-threshold"; public static final String Q_TRIMMING_THRESHOLD_SHORT_NAME = "QT"; public static final String CYCLES_TO_TRIM_LONG_NAME = "cycles-to-trim"; public static final String CYCLES_TO_TRIM_SHORT_NAME = "CT"; public static final String CLIP_SEQUENCES_FILE_LONG_NAME = "clip-sequences-file"; public static final String CLIP_SEQUENCES_FILE_SHORT_NAME = "XF"; public static final String CLIP_SEQUENCE_LONG_NAME = "clip-sequence"; public static final String CLIP_SEQUENCE_SHORT_NAME = "X"; public static final String CLIP_REPRESENTATION_LONG_NAME = "clip-representation"; public static final String CLIP_REPRESENTATION_SHORT_NAME = "CR"; public static final String CLIP_ADAPTER_LONG_NAME = "clip-adapter"; public static final String CLIP_ADAPTER_SHORT_NAME = "CA"; public static final String READ_LONG_NAME = "read"; public static final String READ_SHORT_NAME = READ_LONG_NAME; public static final String MIN_READ_LENGTH_TO_REPORT_LONG_NAME = "min-read-length-to-output"; public static final String FIVE_PRIME_TRIMMING_TAG = "tf"; public static final String THREE_PRIME_TRIMMING_TAG = "tm"; public static final String FIVE_PRIME_ADAPTER_LOCATION_TAG = "XF"; public static final String THREE_PRIME_ADAPTER_LOCATION_TAG = "XT"; /** * The output SAM/BAM/CRAM file will be written here */ @Argument(doc = "BAM output file", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME) @WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION}) GATKPath OUTPUT; /** * If provided, ClipReads will write summary statistics about the clipping operations applied to the reads in this file. */ @Argument(fullName = OUTPUT_STATISTICS_LONG_NAME, shortName = OUTPUT_STATISTICS_SHORT_NAME, doc = "File to output statistics", optional = true) @WorkflowOutput GATKPath STATSOUTPUT = null; /** * If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this * quality score threshold. */ @Argument(fullName = Q_TRIMMING_THRESHOLD_LONG_NAME, shortName = Q_TRIMMING_THRESHOLD_SHORT_NAME, doc = "If provided, the Q-score clipper will be applied", optional = true) int qTrimmingThreshold = -1; /** * Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based * values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, * and 12. */ @Argument(fullName = CYCLES_TO_TRIM_LONG_NAME, shortName = CYCLES_TO_TRIM_SHORT_NAME, doc = "String indicating machine cycles to clip from the reads", optional = true) String cyclesToClipArg = null; /** * Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the * sequences in the file. */ @Argument(fullName = CLIP_SEQUENCES_FILE_LONG_NAME, shortName = CLIP_SEQUENCES_FILE_SHORT_NAME, doc = "Remove sequences within reads matching the sequences in this FASTA file", optional = true) GATKPath clipSequenceFile = null; /** * Clips bases from the reads matching the provided SEQ. */ @Argument(fullName = CLIP_SEQUENCE_LONG_NAME, shortName = CLIP_SEQUENCE_SHORT_NAME, doc = "Remove sequences within reads matching this sequence", optional = true) List clipSequencesArgs = null; /** * The different values for this argument determines how ClipReads applies clips to the reads. This can range * from writing Ns over the clipped bases to hard clipping away the bases from the BAM. */ @Argument(fullName = CLIP_REPRESENTATION_LONG_NAME, shortName = CLIP_REPRESENTATION_SHORT_NAME, doc = "How should we actually clip the bases?", optional = true) ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS; @Argument(fullName=READ_LONG_NAME, shortName = READ_SHORT_NAME, doc="", optional = true) String onlyDoRead = null; @Argument(fullName = CLIP_ADAPTER_LONG_NAME, shortName = CLIP_ADAPTER_SHORT_NAME, doc = "Clip locations according to XF, XT tags. This will destroy reads which have none of these tags", optional = true) boolean clipAdapter = false; //Note: relevant only to the single ended reads @Argument(fullName = MIN_READ_LENGTH_TO_REPORT_LONG_NAME, doc = "Shortest read to output. Note that this works correctly only on non-paired reads.", optional = true) private final Integer minReadLength = 0; /** * List of sequence that should be clipped from the reads */ private final List sequencesToClip = new ArrayList<>(); /** * List of cycle start / stop pairs (0-based, stop is included in the cycle to remove) to clip from the reads */ private List> cyclesToClip = null; /** * Output reads is written to this BAM. */ private SAMFileGATKReadWriter outputBam; /** * Accumulator for the stats. */ private ClippingData accumulator; /** * Output stream for the stats. */ private PrintStream outputStats; /** * The initialize function. */ @Override public void onTraversalStart() { if (qTrimmingThreshold >= 0) { logger.info(String.format("Creating Q-score clipper with threshold %d", qTrimmingThreshold)); } // // Initialize the sequences to clip // if (clipSequencesArgs != null) { int i = 0; for (String toClip : clipSequencesArgs) { i++; ReferenceSequence rs = new ReferenceSequence("CMDLINE-" + i, -1, StringUtil.stringToBytes(toClip)); addSeqToClip(rs.getName(), rs.getBases()); } } if (clipSequenceFile != null) { ReferenceSequenceFile rsf = ReferenceSequenceFileFactory.getReferenceSequenceFile(clipSequenceFile.toPath()); while (true) { ReferenceSequence rs = rsf.nextSequence(); if (rs == null) break; else { addSeqToClip(rs.getName(), rs.getBases()); } } } // // Initialize the cycle ranges to clip // if (cyclesToClipArg != null) { cyclesToClip = new ArrayList<>(); for (String range : cyclesToClipArg.split(",")) { try { String[] elts = range.split("-"); int start = Integer.parseInt(elts[0]) - 1; int stop = Integer.parseInt(elts[1]) - 1; if (start < 0) throw new Exception(); if (stop < start) throw new Exception(); logger.info(String.format("Creating cycle clipper %d-%d", start, stop)); cyclesToClip.add(new MutablePair<>(start, stop)); } catch (Exception e) { throw new RuntimeException("Badly formatted cyclesToClip argument: " + cyclesToClipArg); } } } final boolean presorted = EnumSet.of(ClippingRepresentation.WRITE_NS, ClippingRepresentation.WRITE_NS_Q0S, ClippingRepresentation.WRITE_Q0S).contains(clippingRepresentation); outputBam = createSAMWriter(OUTPUT, presorted); accumulator = new ClippingData(sequencesToClip); outputStats = STATSOUTPUT == null ? null : new PrintStream(STATSOUTPUT.getOutputStream()); } @Override public void apply( GATKRead read, ReferenceContext ref, FeatureContext featureContext ) { if ( onlyDoRead == null || read.getName().equals(onlyDoRead) ) { if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES || clippingRepresentation == ClippingRepresentation.REVERT_SOFTCLIPPED_BASES ) read = ReadClipper.revertSoftClippedBases(read); ReadClipperWithData clipper = new ReadClipperWithData(read, sequencesToClip); // // run all three clipping modules // clipBadQualityScores(clipper); clipCycles(clipper); clipSequences(clipper); clipAdapter(clipper); accumulate(clipper); } } @Override public ClippingData onTraversalSuccess(){ if ( outputStats != null ){ outputStats.printf(accumulator.toString()); } return accumulator; } @Override public void closeTool() { if ( outputStats != null ){ outputStats.close(); } if ( outputBam != null ) { outputBam.close(); } } /** * Helper function that adds a seq with name and bases (as bytes) to the list of sequences to be clipped * * @param name * @param bases */ private void addSeqToClip(String name, byte[] bases) { SeqToClip clip = new SeqToClip(name, bases); sequencesToClip.add(clip); logger.info(String.format("Creating sequence clipper %s: %s/%s", clip.name, clip.seq, clip.revSeq)); } /** * clip sequences from the reads that match all of the sequences in the global sequencesToClip variable. * Adds ClippingOps for each clip to clipper. * * @param clipper */ private void clipSequences(ReadClipperWithData clipper) { if (sequencesToClip != null) { // don't bother if we don't have any sequences to clip GATKRead read = clipper.getRead(); ClippingData data = clipper.getData(); for (SeqToClip stc : sequencesToClip) { // we have a pattern for both the forward and the reverse strands Pattern pattern = read.isReverseStrand() ? stc.revPat : stc.fwdPat; String bases = read.getBasesString(); Matcher match = pattern.matcher(bases); // keep clipping until match.find() says it can't find anything else boolean found = true; // go through at least once while (found) { found = match.find(); //System.out.printf("Matching %s against %s/%s => %b%n", bases, stc.seq, stc.revSeq, found); if (found) { int start = match.start(); int stop = match.end() - 1; //ClippingOp op = new ClippingOp(ClippingOp.ClippingType.MATCHES_CLIP_SEQ, start, stop, stc.seq); ClippingOp op = new ClippingOp(start, stop); clipper.addOp(op); data.incSeqClippedBases(stc.seq, op.getLength()); } } } clipper.setData(data); } } /** * Convenence function that takes a read and the start / stop clipping positions based on the forward * strand, and returns start/stop values appropriate for the strand of the read. * * @param read * @param start * @param stop * @return */ private Pair strandAwarePositions(GATKRead read, int start, int stop) { if (read.isReverseStrand()) return new MutablePair<>(read.getLength() - stop - 1, read.getLength() - start - 1); else return new MutablePair<>(start, stop); } /** * clip bases at cycles between the ranges in cyclesToClip by adding appropriate ClippingOps to clipper. * * @param clipper */ private void clipCycles(ReadClipperWithData clipper) { if (cyclesToClip != null) { GATKRead read = clipper.getRead(); ClippingData data = clipper.getData(); for (Pair p : cyclesToClip) { // iterate over each cycle range int cycleStart = p.getLeft(); int cycleStop = p.getRight(); if (cycleStart < read.getLength()) { // only try to clip if the cycleStart is less than the read's length if (cycleStop >= read.getLength()) // we do tolerate [for convenience) clipping when the stop is beyond the end of the read cycleStop = read.getLength() - 1; Pair startStop = strandAwarePositions(read, cycleStart, cycleStop); int start = startStop.getLeft(); int stop = startStop.getRight(); //ClippingOp op = new ClippingOp(ClippingOp.ClippingType.WITHIN_CLIP_RANGE, start, stop, null); ClippingOp op = new ClippingOp(start, stop); clipper.addOp(op); data.incNRangeClippedBases(op.getLength()); } } clipper.setData(data); } } /** * Clip bases from the read in clipper from *

* argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual) *

* to the end of the read. This is blatantly stolen from BWA. *

* Walk through the read from the end (in machine cycle order) to the beginning, calculating the * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the * clipping index in the read (from the end). * * @param clipper */ private void clipBadQualityScores(ReadClipperWithData clipper) { GATKRead read = clipper.getRead(); ClippingData data = clipper.getData(); int readLen = read.getLength(); byte[] quals = read.getBaseQualities(); int clipSum = 0, lastMax = -1, clipPoint = -1; // -1 means no clip for (int i = readLen - 1; i >= 0; i--) { int baseIndex = read.isReverseStrand() ? readLen - i - 1 : i; byte qual = quals[baseIndex]; clipSum += (qTrimmingThreshold - qual); if (clipSum >= 0 && (clipSum >= lastMax)) { lastMax = clipSum; clipPoint = baseIndex; } } if (clipPoint != -1) { int start = read.isReverseStrand() ? 0 : clipPoint; int stop = read.isReverseStrand() ? clipPoint : readLen - 1; //clipper.addOp(new ClippingOp(ClippingOp.ClippingType.LOW_Q_SCORES, start, stop, null)); ClippingOp op = new ClippingOp(start, stop); clipper.addOp(op); data.incNQClippedBases(op.getLength()); } clipper.setData(data); } /** * Clip bases on the reads that have a trimming mark. Some programs use tags to indicate * the number of bases that should be trimmed from the read to trim the adapter and do not * trim the adapter itself. This function uses these tags (XF and XT) to make the trimming * * @param clipper */ private void clipAdapter(ReadClipperWithData clipper) { if (clipAdapter) { GATKRead read = clipper.getRead(); ClippingData data = clipper.getData(); Integer xf = read.getAttributeAsInteger(FIVE_PRIME_ADAPTER_LOCATION_TAG); Integer xt = read.getAttributeAsInteger(THREE_PRIME_ADAPTER_LOCATION_TAG); if ((xf != null) && (xt != null) && (xf == 0) && (xt == 0)) { ClippingOp clip = new ClippingOp(0, read.getLength()); clipper.addOp(clip); data.incNAdapterClippedBases((read.getLength())); return; } if ((xt != null) && (xt <= read.getLength())) { //XT is the location of the first nucleotide in the 3' adapter to be clipped (one-based) ClippingOp xt_clip = new ClippingOp(xt - 1, read.getLength()); clipper.addOp(xt_clip); addAdapterTag(clipper, THREE_PRIME_TRIMMING_TAG); data.incNAdapterClippedBases(read.getLength() - xt + 1); } if ((xf != null) && (xf > 1)) { // XF is the location of the first nucleotide to be not-clipped (one-based to be consistent with XT) ClippingOp xf_clip = new ClippingOp(0, xf - 2); //stop is included clipper.addOp(xf_clip); addAdapterTag(clipper, FIVE_PRIME_TRIMMING_TAG); data.incNAdapterClippedBases(xf); } if ( (xf == null) && (xt == null)) { noAttrOneShotLogger.warn("clipAdapter requested, yet neither " + FIVE_PRIME_ADAPTER_LOCATION_TAG + " nor " + THREE_PRIME_ADAPTER_LOCATION_TAG + " attributes found. first read: " + read); } } } private void addAdapterTag(ReadClipperWithData clipper,final String tag){ String curTagValue = clipper.getRead().getAttributeAsString(tag); if (curTagValue == null ){ clipper.getRead().setAttribute(tag, "A"); } else if (!curTagValue.contains("A")){ clipper.getRead().setAttribute(tag, curTagValue + "A"); } } private void accumulate(ReadClipperWithData clipper) { if ( clipper == null ) return; GATKRead clippedRead = clipper.clipRead(clippingRepresentation); if ( minReadLength > 0 ) { if (clippedRead.isPaired()){ tooShortOneShotLogger.warn("Limit on the read length is not implemented for PE reads. continuing anyway. first read:" + clippedRead); } } if (clippedRead.getLength() >= minReadLength) { outputBam.addRead(clippedRead); } accumulator.nTotalReads++; accumulator.nTotalBases += clipper.getRead().getLength(); if (clipper.wasClipped()) { accumulator.nClippedReads++; accumulator.addData(clipper.getData()); } } // -------------------------------------------------------------------------------------------------------------- // // utility classes // // -------------------------------------------------------------------------------------------------------------- private static final class SeqToClip { String name; String seq, revSeq; Pattern fwdPat, revPat; public SeqToClip(String name, byte[] bytez) { this.name = name; this.seq = new String(bytez); this.fwdPat = Pattern.compile(seq, Pattern.CASE_INSENSITIVE); this.revSeq = new String(BaseUtils.simpleReverseComplement(bytez)); this.revPat = Pattern.compile(revSeq, Pattern.CASE_INSENSITIVE); } } public final class ClippingData { public long nTotalReads = 0; public long nTotalBases = 0; public long nClippedReads = 0; public long nClippedBases = 0; public long nQClippedBases = 0; public long nRangeClippedBases = 0; public long nSeqClippedBases = 0; public long nAdapterClippedBases = 0; SortedMap seqClipCounts = new TreeMap<>(); public ClippingData(List clipSeqs) { for (SeqToClip clipSeq : clipSeqs) { seqClipCounts.put(clipSeq.seq, 0L); } } public void incNQClippedBases(int n) { nQClippedBases += n; nClippedBases += n; } public void incNRangeClippedBases(int n) { nRangeClippedBases += n; nClippedBases += n; } public void incNAdapterClippedBases(int n){ nAdapterClippedBases += n; nClippedBases +=n; } public void incSeqClippedBases(final String seq, int n) { nSeqClippedBases += n; nClippedBases += n; seqClipCounts.put(seq, seqClipCounts.get(seq) + n); } public void addData (ClippingData data) { nTotalReads += data.nTotalReads; nTotalBases += data.nTotalBases; nClippedReads += data.nClippedReads; nClippedBases += data.nClippedBases; nQClippedBases += data.nQClippedBases; nRangeClippedBases += data.nRangeClippedBases; nSeqClippedBases += data.nSeqClippedBases; nAdapterClippedBases += data.nAdapterClippedBases; for (String seqClip : data.seqClipCounts.keySet()) { Long count = data.seqClipCounts.get(seqClip); if (seqClipCounts.containsKey(seqClip)) count += seqClipCounts.get(seqClip); seqClipCounts.put(seqClip, count); } } public String toString() { StringBuilder s = new StringBuilder(); s.append(StringUtils.repeat('-', 80) + "\n") .append(String.format("Number of examined reads %d%n", nTotalReads)) .append(String.format("Number of clipped reads %d%n", nClippedReads)) .append(String.format("Percent of clipped reads %.2f%n", (100.0 * nClippedReads) / nTotalReads)) .append(String.format("Number of examined bases %d%n", nTotalBases)) .append(String.format("Number of clipped bases %d%n", nClippedBases)) .append(String.format("Percent of clipped bases %.2f%n", (100.0 * nClippedBases) / nTotalBases)) .append(String.format("Number of quality-score clipped bases %d%n", nQClippedBases)) .append(String.format("Number of range clipped bases %d%n", nRangeClippedBases)) .append(String.format("Number of sequence clipped bases %d%n", nSeqClippedBases)); for (Map.Entry elt : seqClipCounts.entrySet()) { s.append(String.format(" %8d clip sites matching %s%n", elt.getValue(), elt.getKey())); } if ( clipAdapter ) { s.append(String.format("Number of adapter clipped bases %d%n", nAdapterClippedBases)); } s.append(StringUtils.repeat('-', 80) + "\n"); return s.toString(); } } public final class ReadClipperWithData extends ReadClipper { private ClippingData data; public ReadClipperWithData(GATKRead read, List clipSeqs) { super(read); data = new ClippingData(clipSeqs); } public ClippingData getData() { return data; } public void setData(ClippingData data) { this.data = data; } public void addData(ClippingData data) { this.data.addData(data); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy