All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.rnaseq.SplitNCigarReads Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.rnaseq;

import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.argparser.WorkflowOutput;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.MultiplePassReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.transformers.MappingQualityReadTransformer;
import org.broadinstitute.hellbender.transformers.NDNCigarReadTransformer;
import org.broadinstitute.hellbender.transformers.ReadTransformer;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.SATagBuilder;
import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
import org.broadinstitute.hellbender.utils.read.CigarUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import static org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions.OUTPUT_LONG_NAME;
import static org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions.OUTPUT_SHORT_NAME;

/**
 *
 * Splits reads that contain Ns in their cigar string (e.g. spanning splicing events in RNAseq data).
 *
 * Identifies all N cigar elements and creates k+1 new reads (where k is the number of N cigar elements).
 * The first read includes the bases that are to the left of the first N element, while the part of the read that is to the right of the N
 * (including the Ns) is hard clipped and so on for the rest of the new reads. Used for post-processing RNA reads aligned against the full reference.
 *
 * 

Input

*

* BAM file *

* * *

Output

*

* BAM file with reads split at N CIGAR elements and CIGAR strings updated. *

* *

Usage example

*
 *    gatk SplitNCigarReads \
 *      -R Homo_sapiens_assembly38.fasta \
 *      -I input.bam \
 *      -O output.bam
 *  
*/ @CommandLineProgramProperties( summary = "Splits reads that contain Ns in their cigar string (e.g. spanning splicing events).", oneLineSummary = "Split Reads with N in Cigar", programGroup = ReadDataManipulationProgramGroup.class ) @DocumentedFeature @WorkflowProperties public final class SplitNCigarReads extends MultiplePassReadWalker { // A list of tags that break upon splitting on N. These will be removed from reads in the output. // NOTE: for future developers who want to use these tags. For each tag you remove from this list corresponding // support for properly recalculating the tags must be added to repairSupplementaryTags() static final String[] TAGS_TO_REMOVE = {"NM","MD","NH"}; static final String MATE_CIGAR_TAG = "MC"; @Argument(fullName = OUTPUT_LONG_NAME, shortName = OUTPUT_SHORT_NAME, doc="Write output to this BAM filename") @WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION}) GATKPath OUTPUT; /** * This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in * a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptomes. * You should only use this if you know that your reads have that problem. */ @Argument(fullName = "refactor-cigar-string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", optional = true) boolean REFACTOR_NDN_CIGAR_READS = false; /** * This flag turns off the mapping quality 255 -> 60 read transformer. The transformer is on by default to ensure that * uniquely mapping reads assigned STAR's default 255 MQ aren't filtered out by HaplotypeCaller. */ @Argument(fullName = "skip-mapping-quality-transform", shortName = "skip-mq-transform", doc = "skip the 255 -> 60 MQ read transform", optional = true) boolean SKIP_MQ_TRANSFORM = false; /** * For expert users only! To minimize memory consumption you can lower this number, but then the tool may skip * overhang fixing in regions with too much coverage. Just make sure to give Java enough memory! 4Gb should be * enough with the default value. */ @Advanced @Argument(fullName="max-reads-in-memory", doc="max reads allowed to be kept in memory at a time by the BAM writer", optional=true) int MAX_RECORDS_IN_MEMORY = 150000; /** * If there are more than this many mismatches within the overhang regions, the whole overhang will get hard-clipped out. * It is still possible in some cases that the overhang could get clipped if the number of mismatches do not exceed this * value, e.g. if most of the overhang mismatches. */ @Argument(fullName="max-mismatches-in-overhang", doc="max number of mismatches allowed in the overhang", optional=true) int MAX_MISMATCHES_IN_OVERHANG = 1; /** * If there are more than this many bases in the overhang, we won't try to hard-clip them out */ @Argument(fullName="max-bases-in-overhang", doc="max number of bases allowed in the overhang", optional=true) int MAX_BASES_TO_CLIP = 40; @Argument(fullName="do-not-fix-overhangs", doc="do not have the walker soft-clip overhanging sections of the reads", optional=true) boolean doNotFixOverhangs = false; @Argument(fullName="process-secondary-alignments", doc="have the walker split secondary alignments (will still repair MC tag without it)", optional=true) boolean processSecondaryAlignments = false; @Override public boolean requiresReference() { return true; } private SAMFileGATKReadWriter outputWriter; private OverhangFixingManager overhangManager; private ReferenceSequenceFile referenceReader; SAMFileHeader header; @Override public List getDefaultReadFilters() { return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS); } @Override public ReadTransformer makePreReadFilterTransformer(){ if (REFACTOR_NDN_CIGAR_READS) { return new NDNCigarReadTransformer(); } else { return ReadTransformer.identity(); } } //FIXME: once the engine accepts read transformer arguments, remove these magic numbers? private static final int FROM_QUALITY = 255; private static final int TO_QUALITY = 60; @Override public ReadTransformer makePostReadFilterTransformer(){ if (SKIP_MQ_TRANSFORM) { return ReadTransformer.identity(); } else { return new MappingQualityReadTransformer(FROM_QUALITY, TO_QUALITY); } } @Override public void onTraversalStart() { header = getHeaderForSAMWriter(); referenceReader = new CachingIndexedFastaSequenceFile(referenceArguments.getReferencePath()); GenomeLocParser genomeLocParser = new GenomeLocParser(getBestAvailableSequenceDictionary()); outputWriter = createSAMWriter(OUTPUT, false); overhangManager = new OverhangFixingManager(header, outputWriter, genomeLocParser, referenceReader, MAX_RECORDS_IN_MEMORY, MAX_MISMATCHES_IN_OVERHANG, MAX_BASES_TO_CLIP, doNotFixOverhangs, processSecondaryAlignments); } @Override public void traverseReads() { forEachRead((GATKRead read, ReferenceContext reference, FeatureContext features) -> splitNCigarRead(read, overhangManager,true, header, processSecondaryAlignments) ); overhangManager.activateWriting(); forEachRead((GATKRead read, ReferenceContext reference, FeatureContext features) -> splitNCigarRead(read, overhangManager,true, header, processSecondaryAlignments) ); } @Override public void closeTool() { if (overhangManager != null) { overhangManager.flush(); } if (outputWriter != null ) { outputWriter.close(); } try {if (referenceReader != null) { referenceReader.close(); } } catch (IOException ex) { throw new UserException.MissingReference("Could not find reference file.", true); } } /** * Goes through the cigar string of the read and create new reads for each consecutive non-N elements (while soft clipping the rest of the read) that are supplemental to each other. * For example: for a read with cigar '1H2M2D1M2N1M2I1N1M2S' 3 new reads will be created with cigar strings: 1H2M2D1M6S, 1H3S1M2I2S and 1H6S1M2S * If the read has an MC tag it will be adjusted according to the clipping of that mate based on its cigar * * @param read the read to split (can be null) * @param emitReads a parameter used to mock behavior for repairing mate cigar string information */ public static GATKRead splitNCigarRead(final GATKRead read, OverhangFixingManager manager, boolean emitReads, SAMFileHeader header, boolean secondaryAlignments) { final int numCigarElements = read.numCigarElements(); List splitReads = new ArrayList<>(2); // Run the tool on dummy mate read to determine what the mate cigar will be upon completion, if manager has a prediction then dont repair if (emitReads && read.hasAttribute(MATE_CIGAR_TAG)) { final GATKRead mateSplitting = splitNCigarRead(ArtificialReadUtils.createArtificialRead(header, TextCigarCodec.decode(read.getAttributeAsString(MATE_CIGAR_TAG))), manager, false, header, secondaryAlignments); read.setAttribute(MATE_CIGAR_TAG, mateSplitting.getCigar().toString()); } manager.setPredictedMateInformation(read); // If it is a secondary alignment, repair its mate-information (assuming its mate was primary) and pass if ( !secondaryAlignments && read.isSecondaryAlignment()){ manager.addReadGroup(Collections.singletonList(read)); return read; } boolean sectionHasMatch = false; int firstCigarIndex = 0; for ( int i = 0; i < numCigarElements; i++ ) { final CigarElement cigarElement = read.getCigarElement(i); CigarOperator op = cigarElement.getOperator(); // One of the "Real" operators if (op == CigarOperator.M || op == CigarOperator.EQ || op == CigarOperator.X || op == CigarOperator.I || op == CigarOperator.D) { sectionHasMatch = true; } if (op == CigarOperator.N) { if (sectionHasMatch) { if (!emitReads) { // not passing the manager ensures that no splices get added to the manager for fake reads splitReads.add(splitReadBasedOnCigar(read, firstCigarIndex, i, null)); } else { splitReads.add(splitReadBasedOnCigar(read, firstCigarIndex, i, manager)); } } firstCigarIndex = i+1; sectionHasMatch = false; } } // if there are no N's in the read if (splitReads.size() < 1) { if (emitReads) { manager.addReadGroup(Collections.singletonList(read)); } return read; } //add the last section of the read: from the last N to the the end of the read // (it will be done for all the usual cigar string that does not end with N) else if ((firstCigarIndex < numCigarElements) && sectionHasMatch) { splitReads.add(splitReadBasedOnCigar(read, firstCigarIndex, numCigarElements, null)); } if (emitReads) { manager.addReadGroup(splitReads); return read; // If emitReads is false then we want the mate of the read } else { return splitReads.get(0); } } /** * Pull out an individual split position for a read * * @param read the read being split * @param cigarStartIndex the index of the first cigar element to keep * @param cigarEndIndex the index of the last cigar element to keep * @param forSplitPositions the manager for keeping track of split positions; can be null * @return a non-null read representing the section of the original read being split out */ private static GATKRead splitReadBasedOnCigar(final GATKRead read, final int cigarStartIndex, final int cigarEndIndex, final OverhangFixingManager forSplitPositions) { int cigarFirstIndex = cigarStartIndex; int cigarSecondIndex = cigarEndIndex; //in case a section of the read ends or starts with D (for example the first section in 1M1D1N1M is 1M1D), we should trim this cigar element // it can be 'if', but it was kept as 'while' to make sure the code can work with Cigar strings that were not "cleaned" while(read.getCigarElement(cigarFirstIndex).getOperator().equals(CigarOperator.D)) { cigarFirstIndex++; } while(read.getCigarElement(cigarSecondIndex-1).getOperator().equals(CigarOperator.D)) { cigarSecondIndex--; } if(cigarFirstIndex > cigarSecondIndex) { throw new IllegalArgumentException("Cannot split this read (might be an empty section between Ns, for example 1N1D1N): " + read.getCigar().toString()); } // we keep only the section of the read that is aligned to the reference between startRefIndex and stopRefIndex (inclusive). // the other sections of the read are clipped: final List elements = read.getCigarElements(); final int startRefIndex = read.getUnclippedStart() + CigarUtils.countRefBasesAndClips(elements, 0, cigarFirstIndex); //goes through the prefix of the cigar (up to cigarStartIndex) and move the reference index. final int stopRefIndex = startRefIndex + CigarUtils.countRefBasesAndClips(elements, cigarFirstIndex,cigarSecondIndex)-1; //goes through a consecutive non-N section of the cigar (up to cigarEndIndex) and move the reference index. if ( forSplitPositions != null ) { final String contig = read.getContig(); final int splitStart = startRefIndex + CigarUtils.countRefBasesAndClips(elements,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end. final int splitEnd = splitStart + read.getCigarElement(cigarEndIndex).getLength() - 1; forSplitPositions.addSplicePosition(contig, splitStart, splitEnd); } return ReadClipper.softClipToRegionIncludingClippedBases(read,startRefIndex,stopRefIndex); } /** * A method that repairs the NH and NM tags for a group of reads * * @param readFamily the a list of reads where the first item is the read to be marked as primary * @param header the file header to associate with the given reads * @return a non-null read representing the section of the original read being split out */ public static void repairSupplementaryTags(List readFamily, SAMFileHeader header) { for (GATKRead read : readFamily) { for (String attribute : TAGS_TO_REMOVE) { read.clearAttribute(attribute); } } if (readFamily.size() > 1) { GATKRead primary = readFamily.remove(0); SATagBuilder.setReadsAsSupplemental(primary,readFamily); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy