org.broadinstitute.hellbender.tools.walkers.rnaseq.SplitNCigarReads Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.walkers.rnaseq;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.argparser.WorkflowOutput;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.MultiplePassReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.transformers.MappingQualityReadTransformer;
import org.broadinstitute.hellbender.transformers.NDNCigarReadTransformer;
import org.broadinstitute.hellbender.transformers.ReadTransformer;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.SATagBuilder;
import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
import org.broadinstitute.hellbender.utils.read.CigarUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions.OUTPUT_LONG_NAME;
import static org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions.OUTPUT_SHORT_NAME;
/**
*
* Splits reads that contain Ns in their cigar string (e.g. spanning splicing events in RNAseq data).
*
* Identifies all N cigar elements and creates k+1 new reads (where k is the number of N cigar elements).
* The first read includes the bases that are to the left of the first N element, while the part of the read that is to the right of the N
* (including the Ns) is hard clipped and so on for the rest of the new reads. Used for post-processing RNA reads aligned against the full reference.
*
* Input
*
* BAM file
*
*
*
* Output
*
* BAM file with reads split at N CIGAR elements and CIGAR strings updated.
*
*
* Usage example
*
* gatk SplitNCigarReads \
* -R Homo_sapiens_assembly38.fasta \
* -I input.bam \
* -O output.bam
*
*/
@CommandLineProgramProperties(
summary = "Splits reads that contain Ns in their cigar string (e.g. spanning splicing events).",
oneLineSummary = "Split Reads with N in Cigar",
programGroup = ReadDataManipulationProgramGroup.class
)
@DocumentedFeature
@WorkflowProperties
public final class SplitNCigarReads extends MultiplePassReadWalker {
// A list of tags that break upon splitting on N. These will be removed from reads in the output.
// NOTE: for future developers who want to use these tags. For each tag you remove from this list corresponding
// support for properly recalculating the tags must be added to repairSupplementaryTags()
static final String[] TAGS_TO_REMOVE = {"NM","MD","NH"};
static final String MATE_CIGAR_TAG = "MC";
@Argument(fullName = OUTPUT_LONG_NAME, shortName = OUTPUT_SHORT_NAME, doc="Write output to this BAM filename")
@WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION})
GATKPath OUTPUT;
/**
* This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in
* a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptomes.
* You should only use this if you know that your reads have that problem.
*/
@Argument(fullName = "refactor-cigar-string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", optional = true)
boolean REFACTOR_NDN_CIGAR_READS = false;
/**
* This flag turns off the mapping quality 255 -> 60 read transformer. The transformer is on by default to ensure that
* uniquely mapping reads assigned STAR's default 255 MQ aren't filtered out by HaplotypeCaller.
*/
@Argument(fullName = "skip-mapping-quality-transform", shortName = "skip-mq-transform", doc = "skip the 255 -> 60 MQ read transform", optional = true)
boolean SKIP_MQ_TRANSFORM = false;
/**
* For expert users only! To minimize memory consumption you can lower this number, but then the tool may skip
* overhang fixing in regions with too much coverage. Just make sure to give Java enough memory! 4Gb should be
* enough with the default value.
*/
@Advanced
@Argument(fullName="max-reads-in-memory", doc="max reads allowed to be kept in memory at a time by the BAM writer", optional=true)
int MAX_RECORDS_IN_MEMORY = 150000;
/**
* If there are more than this many mismatches within the overhang regions, the whole overhang will get hard-clipped out.
* It is still possible in some cases that the overhang could get clipped if the number of mismatches do not exceed this
* value, e.g. if most of the overhang mismatches.
*/
@Argument(fullName="max-mismatches-in-overhang", doc="max number of mismatches allowed in the overhang", optional=true)
int MAX_MISMATCHES_IN_OVERHANG = 1;
/**
* If there are more than this many bases in the overhang, we won't try to hard-clip them out
*/
@Argument(fullName="max-bases-in-overhang", doc="max number of bases allowed in the overhang", optional=true)
int MAX_BASES_TO_CLIP = 40;
@Argument(fullName="do-not-fix-overhangs", doc="do not have the walker soft-clip overhanging sections of the reads", optional=true)
boolean doNotFixOverhangs = false;
@Argument(fullName="process-secondary-alignments", doc="have the walker split secondary alignments (will still repair MC tag without it)", optional=true)
boolean processSecondaryAlignments = false;
@Override
public boolean requiresReference() {
return true;
}
private SAMFileGATKReadWriter outputWriter;
private OverhangFixingManager overhangManager;
private ReferenceSequenceFile referenceReader;
SAMFileHeader header;
@Override
public List getDefaultReadFilters() {
return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS);
}
@Override
public ReadTransformer makePreReadFilterTransformer(){
if (REFACTOR_NDN_CIGAR_READS) {
return new NDNCigarReadTransformer();
}
else {
return ReadTransformer.identity();
}
}
//FIXME: once the engine accepts read transformer arguments, remove these magic numbers?
private static final int FROM_QUALITY = 255;
private static final int TO_QUALITY = 60;
@Override
public ReadTransformer makePostReadFilterTransformer(){
if (SKIP_MQ_TRANSFORM) {
return ReadTransformer.identity();
}
else {
return new MappingQualityReadTransformer(FROM_QUALITY, TO_QUALITY);
}
}
@Override
public void onTraversalStart() {
header = getHeaderForSAMWriter();
referenceReader = new CachingIndexedFastaSequenceFile(referenceArguments.getReferencePath());
GenomeLocParser genomeLocParser = new GenomeLocParser(getBestAvailableSequenceDictionary());
outputWriter = createSAMWriter(OUTPUT, false);
overhangManager = new OverhangFixingManager(header, outputWriter, genomeLocParser, referenceReader, MAX_RECORDS_IN_MEMORY, MAX_MISMATCHES_IN_OVERHANG, MAX_BASES_TO_CLIP, doNotFixOverhangs, processSecondaryAlignments);
}
@Override
public void traverseReads() {
forEachRead((GATKRead read, ReferenceContext reference, FeatureContext features) ->
splitNCigarRead(read, overhangManager,true, header, processSecondaryAlignments)
);
overhangManager.activateWriting();
forEachRead((GATKRead read, ReferenceContext reference, FeatureContext features) ->
splitNCigarRead(read, overhangManager,true, header, processSecondaryAlignments)
);
}
@Override
public void closeTool() {
if (overhangManager != null) { overhangManager.flush(); }
if (outputWriter != null ) { outputWriter.close(); }
try {if (referenceReader != null) { referenceReader.close(); } }
catch (IOException ex) {
throw new UserException.MissingReference("Could not find reference file.", true);
}
}
/**
* Goes through the cigar string of the read and create new reads for each consecutive non-N elements (while soft clipping the rest of the read) that are supplemental to each other.
* For example: for a read with cigar '1H2M2D1M2N1M2I1N1M2S' 3 new reads will be created with cigar strings: 1H2M2D1M6S, 1H3S1M2I2S and 1H6S1M2S
* If the read has an MC tag it will be adjusted according to the clipping of that mate based on its cigar
*
* @param read the read to split (can be null)
* @param emitReads a parameter used to mock behavior for repairing mate cigar string information
*/
public static GATKRead splitNCigarRead(final GATKRead read, OverhangFixingManager manager, boolean emitReads, SAMFileHeader header, boolean secondaryAlignments) {
final int numCigarElements = read.numCigarElements();
List splitReads = new ArrayList<>(2);
// Run the tool on dummy mate read to determine what the mate cigar will be upon completion, if manager has a prediction then dont repair
if (emitReads && read.hasAttribute(MATE_CIGAR_TAG)) {
final GATKRead mateSplitting = splitNCigarRead(ArtificialReadUtils.createArtificialRead(header, TextCigarCodec.decode(read.getAttributeAsString(MATE_CIGAR_TAG))), manager, false, header, secondaryAlignments);
read.setAttribute(MATE_CIGAR_TAG, mateSplitting.getCigar().toString());
}
manager.setPredictedMateInformation(read);
// If it is a secondary alignment, repair its mate-information (assuming its mate was primary) and pass
if ( !secondaryAlignments && read.isSecondaryAlignment()){
manager.addReadGroup(Collections.singletonList(read));
return read;
}
boolean sectionHasMatch = false;
int firstCigarIndex = 0;
for ( int i = 0; i < numCigarElements; i++ ) {
final CigarElement cigarElement = read.getCigarElement(i);
CigarOperator op = cigarElement.getOperator();
// One of the "Real" operators
if (op == CigarOperator.M || op == CigarOperator.EQ || op == CigarOperator.X ||
op == CigarOperator.I || op == CigarOperator.D) {
sectionHasMatch = true;
}
if (op == CigarOperator.N) {
if (sectionHasMatch) {
if (!emitReads) {
// not passing the manager ensures that no splices get added to the manager for fake reads
splitReads.add(splitReadBasedOnCigar(read, firstCigarIndex, i, null));
} else {
splitReads.add(splitReadBasedOnCigar(read, firstCigarIndex, i, manager));
}
}
firstCigarIndex = i+1;
sectionHasMatch = false;
}
}
// if there are no N's in the read
if (splitReads.size() < 1) {
if (emitReads) {
manager.addReadGroup(Collections.singletonList(read));
}
return read;
}
//add the last section of the read: from the last N to the the end of the read
// (it will be done for all the usual cigar string that does not end with N)
else if ((firstCigarIndex < numCigarElements) && sectionHasMatch) {
splitReads.add(splitReadBasedOnCigar(read, firstCigarIndex, numCigarElements, null));
}
if (emitReads) {
manager.addReadGroup(splitReads);
return read;
// If emitReads is false then we want the mate of the read
} else {
return splitReads.get(0);
}
}
/**
* Pull out an individual split position for a read
*
* @param read the read being split
* @param cigarStartIndex the index of the first cigar element to keep
* @param cigarEndIndex the index of the last cigar element to keep
* @param forSplitPositions the manager for keeping track of split positions; can be null
* @return a non-null read representing the section of the original read being split out
*/
private static GATKRead splitReadBasedOnCigar(final GATKRead read, final int cigarStartIndex, final int cigarEndIndex, final OverhangFixingManager forSplitPositions) {
int cigarFirstIndex = cigarStartIndex;
int cigarSecondIndex = cigarEndIndex;
//in case a section of the read ends or starts with D (for example the first section in 1M1D1N1M is 1M1D), we should trim this cigar element
// it can be 'if', but it was kept as 'while' to make sure the code can work with Cigar strings that were not "cleaned"
while(read.getCigarElement(cigarFirstIndex).getOperator().equals(CigarOperator.D)) {
cigarFirstIndex++;
}
while(read.getCigarElement(cigarSecondIndex-1).getOperator().equals(CigarOperator.D)) {
cigarSecondIndex--;
}
if(cigarFirstIndex > cigarSecondIndex) {
throw new IllegalArgumentException("Cannot split this read (might be an empty section between Ns, for example 1N1D1N): " + read.getCigar().toString());
}
// we keep only the section of the read that is aligned to the reference between startRefIndex and stopRefIndex (inclusive).
// the other sections of the read are clipped:
final List elements = read.getCigarElements();
final int startRefIndex = read.getUnclippedStart() + CigarUtils.countRefBasesAndClips(elements, 0, cigarFirstIndex); //goes through the prefix of the cigar (up to cigarStartIndex) and move the reference index.
final int stopRefIndex = startRefIndex + CigarUtils.countRefBasesAndClips(elements, cigarFirstIndex,cigarSecondIndex)-1; //goes through a consecutive non-N section of the cigar (up to cigarEndIndex) and move the reference index.
if ( forSplitPositions != null ) {
final String contig = read.getContig();
final int splitStart = startRefIndex + CigarUtils.countRefBasesAndClips(elements,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end.
final int splitEnd = splitStart + read.getCigarElement(cigarEndIndex).getLength() - 1;
forSplitPositions.addSplicePosition(contig, splitStart, splitEnd);
}
return ReadClipper.softClipToRegionIncludingClippedBases(read,startRefIndex,stopRefIndex);
}
/**
* A method that repairs the NH and NM tags for a group of reads
*
* @param readFamily the a list of reads where the first item is the read to be marked as primary
* @param header the file header to associate with the given reads
* @return a non-null read representing the section of the original read being split out
*/
public static void repairSupplementaryTags(List readFamily, SAMFileHeader header) {
for (GATKRead read : readFamily) {
for (String attribute : TAGS_TO_REMOVE) {
read.clearAttribute(attribute);
}
}
if (readFamily.size() > 1) {
GATKRead primary = readFamily.remove(0);
SATagBuilder.setReadsAsSupplemental(primary,readFamily);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy