org.broadinstitute.hellbender.tools.StructuralVariantDiscoverer Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.BasicReference;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.AnnotatedVariantProducer;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvType;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments.AlignmentSignatureBasicType;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.ContigChimericAlignmentIterativeInterpreter;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.CpxVariantCanonicalRepresentation;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.CpxVariantInducingAssemblyContig;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.CpxVariantInterpreter;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SegmentedCpxVariantSimpleVariantExtractor;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SegmentedCpxVariantSimpleVariantExtractor.MultiSegmentsCpxVariantExtractor;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SegmentedCpxVariantSimpleVariantExtractor.RelevantAttributes;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SegmentedCpxVariantSimpleVariantExtractor.ZeroAndOneSegmentCpxVariantExtractor;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SimpleChimera;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SimpleNovelAdjacencyAndChimericAlignmentEvidence;
import org.broadinstitute.hellbender.tools.spark.sv.utils.CNVInputReader;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import scala.Tuple2;
import java.util.*;
@BetaFeature
@CommandLineProgramProperties(
oneLineSummary = "(Internal) Examines aligned contigs from local assemblies and calls structural variants or their breakpoints",
summary =
"This tool takes a file containing the alignments of assembled contigs and searches it for contigs with" +
" split alignments or large gaps indicating the presence of structural variation breakpoints." +
" Variations' types are determined by analyzing the signatures of the split alignments," +
" and are written to a VCF file.",
programGroup = StructuralVariantDiscoveryProgramGroup.class)
public class StructuralVariantDiscoverer extends ReadWalker {
@Argument(doc = "Name of output VCF.", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
fullName = "outputVCFName")
private String outputVCFName;
@Argument(doc = "file containing non-canonical chromosome names (e.g chrUn_KI270588v1) in the reference, " +
"human reference (hg19 or hg38) assumed when omitted", shortName = "alt-tigs",
fullName = "non-canonical-contig-names-file", optional = true)
private String nonCanonicalChromosomeNamesFile;
@ArgumentCollection
private final DiscoverVariantsFromContigAlignmentsArgumentCollection discoverStageArgs =
new DiscoverVariantsFromContigAlignmentsArgumentCollection();
private static final double SCORE_DIFF_TOLERANCE = 0.;
private String sampleId;
private SAMSequenceDictionary refDict;
private BasicReference reference;
private SVIntervalTree cnvCalls;
private Set canonicalChromosomes;
private String currentContigName = null;
private final List readsForCurrentContig = new ArrayList<>();
private final Map
simpleEvidenceForNovelAdjacencyMap = new HashMap<>(10000);
private final Map>
complexEventContigsMap =
new HashMap<>(1000);
private final List complexContigs = new ArrayList<>(1000);
@Override public boolean requiresReads() { return true; }
@Override public boolean requiresReference() { return true; }
@Override public List getDefaultReadFilters() {
return Arrays.asList(ReadFilterLibrary.MAPPED, ReadFilterLibrary.NOT_SECONDARY_ALIGNMENT);
}
@Override public void onTraversalStart() {
final SAMFileHeader header = getHeaderForReads();
if ( header.getSortOrder() != SAMFileHeader.SortOrder.queryname ) {
throw new UserException("This tool requires a queryname-sorted source of reads.");
}
sampleId = SVUtils.getSampleId(header);
refDict = header.getSequenceDictionary();
cnvCalls = discoverStageArgs.cnvCallsFile == null ? null :
CNVInputReader.loadCNVCalls(discoverStageArgs.cnvCallsFile, header);
canonicalChromosomes = SVUtils.getCanonicalChromosomes(nonCanonicalChromosomeNamesFile, refDict);
}
@Override public void apply( final GATKRead read,
final ReferenceContext referenceContext,
final FeatureContext featureContext ) {
reference = referenceContext;
final String contigName = read.getName();
if ( !contigName.equals(currentContigName) ) {
if ( !readsForCurrentContig.isEmpty() ) {
processContigAlignments(readsForCurrentContig);
readsForCurrentContig.clear();
}
currentContigName = contigName;
}
readsForCurrentContig.add(read);
}
@Override public Object onTraversalSuccess() {
final Object result = super.onTraversalSuccess();
if ( !readsForCurrentContig.isEmpty() ) {
processContigAlignments(readsForCurrentContig);
}
final List variants =
new ArrayList<>(2 * simpleEvidenceForNovelAdjacencyMap.size());
final Collection simpleEvidence =
simpleEvidenceForNovelAdjacencyMap.values();
for ( final SimpleNovelAdjacencyAndChimericAlignmentEvidence novelAdjacencyAndEvidence : simpleEvidence ) {
final List svTypes =
novelAdjacencyAndEvidence.getNovelAdjacencyReferenceLocations().toSimpleOrBNDTypes(reference);
variants.addAll(novelAdjacencyAndEvidence.toVariantContexts(svTypes, sampleId, refDict, cnvCalls));
}
final ZeroAndOneSegmentCpxVariantExtractor zeroAndOneSegmentCpxVariantExtractor =
new ZeroAndOneSegmentCpxVariantExtractor();
final List multiSegmentVariants = new ArrayList<>(complexEventContigsMap.size());
for ( final Map.Entry> entry :
complexEventContigsMap.entrySet() ) {
final VariantContext variantContext =
CpxVariantInterpreter.toVariantContext(entry.getKey(), entry.getValue(), reference);
final int refSegs =
SVUtils.getAttributeAsStringList(variantContext, GATKSVVCFConstants.CPX_SV_REF_SEGMENTS).size();
if ( refSegs < 2 ) {
variants.addAll(zeroAndOneSegmentCpxVariantExtractor.extract(variantContext, reference));
} else {
multiSegmentVariants.add(variantContext);
}
}
final Map contigNameToCpxVariantAttributes =
new HashMap<>(2 * multiSegmentVariants.size());
for ( final VariantContext variantContext : multiSegmentVariants ) {
final RelevantAttributes relevantAttributes = new RelevantAttributes(variantContext);
for ( final String contigName :
SVUtils.getAttributeAsStringList(variantContext, GATKSVVCFConstants.CONTIG_NAMES) ) {
contigNameToCpxVariantAttributes.put(contigName, relevantAttributes);
}
}
final Map> novelAdjacenciesToReinterpret = new HashMap<>();
for ( final AlignedContig alignedContig : complexContigs ) {
if ( contigNameToCpxVariantAttributes.containsKey(alignedContig.getContigName()) &&
alignedContig.getAlignments().size() > 1 ) {
final List chimeras =
ContigChimericAlignmentIterativeInterpreter.parseOneContig(
alignedContig,
refDict,
true,
DiscoverVariantsFromContigAlignmentsArgumentCollection.DEFAULT_MIN_ALIGNMENT_LENGTH,
DiscoverVariantsFromContigAlignmentsArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD,
true);
for ( final SimpleChimera simpleChimera : chimeras ) {
final NovelAdjacencyAndAltHaplotype novelAdjacency =
new NovelAdjacencyAndAltHaplotype(simpleChimera, alignedContig.getContigSequence(), refDict);
final List chimerasForNovelAdjacency =
novelAdjacenciesToReinterpret.get(novelAdjacency);
if ( chimerasForNovelAdjacency != null ) {
chimerasForNovelAdjacency.add(simpleChimera);
} else {
final List newList = new ArrayList<>(2);
newList.add(simpleChimera);
novelAdjacenciesToReinterpret.put(novelAdjacency, newList);
}
}
}
}
final List reinterpretedVariants = new ArrayList<>(novelAdjacenciesToReinterpret.size());
for ( final Map.Entry> entry :
novelAdjacenciesToReinterpret.entrySet() ) {
reinterpretedVariants.add(
new SimpleNovelAdjacencyAndChimericAlignmentEvidence(entry.getKey(), entry.getValue())
.produceAnnotatedVcFromAssemblyEvidence(
ContigChimericAlignmentIterativeInterpreter
.inferSimpleTypeFromNovelAdjacency(entry.getKey(), reference),
refDict, cnvCalls, sampleId).make());
}
final List extractedMultiSegmentVariants = new ArrayList<>(multiSegmentVariants.size());
final MultiSegmentsCpxVariantExtractor multiSegmentsCpxVariantExtractor =
new MultiSegmentsCpxVariantExtractor();
for ( final VariantContext variantContext : multiSegmentVariants ) {
extractedMultiSegmentVariants.addAll(multiSegmentsCpxVariantExtractor.extract(variantContext, reference));
}
final List consistentVariants =
SegmentedCpxVariantSimpleVariantExtractor
.filterForConsistency(reinterpretedVariants, contigNameToCpxVariantAttributes, reference);
variants.addAll(SegmentedCpxVariantSimpleVariantExtractor.removeDuplicates(extractedMultiSegmentVariants, consistentVariants));
final List filteredMergedVariantList =
AnnotatedVariantProducer.filterMergedVariantList(variants, discoverStageArgs);
SVVCFWriter.writeVCF(filteredMergedVariantList, outputVCFName, refDict, getDefaultToolVCFHeaderLines(), logger);
return result;
}
private void processContigAlignments( final List contigAlignments ) {
final List alignmentIntervals = new ArrayList<>(contigAlignments.size());
String contigName = null;
byte[] contigSequence = null;
for ( final GATKRead read : contigAlignments ) {
contigName = read.getName();
if ( !read.isSupplementaryAlignment() ) {
contigSequence = read.getBasesNoCopy();
if ( read.isReverseStrand() ) {
contigSequence = BaseUtils.simpleReverseComplement(contigSequence);
}
}
alignmentIntervals.add(new AlignmentInterval(read));
}
if ( contigSequence == null ) {
throw new UserException("No primary line for " + contigName);
}
final AlignedContig alignedContig = new AlignedContig(contigName, contigSequence, alignmentIntervals);
if ( !alignedContig.hasGoodMQ() ) return;
final List fineTunedAlignmentsList =
alignedContig.reconstructContigFromBestConfiguration(canonicalChromosomes, SCORE_DIFF_TOLERANCE);
for ( final AssemblyContigWithFineTunedAlignments fineTunedAlignment : fineTunedAlignmentsList ) {
if ( fineTunedAlignment.getAlignmentSignatureBasicType() == AlignmentSignatureBasicType.SIMPLE_CHIMERA ) {
if ( SimpleChimera.splitPairStrongEnoughEvidenceForCA(fineTunedAlignment.getHeadAlignment(),
fineTunedAlignment.getTailAlignment()) ) {
final SimpleChimera simpleChimera = fineTunedAlignment.extractSimpleChimera(refDict);
final NovelAdjacencyAndAltHaplotype novelAdjacency =
new NovelAdjacencyAndAltHaplotype(simpleChimera,
fineTunedAlignment.getContigSequence(),
refDict);
final SimpleNovelAdjacencyAndChimericAlignmentEvidence simpleEvidence =
simpleEvidenceForNovelAdjacencyMap.get(novelAdjacency);
if ( simpleEvidence != null ) {
simpleEvidence.getAlignmentEvidence().add(simpleChimera);
} else {
final SimpleNovelAdjacencyAndChimericAlignmentEvidence novelAdjacencyEvidence =
new SimpleNovelAdjacencyAndChimericAlignmentEvidence(novelAdjacency,
Collections.singletonList(simpleChimera));
simpleEvidenceForNovelAdjacencyMap.put(novelAdjacency, novelAdjacencyEvidence);
}
}
} else if ( fineTunedAlignment.getAlignmentSignatureBasicType() == AlignmentSignatureBasicType.COMPLEX ) {
complexContigs.add(fineTunedAlignment.getSourceContig());
final Tuple2 entry =
CpxVariantInterpreter.getOneVariantFromOneContig(fineTunedAlignment, refDict);
final List contigsList = complexEventContigsMap.get(entry._1);
if ( contigsList != null ) {
contigsList.add(entry._2);
} else {
final List newList = new ArrayList<>(2);
newList.add(entry._2);
complexEventContigsMap.put(entry._1, newList);
}
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy