package org.broadinstitute.hellbender.tools.walkers.mutect;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.tuple.Triple;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.annotator.AssemblyComplexity;
import org.broadinstitute.hellbender.tools.walkers.annotator.ReferenceBases;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.stream.Collectors;

public class PermutectDatasetEngine implements AutoCloseable {
    public static final int CAPACITY = 100000;

    private enum VariantType {
        SNV, INSERTION, DELETION
    }

    private enum Label {
        ARTIFACT, VARIANT, UNLABELED, IGNORE
    }

    private final SAMSequenceDictionary sequenceDictionary;
    private final Map<String, Integer> readGroupIndices = new HashMap<>();

    // number of additional variant features: assembly complexity (4), TLOD / tumor depth (1)
    private static final int NUM_EXTRA_FEATURES = 5;

    // threshold of negative log-10 population allele frequency above which a variant counts as an artifact
    // for the purposes of training data -- we want to be really sure we don't include germline variants
    // TODO: is this really necessary?
    private static final double RARE_POPAF_THRESHOLD = 5.9;

    // very cautious threshold of negative log-10 population allele frequency below which a variant counts as
    // germline for training data. There are so many germline variants that we can afford to be wasteful!
    private static final double COMMON_POPAF_THRESHOLD = 1.0;

    // below this tumor log odds we don't consider a call an artifact, just a sequencing error
    private static final double TLOD_THRESHOLD = 6.0;
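
    // Worked example (illustrative, not part of the engine): POPAF is a negative log-10 population allele
    // frequency, so these thresholds translate to raw frequencies as
    //   RARE_POPAF_THRESHOLD = 5.9   ->  AF < Math.pow(10, -5.9) ~ 1.3e-6  (rare enough to be a candidate artifact)
    //   COMMON_POPAF_THRESHOLD = 1.0 ->  AF > Math.pow(10, -1.0) = 0.1     (common enough to be treated as germline)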

    private final int maxRefCount;
    private final int maxAltCount;

    // TODO: is this necessary?
    private static final int MIN_REF = 5;

    private final PrintWriter printWriter;
    private final PrintWriter contigPrintWriter;
    private final PrintWriter readGroupPrintWriter;

    // number of non-artifact data to keep for each artifact datum
    private final int nonArtifactPerArtifact;

    // are we generating a dataset for training a model or for filtering calls with a pre-trained model?
    private final boolean trainingMode;

    private final Set<String> normalSamples;

    // simple method to balance data: for each k-alt-read artifact there are nonArtifactPerArtifact
    // (downsampled) k-alt-read non-artifacts -- see the sketch below
    private final EnumMap<VariantType, ArrayBlockingQueue<Integer>> unmatchedArtifactAltCounts;

    private final Random random = new Random(1);
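
    // A sketch of how this balancing plausibly works (the consuming logic lies beyond this truncated excerpt,
    // and the names label, type, and altCount below are illustrative): each artifact enqueues its alt count
    // nonArtifactPerArtifact times, and each candidate non-artifact is emitted only if it can dequeue a count,
    // to which its alt reads are downsampled:
    //   if (label == Label.ARTIFACT) {
    //       unmatchedArtifactAltCounts.get(type).addAll(Collections.nCopies(nonArtifactPerArtifact, altCount));
    //   } else if (label == Label.VARIANT) {
    //       final Integer downsampleTo = unmatchedArtifactAltCounts.get(type).poll();
    //       if (downsampleTo != null) { /* emit this datum with alt reads downsampled to downsampleTo */ }
    //   }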

    public PermutectDatasetEngine(final File datasetFile, final boolean trainingMode, final int maxRefCount,
                                  final int maxAltCount, final int nonArtifactPerArtifact, final Set<String> normalSamples,
                                  final SAMFileHeader header, final SAMSequenceDictionary sequenceDictionary) {
        try {
            printWriter = new PrintWriter(new FileWriter(Utils.nonNull(datasetFile)));
            final File contigTableFile = datasetFile.toPath().resolveSibling("contigs.table").toFile();
            final File readGroupTableFile = datasetFile.toPath().resolveSibling("read-groups.table").toFile();
            contigPrintWriter = new PrintWriter(new FileWriter(contigTableFile));
            readGroupPrintWriter = new PrintWriter(new FileWriter(readGroupTableFile));
        } catch (final IOException ex) {
            throw new UserException.BadInput("Could not create dataset file writer: " + ex.getMessage());
        }

        this.normalSamples = Utils.nonNull(normalSamples);
        this.trainingMode = trainingMode;
        this.nonArtifactPerArtifact = nonArtifactPerArtifact;
        this.maxRefCount = maxRefCount;
        this.maxAltCount = maxAltCount;
        this.sequenceDictionary = sequenceDictionary;

        final List<SAMReadGroupRecord> readGroups = header.getReadGroups();
        for (int n = 0; n < readGroups.size(); n++) {
            readGroupIndices.put(readGroups.get(n).getReadGroupId(), n);
        }

        unmatchedArtifactAltCounts = new EnumMap<>(VariantType.class);
        for (final VariantType type : VariantType.values()) {
            unmatchedArtifactAltCounts.put(type, new ArrayBlockingQueue<>(CAPACITY));
        }
    }
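
    // Usage sketch (hypothetical wiring and argument values; in practice Mutect2 constructs and feeds this
    // engine internally). Since the engine is AutoCloseable, try-with-resources flushes and closes the writers:
    //   try (final PermutectDatasetEngine engine = new PermutectDatasetEngine(
    //           new File("dataset.txt"), true, 10, 10, 20, normalSamples, header, header.getSequenceDictionary())) {
    //       // for each annotated call: engine.addData(ref, vc, truthVCs, likelihoods, ...);
    //   }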

    // add one datum per alt allele
    public void addData(final ReferenceContext ref, final VariantContext vc, final Optional<List<VariantContext>> truthVCs,
                        final AlleleLikelihoods<GATKRead, Allele> likelihoods,
                        final AlleleLikelihoods<Fragment, Haplotype> logFragmentLikelihoods,
                        final AlleleLikelihoods<Fragment, Allele> logFragmentAlleleLikelihoods,
                        final M2ArgumentCollection.PermutectDatasetMode permutectDatasetMode) {
        final String refBases = ReferenceBases.annotate(ref, vc);
        final String refAllele = vc.getReference().getBaseString();
        final int contigIndex = sequenceDictionary.getSequenceIndex(vc.getContig());
        final int position = vc.getStart();
        final Set<String> tumorSamples = likelihoods.samples().stream()
                .filter(sample -> !normalSamples.contains(sample))
                .collect(Collectors.toSet());
        final int numAlt = vc.getNAlleles() - 1;

        // the variant has already been annotated, so we have POPAF, AD, and in-PON status
        final double[] popafs = VariantContextGetters.getAttributeAsDoubleArray(vc, GATKVCFConstants.POPULATION_AF_KEY);
        final double[] tumorLods = Mutect2FilteringEngine.getTumorLogOdds(vc);

        // the PoN is NOT for calling! We only use it to assign some unlabeled data as artifacts in training
        final boolean isBlacklisted = vc.hasAttribute(GATKVCFConstants.IN_PON_KEY);

        // These ADs, which later make up the pre-downsampling depths, come from the genotype AD field applied
        // by Mutect2. This means that uninformative reads are not discarded; rather, the expected non-integral
        // ADs are rounded to the nearest integer. Also, these ADs are *fragment* ADs from the log
        // fragment-allele likelihoods in the SomaticGenotypingEngine.
        final int[] tumorADs = sumADsOverSamples(vc, tumorSamples);
        final int[] normalADs = sumADsOverSamples(vc, normalSamples);
        final int tumorDepth = (int) MathUtils.sum(tumorADs);
        final int normalDepth = (int) MathUtils.sum(normalADs);
        final boolean hasNormal = normalDepth > 0;
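
        // sumADsOverSamples is defined later in the full source, beyond this truncated excerpt. A minimal
        // sketch of what it plausibly does, assuming per-sample genotype AD arrays indexed by allele (this
        // would also explain the otherwise-unused Genotype and IndexRange imports above):
        //   private int[] sumADsOverSamples(final VariantContext vc, final Set<String> samples) {
        //       final int[] ADs = new int[vc.getNAlleles()];
        //       vc.getGenotypes(samples).stream().map(Genotype::getAD)
        //               .forEach(ad -> new IndexRange(0, ad.length).forEach(n -> ADs[n] += ad[n]));
        //       return ADs;
        //   }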

        final List<Allele> allRefAlleles = new ArrayList<>();
        allRefAlleles.add(vc.getReference());
        truthVCs.ifPresent(vcs -> vcs.forEach(var -> allRefAlleles.add(var.getReference())));
        final Allele longestRef = allRefAlleles.stream().max(Comparator.comparingInt(Allele::length)).get();

        // skip(1) excludes the reference allele
        final List<Allele> remappedAltAlleles = ReferenceConfidenceVariantContextMerger.remapAlleles(vc, longestRef).stream()
                .skip(1).toList();

        final Set<Allele> truthAlleles = !truthVCs.isPresent() ? Collections.emptySet() : truthVCs.get().stream()
                .filter(truthVC -> !truthVC.isFiltered())
                .flatMap(truthVC -> ReferenceConfidenceVariantContextMerger.remapAlleles(truthVC, longestRef).stream())
                .collect(Collectors.toSet());
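
        // Worked example of the remapping (illustrative): if vc has ref A and alt G while a truth VCF at the
        // same site has ref AT (a deletion record), then longestRef is AT and the SNV alt G is remapped to GT,
        // so alleles from the two sources can be compared against a common reference.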
final List