package org.broadinstitute.hellbender.tools.walkers.mutect;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.tuple.Triple;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.annotator.AssemblyComplexity;
import org.broadinstitute.hellbender.tools.walkers.annotator.ReferenceBases;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.stream.Collectors;

public class PermutectDatasetEngine implements AutoCloseable {

    public static final int CAPACITY = 100000;
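    // (CAPACITY bounds each of the per-variant-type queues of unmatched artifact alt counts created in the constructor)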

    private enum VariantType {
        SNV, INSERTION, DELETION
    }

    private enum Label {
        ARTIFACT, VARIANT, UNLABELED, IGNORE
    }

    private final SAMSequenceDictionary sequenceDictionary;

    private final Map<String, Integer> readGroupIndices = new HashMap<>();

    // number of additional variant features for assembly complexity (4), TLOD / tumor depth (1)
    private static final int NUM_EXTRA_FEATURES = 5;

    // threshold of negative log-10 population allele frequency to consider something an artifact for the purposes of training data
    // we want to be really sure we don't get germline variants
    // TODO: is this really necessary?
    private static final double RARE_POPAF_THRESHOLD = 5.9;
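    // (a negative log-10 popaf of 5.9 corresponds to a population allele frequency below 10^-5.9 ≈ 1.3e-6)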

    // very cautious threshold of negative log-10 population allele frequency to consider something germline for training data.
    // There are so many germline variants we can be wasteful!
    private static final double COMMON_POPAF_THRESHOLD = 1;
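    // (a negative log-10 popaf below 1 corresponds to a population allele frequency above 10^-1 = 10%)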

    // below this tumor log odds we don't consider it an artifact, just a sequencing error
    private static final double TLOD_THRESHOLD = 6.0;

    private final int maxRefCount;
    private final int maxAltCount;

    // TODO: is this necessary?
    private static final int MIN_REF = 5;

    private final PrintWriter printWriter;
    private final PrintWriter contigPrintWriter;
    private final PrintWriter readGroupPrintWriter;

    // number of non-artifact data to keep for each artifact datum
    private final int nonArtifactPerArtifact;

    // are we generating a dataset for training a model or for filtering calls with a pre-trained model?
    private final boolean trainingMode;

    private final Set<String> normalSamples;

    // simple method to balance data: for each k-alt-read artifact there are
    // nonArtifactPerArtifact (downsampled) k-alt-read non-artifacts.
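    // (presumably each artifact datum enqueues its alt count and each candidate non-artifact polls a count
    // to which it is downsampled, so that the two classes end up with matching alt-count distributions)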
    private final EnumMap<VariantType, ArrayBlockingQueue<Integer>> unmatchedArtifactAltCounts;

    private final Random random = new Random(1);
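    // (the fixed seed presumably makes the random downsampling of non-artifact data deterministic across runs)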


    public PermutectDatasetEngine(final File datasetFile, final boolean trainingMode, final int maxRefCount,
                                  final int maxAltCount, final int nonArtifactPerArtifact, final Set<String> normalSamples,
                                  final SAMFileHeader header, final SAMSequenceDictionary sequenceDictionary) {
        try {
            printWriter = new PrintWriter(new FileWriter(Utils.nonNull(datasetFile)));
            final File contigTableFile = datasetFile.toPath().resolveSibling("contigs.table").toFile();
            final File readGroupTableFile = datasetFile.toPath().resolveSibling("read-groups.table").toFile();
            contigPrintWriter = new PrintWriter(new FileWriter(contigTableFile));
            readGroupPrintWriter = new PrintWriter(new FileWriter(readGroupTableFile));
        } catch (IOException ex) {
            throw new UserException.BadInput("Could not create dataset file writer: " + ex.getMessage());
        }

        this.normalSamples = Utils.nonNull(normalSamples);
        this.trainingMode = trainingMode;
        this.nonArtifactPerArtifact = nonArtifactPerArtifact;
        this.maxRefCount = maxRefCount;
        this.maxAltCount = maxAltCount;

        this.sequenceDictionary = sequenceDictionary;
        final List<SAMReadGroupRecord> readGroups = header.getReadGroups();
        for (int n = 0; n < readGroups.size(); n++) {
            readGroupIndices.put(readGroups.get(n).getReadGroupId(), n);
        }

        unmatchedArtifactAltCounts = new EnumMap<>(VariantType.class);
        for (final VariantType type : VariantType.values()) {
            unmatchedArtifactAltCounts.put(type, new ArrayBlockingQueue<>(CAPACITY));
        }
    }
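    // Illustrative construction (hypothetical values): training mode, at most 100 ref and 20 alt reads per
    // datum, and one matched non-artifact per artifact; contigs.table and read-groups.table are written as
    // siblings of the dataset file.
    //   final PermutectDatasetEngine engine = new PermutectDatasetEngine(new File("dataset.txt"),
    //           true, 100, 20, 1, normalSamples, header, header.getSequenceDictionary());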

    // add one datum per alt allele
    public void addData(final ReferenceContext ref, final VariantContext vc, final Optional<List<VariantContext>> truthVCs,
                        final AlleleLikelihoods<GATKRead, Allele> likelihoods,
                        final AlleleLikelihoods<Fragment, Haplotype> logFragmentLikelihoods,
                        final AlleleLikelihoods<Fragment, Allele> logFragmentAlleleLikelihoods,
                        final M2ArgumentCollection.PermutectDatasetMode permutectDatasetMode) {
        final String refBases = ReferenceBases.annotate(ref, vc);
        final String refAllele = vc.getReference().getBaseString();
        final int contigIndex = sequenceDictionary.getSequenceIndex(vc.getContig());
        final int position = vc.getStart();
        final Set<String> tumorSamples = likelihoods.samples().stream().filter(sample -> !normalSamples.contains(sample)).collect(Collectors.toSet());
        final int numAlt = vc.getNAlleles() - 1;


        // the variant has already been annotated, so we have POPAF, AD, and in-PON status
        final double[] popafs = VariantContextGetters.getAttributeAsDoubleArray(vc, GATKVCFConstants.POPULATION_AF_KEY);
        final double[] tumorLods = Mutect2FilteringEngine.getTumorLogOdds(vc);
        // the PoN is NOT for calling! We only use it to assign some unlabeled data as artifacts in training
        final boolean isBlacklisted = vc.hasAttribute(GATKVCFConstants.IN_PON_KEY);

        // These ADs, which later make up the pre-downsampling depths, come from the genotype AD field applied by Mutect2.
        // This means that uninformative reads are not discarded; rather the expected non-integral ADs are rounded to
        // the nearest integer.  Also, these ADs are *fragment* ADs from the log fragment-allele likelihoods in the
        // SomaticGenotypingEngine.
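        // (for example, an expected fragment AD of 7.6 is rounded and recorded as 8)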
        final int[] tumorADs = sumADsOverSamples(vc, tumorSamples);
        final int[] normalADs = sumADsOverSamples(vc, normalSamples);
        final int tumorDepth = (int) MathUtils.sum(tumorADs);
        final int normalDepth = (int) MathUtils.sum(normalADs);
        final boolean hasNormal = normalDepth > 0;

        final List<Allele> allRefAlleles = new ArrayList<>();
        allRefAlleles.add(vc.getReference());
        truthVCs.ifPresent(vcs -> vcs.forEach(var -> allRefAlleles.add(var.getReference())));
        final Allele longestRef = allRefAlleles.stream().sorted(Comparator.comparingInt(Allele::length).reversed()).findFirst().get();

        // skip(1) excludes the reference allele
        final List<Allele> remappedAltAlleles = ReferenceConfidenceVariantContextMerger.remapAlleles(vc, longestRef).stream()
                .skip(1).toList();
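        // (remapping pads alleles out to the longest reference allele: e.g. if ref A is extended to ACT,
        // alt G becomes GCT)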

        final Set<Allele> truthAlleles = !truthVCs.isPresent() ? Collections.emptySet() : truthVCs.get().stream()
                .filter(truthVC -> ! truthVC.isFiltered())
                .flatMap(truthVC -> ReferenceConfidenceVariantContextMerger.remapAlleles(truthVC, longestRef).stream())
                .collect(Collectors.toSet());

        final List



