package org.broadinstitute.hellbender.tools.walkers.mutect;
import htsjdk.samtools.CigarOperator;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.hellbender.tools.walkers.annotator.BaseQuality;
import org.broadinstitute.hellbender.tools.walkers.annotator.ReadPosition;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
 * For each sample and each allele, a list of feature vectors of supporting reads.
 * In order to reduce the number of delimiter characters, we flatten the featurized reads. For example, suppose allele 1 has featurized reads
 * [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8]; then the annotation is
 * 1,2,3,4|5,6,7,8
 */
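// A minimal sketch (editorial; not part of this class) of how the flattened annotation described
// above could be assembled from the per-allele read vectors that getReadVectors returns;
// readVectorsByAllele is a hypothetical input:
//
//   final List<List<List<Integer>>> readVectorsByAllele = ...;
//   final String annotation = readVectorsByAllele.stream()
//           .map(allele -> allele.stream()
//                   .flatMap(List::stream)
//                   .map(String::valueOf)
//                   .collect(Collectors.joining(",")))
//           .collect(Collectors.joining("|"));
//   // e.g. reads [1,2],[3,4] for allele 1 and [5,6],[7,8] for allele 2 yield "1,2,3,4|5,6,7,8"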
public class FeaturizedReadSets {
private static final Logger logger = LogManager.getLogger(FeaturizedReadSets.class);
public static final int DEFAULT_BASE_QUALITY = 25;
private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);
private static final int FEATURES_PER_RANGE = 5;
    private static final List<Integer> RANGES = List.of(5, 10, 20);
public static final int NUM_RANGED_FEATURES = FEATURES_PER_RANGE * RANGES.size();
private static final int VERY_BAD_QUAL_THRESHOLD = 10;
private static final int BAD_QUAL_THRESHOLD = 20;
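    // Within each window around the variant (read-coordinate distance <= 5, 10, and 20), featurize()
    // below accumulates five counts against the read's best haplotype: inserted bases, deleted bases,
    // mismatched bases, very-low-quality bases (qual < 10), and low-quality bases (qual < 20).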
private FeaturizedReadSets() { }
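    // convenience overload that applies the same downsampling to every alt allele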
    public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
                                                           final Collection<String> samples,
                                                           final AlleleLikelihoods<GATKRead, Allele> likelihoods,
                                                           final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods,
                                                           final int refDownsample,
                                                           final int altDownsample,
                                                           final M2ArgumentCollection.PermutectDatasetMode permutectDatasetMode,
                                                           final Map<String, Integer> readGroupIndices) {
return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), permutectDatasetMode, readGroupIndices);
}
// returns Lists (in allele order) of lists of read vectors supporting each allele
    public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
                                                           final Collection<String> samples,
                                                           final AlleleLikelihoods<GATKRead, Allele> likelihoods,
                                                           final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods,
                                                           final int refDownsample,
                                                           final int altDownsample,
                                                           final Map<Allele, Integer> altDownsampleMap,
                                                           final M2ArgumentCollection.PermutectDatasetMode permutectDatasetMode,
                                                           final Map<String, Integer> readGroupIndices) {
        final Map<Allele, List<GATKRead>> readsByAllele = likelihoods.alleles().stream()
.collect(Collectors.toMap(a -> a, a -> new ArrayList<>()));
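        // assign each informative read to the allele it best supports; likelihood ties are broken arbitrarily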
samples.stream().flatMap(s -> likelihoods.bestAllelesBreakingTies(s).stream())
.filter(ba -> ba.isInformative())
.forEach(ba -> readsByAllele.get(ba.allele).add(ba.evidence));
// downsample if necessary
final Allele refAllele = likelihoods.alleles().stream().filter(Allele::isReference).findFirst().get();
for (final Allele allele : likelihoods.alleles()) {
final int downsample = allele.isReference() ? refDownsample : altDownsampleMap.getOrDefault(allele, altDownsample);
if (readsByAllele.get(allele).size() > downsample) {
Collections.shuffle(readsByAllele.get(allele));
readsByAllele.put(allele, readsByAllele.get(allele).subList(0, downsample));
}
}
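        // record each read's best haplotype; the haplotype likelihoods are per-fragment, so both
        // reads of a pair are assigned the haplotype that their fragment best supports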
        final Map<GATKRead, Haplotype> bestHaplotypes = new HashMap<>();
samples.stream().flatMap(s -> haplotypeLikelihoods.bestAllelesBreakingTies(s).stream())
.forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele)));
return vc.getAlleles().stream()
.map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, permutectDatasetMode, readGroupIndices)).collect(Collectors.toList()))
.collect(Collectors.toList());
}
    private static List<Integer> featurize(final GATKRead read, final VariantContext vc,
                                           final Map<GATKRead, Haplotype> bestHaplotypes,
                                           final M2ArgumentCollection.PermutectDatasetMode permutectDatasetMode,
                                           final Map<String, Integer> readGroupIndices) {
        final List<Integer> result = new ArrayList<>();
result.add(readGroupIndices.get(read.getReadGroup())); // this is read group metadata rather than part of the tensor
result.add(read.getMappingQuality());
// TODO: why not add BQ before and after as well -- especially useful for indels
result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY));
result.add(read.isFirstOfPair() ? 1 : 0);
result.add(read.isReverseStrand() ? 1 : 0);
// distances from ends of read
// this DOES account for the hard clips due to fitting the read inside the assembly window!!
final int readPositionOfVariantStart = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPositionOfVariantStart);
        // read.getLength(), however, does not account for hard clips, so we must add the lengths of any leading or trailing hard clips in the read's CIGAR
final int totalHardClips = read.getCigarElements().stream().filter(el -> el.getOperator() == CigarOperator.HARD_CLIP).mapToInt(el -> el.getLength()).sum();
final int actualReadLength = read.getLength() + totalHardClips;
result.add(actualReadLength - readPositionOfVariantStart);
result.add(Math.abs(read.getFragmentLength()));
if (permutectDatasetMode == M2ArgumentCollection.PermutectDatasetMode.ILLUMINA) {
// distances from ends of fragment
final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
result.add(vc.getStart() - fragmentStart);
result.add(fragmentEnd - vc.getEnd());
}
// Ultima-specific read tags
if (permutectDatasetMode == M2ArgumentCollection.PermutectDatasetMode.ULTIMA) {
result.add(read.getAttributeAsInteger("si")); // si is an integer on the order of 100s or 1000s
            result.add((int) (1000*read.getAttributeAsFloat("rq"))); // rq is a float between 0 and 1, so we scale by 1000 and truncate to an integer
}
// mismatches versus best haplotype
final Haplotype haplotype = bestHaplotypes.get(read);
// TODO: fix this
// I have no idea why this edge case occurs in Ultima data and maybe sometimes in Illumina
        if (haplotype == null) {
logger.warn(String.format("Best haplotypes don't contain read %s at position %s:%d.", read.getName(),
vc.getContig(), vc.getStart()));
//result.add(3);
//result.add(2);
for (int n = 0; n < NUM_RANGED_FEATURES; n++) {
result.add(0);
}
} else {
            final byte[] haplotypeBases = haplotype.getBases();
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotypeBases, read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
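            // after realignment the copy's CIGAR is relative to the best haplotype, so the mismatch,
            // indel, and quality features below are measured against that haplotype, not the reference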
//final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotypeBases, readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
//result.add(mismatchCount);
//final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
//result.add((int) indelsVsBestHaplotype);
final int readStartInHaplotype = readToHaplotypeAlignment.getAlignmentOffset();
final AlignmentStateMachine asm = new AlignmentStateMachine(copy);
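            // the state machine begins off the left edge of the alignment, so step once onto the first aligned base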
asm.stepForwardOnGenome();
            final List<int[]> rangedFeatures = RANGES.stream().map(range -> new int[FEATURES_PER_RANGE]).toList();
while (!asm.isRightEdge()) {
final PileupElement pe = asm.makePileupElement();
final int distanceFromVariant = Math.abs(asm.getReadOffset() - readPositionOfVariantStart);
                // pick which array's features we are accumulating. If the ranges are 5, 10, 20 and the distance is, say, 8, then the '<= 10' range is the relevant one
final OptionalInt relevantRange = IntStream.range(0, RANGES.size()).filter(n -> distanceFromVariant <= RANGES.get(n)).findFirst();
if (relevantRange.isPresent()) {
final int[] featuresToAddTo = rangedFeatures.get(relevantRange.getAsInt());
if (pe.isBeforeInsertion()) {
featuresToAddTo[0] += pe.getLengthOfImmediatelyFollowingIndel();
}
if (pe.isDeletion()) {
featuresToAddTo[1]++;
} else {
final byte base = pe.getBase();
final byte qual = pe.getQual();
final byte haplotypeBase = haplotypeBases[asm.getGenomeOffset() + readStartInHaplotype];
if (base != haplotypeBase) {
featuresToAddTo[2]++;
}
if (qual < VERY_BAD_QUAL_THRESHOLD) {
featuresToAddTo[3]++;
} else if (qual < BAD_QUAL_THRESHOLD) {
featuresToAddTo[4]++;
}
}
}
asm.stepForwardOnGenome();
}
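            // flatten the per-range count arrays, nearest range first, into the feature vector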
for (final int[] featuresToAdd : rangedFeatures) {
for (final int val : featuresToAdd) {
result.add(val);
}
}
}
// the +1 is for the read group index that comes before the features
Utils.validate(result.size() == permutectDatasetMode.getNumReadFeatures() + 1, "Wrong number of features");
return result;
}
}