org.broadinstitute.hellbender.tools.spark.pathseq.PSBwaUtils Maven / Gradle / Ivy
package org.broadinstitute.hellbender.tools.spark.pathseq;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import java.io.File;
import java.util.*;
import java.util.stream.Collectors;
/**
* Utility functions for PathSeq Bwa tool
*/
public final class PSBwaUtils {
static void addReferenceSequencesToHeader(final SAMFileHeader header,
final String referenceDictionaryPath) {
final List refSeqs = getReferenceSequences(referenceDictionaryPath);
for (final SAMSequenceRecord rec : refSeqs) {
if (header.getSequence(rec.getSequenceName()) == null) {
header.addSequence(rec);
}
}
}
private static List getReferenceSequences(final String referenceDictionaryPath) {
final SAMSequenceDictionary referenceDictionary = ReferenceUtils.loadFastaDictionary(new File(referenceDictionaryPath));
return referenceDictionary.getSequences();
}
/**
* Returns header with sequences that were aligned to at least once in reads
*/
static SAMFileHeader removeUnmappedHeaderSequences(final SAMFileHeader header,
final JavaRDD reads,
final Logger logger) {
final List usedSequences = PSBwaUtils.getAlignedSequenceNames(reads);
final List usedSequenceRecords = usedSequences.stream()
.map(seqName -> header.getSequence(seqName))
.filter(seq -> {
if (seq != null) return true;
logger.warn("One or more reads are aligned to sequence " + seq + " but it is not in the header");
return false;
})
.collect(Collectors.toList());
header.setSequenceDictionary(new SAMSequenceDictionary(usedSequenceRecords));
return header;
}
/**
* Returns list of sequence names that were aligned to at least once in the reads
*/
static List getAlignedSequenceNames(final JavaRDD reads) {
return reads.flatMap(PSBwaUtils::getSequenceNames).distinct().collect();
}
/**
* Returns set of sequence names of the read
*/
private static Iterator getSequenceNames(final GATKRead read) {
if (read.isUnmapped() || read.getAssignedContig().equals("*")) return Collections.emptyIterator();
if (!read.hasAttribute("SA")) return Collections.singleton(read.getAssignedContig()).iterator();
final String[] saTokens = read.getAttributeAsString("SA").split(";");
final Set sequenceNames = new HashSet<>(SVUtils.hashMapCapacity(1 + saTokens.length));
sequenceNames.add(read.getAssignedContig());
for (final String token : saTokens) {
final String[] alignmentTokens = token.split(",", 1);
sequenceNames.add(alignmentTokens[0]);
}
return sequenceNames.iterator();
}
}