All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.spark.pathseq.PSBwaUtils Maven / Gradle / Ivy

There is a newer version: 4.6.0.0
Show newest version
package org.broadinstitute.hellbender.tools.spark.pathseq;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;

import java.io.File;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Utility functions for PathSeq Bwa tool
 */
public final class PSBwaUtils {

    static void addReferenceSequencesToHeader(final SAMFileHeader header,
                                              final String referenceDictionaryPath) {
        final List refSeqs = getReferenceSequences(referenceDictionaryPath);
        for (final SAMSequenceRecord rec : refSeqs) {
            if (header.getSequence(rec.getSequenceName()) == null) {
                header.addSequence(rec);
            }
        }
    }

    private static List getReferenceSequences(final String referenceDictionaryPath) {
        final SAMSequenceDictionary referenceDictionary = ReferenceUtils.loadFastaDictionary(new File(referenceDictionaryPath));
        return referenceDictionary.getSequences();
    }

    /**
     * Returns header with sequences that were aligned to at least once in reads
     */
    static SAMFileHeader removeUnmappedHeaderSequences(final SAMFileHeader header,
                                                       final JavaRDD reads,
                                                       final Logger logger) {
        final List usedSequences = PSBwaUtils.getAlignedSequenceNames(reads);
        final List usedSequenceRecords = usedSequences.stream()
                .map(seqName -> header.getSequence(seqName))
                .filter(seq -> {
                    if (seq != null) return true;
                    logger.warn("One or more reads are aligned to sequence " + seq + " but it is not in the header");
                    return false;
                })
                .collect(Collectors.toList());
        header.setSequenceDictionary(new SAMSequenceDictionary(usedSequenceRecords));
        return header;
    }

    /**
     * Returns list of sequence names that were aligned to at least once in the reads
     */
    static List getAlignedSequenceNames(final JavaRDD reads) {
        return reads.flatMap(PSBwaUtils::getSequenceNames).distinct().collect();
    }

    /**
     * Returns set of sequence names of the read
     */
    private static Iterator getSequenceNames(final GATKRead read) {
        if (read.isUnmapped() || read.getAssignedContig().equals("*")) return Collections.emptyIterator();
        if (!read.hasAttribute("SA")) return Collections.singleton(read.getAssignedContig()).iterator();
        final String[] saTokens = read.getAttributeAsString("SA").split(";");
        final Set sequenceNames = new HashSet<>(SVUtils.hashMapCapacity(1 + saTokens.length));
        sequenceNames.add(read.getAssignedContig());
        for (final String token : saTokens) {
            final String[] alignmentTokens = token.split(",", 1);
            sequenceNames.add(alignmentTokens[0]);
        }
        return sequenceNames.iterator();
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy