All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.sam.SamAlignmentMerger Maven / Gradle / Ivy

There is a newer version: 3.3.0
Show newest version
package picard.sam;

import htsjdk.samtools.*;
import htsjdk.samtools.SAMFileHeader.SortOrder;
import htsjdk.samtools.filter.OverclippedReadFilter;
import htsjdk.samtools.util.*;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import picard.PicardException;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
 * Class that takes in a set of alignment information in SAM format and merges it with the set
 * of all reads for which alignment was attempted, stored in an unmapped SAM file.  This
 * class overrides mergeAlignment in AbstractAlignmentNMerger and proceeds on the assumption that
 * the underlying alignment records are aleady in query-name sorted order (true for bwa).  If
 * they are not, the mergeAlignment method catches the IllegalStateException, forces a sort
 * of the underlying alignment records, and tries again.
 *
 * @author [email protected]
 */
public class SamAlignmentMerger extends AbstractAlignmentMerger {

    private final Log log = Log.getInstance(SamAlignmentMerger.class);
    private final List alignedSamFile;
    private final List read1AlignedSamFile;
    private final List read2AlignedSamFile;
    private final int maxGaps;
    private final int minUnclippedBases;
    private boolean forceSort = false;
    private final OverclippedReadFilter contaminationFilter;
    private final List requiredMatchingDictionaryTags;
    private SAMSequenceDictionary alignedSamDictionary;

    /**
     *  Constructor with a default value for unmappingReadStrategy
     *
     */
    public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, final File referenceFasta,
                              final SAMProgramRecord programRecord, final boolean clipAdapters, final boolean bisulfiteSequence,
                              final boolean alignedReadsOnly,
                              final List alignedSamFile, final int maxGaps, final List attributesToRetain,
                              final List attributesToRemove,
                              final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
                              final List read1AlignedSamFile, final List read2AlignedSamFile,
                              final List expectedOrientations,
                              final SortOrder sortOrder,
                              final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
                              final boolean addMateCigar,
                              final boolean unmapContaminantReads,
                              final int minUnclippedBases) {
        this(unmappedBamFile,
                targetBamFile,
                referenceFasta,
                programRecord,
                clipAdapters,
                bisulfiteSequence,
                alignedReadsOnly,
                alignedSamFile,
                maxGaps,
                attributesToRetain,
                attributesToRemove,
                read1BasesTrimmed,
                read2BasesTrimmed,
                read1AlignedSamFile,
                read2AlignedSamFile,
                expectedOrientations,
                sortOrder,
                primaryAlignmentSelectionStrategy,
                addMateCigar,
                unmapContaminantReads,
                minUnclippedBases,
                UnmappingReadStrategy.DO_NOT_CHANGE,
                SAMSequenceDictionary.DEFAULT_DICTIONARY_EQUAL_TAG
                );
    }

        /**
         * Constructor
         *
         * @param unmappedBamFile                   The BAM file that was used as the input to the aligner, which will
         *                                          include info on all the reads that did not map.  Required.
         * @param targetBamFile                     The file to which to write the merged SAM records. Required.
         * @param referenceFasta                    The reference sequence for the map files. Required.
         * @param programRecord                     Program record for taget file SAMRecords created.
         * @param clipAdapters                      Whether adapters marked in unmapped BAM file should be marked as
         *                                          soft clipped in the merged bam. Required.
         * @param bisulfiteSequence                 Whether the reads are bisulfite sequence (used when calculating the
         *                                          NM and UQ tags). Required.
         * @param alignedReadsOnly                  Whether to output only those reads that have alignment data
         * @param alignedSamFile                    The SAM file(s) with alignment information.  Optional.  If this is
         *                                          not provided, then read1AlignedSamFile and read2AlignedSamFile must be.
         * @param maxGaps                           The maximum number of insertions or deletions permitted in an
         *                                          alignment.  Alignments with more than this many gaps will be ignored.
         *                                          -1 means to allow any number of gaps.
         * @param attributesToRetain                attributes from the alignment record that should be
         *                                          retained when merging, overridden by attributesToRemove if they share
         *                                          common tags.
         * @param attributesToRemove                attributes from the alignment record that should be
         *                                          removed when merging.  This overrides attributesToRetain if they share
         *                                          common tags.
         * @param read1BasesTrimmed                 The number of bases trimmed from start of read 1 prior to alignment.  Optional.
         * @param read2BasesTrimmed                 The number of bases trimmed from start of read 2 prior to alignment.  Optional.
         * @param read1AlignedSamFile               The alignment records for read1.  Used when the two ends of a read are
         *                                          aligned separately.  This is optional, but must be specified if
         *                                          alignedSamFile is not.
         * @param read2AlignedSamFile               The alignment records for read1.  Used when the two ends of a read are
         *                                          aligned separately.  This is optional, but must be specified if
         *                                          alignedSamFile is not.
         * @param expectedOrientations              A List of SamPairUtil.PairOrientations that are expected for
         *                                          aligned pairs.  Used to determine the properPair flag.
         * @param sortOrder                         The order in which the merged records should be output.  If null,
         *                                          output will be coordinate-sorted
         * @param primaryAlignmentSelectionStrategy How to handle multiple alignments for a fragment or read pair,
         *                                          in which none are primary, or more than one is marked primary
         * @param addMateCigar                      True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include.
         * @param unmapContaminantReads             If true, identify reads having the signature of cross-species contamination (i.e. mostly clipped bases),
         *                                          and mark them as unmapped.
         * @param minUnclippedBases                 If unmapContaminantReads is set, require this many unclipped bases or else the read will be marked as contaminant.
         * @param unmappingReadStrategy             An enum describing how to deal with reads whose mapping information are being removed (currently this happens due to cross-species
         *                                          contamination). Ignored unless unmapContaminantReads is true.
         * @param requiredMatchingDictionaryTags            A list of SAMSequenceRecord tags that must be equal (if present) in the aligned bam and the reference dictionary.
         *                                          Program will issue a warning about other tags, if present in both files and are different.
         */
    public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, final File referenceFasta,
                              final SAMProgramRecord programRecord, final boolean clipAdapters, final boolean bisulfiteSequence,
                              final boolean alignedReadsOnly,
                              final List alignedSamFile, final int maxGaps, final List attributesToRetain,
                              final List attributesToRemove,
                              final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
                              final List read1AlignedSamFile, final List read2AlignedSamFile,
                              final List expectedOrientations,
                              final SortOrder sortOrder,
                              final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
                              final boolean addMateCigar,
                              final boolean unmapContaminantReads,
                              final int minUnclippedBases,
                              final UnmappingReadStrategy unmappingReadStrategy,
                              final List requiredMatchingDictionaryTags) {

        super(unmappedBamFile, targetBamFile, referenceFasta, clipAdapters, bisulfiteSequence,
                alignedReadsOnly, programRecord, attributesToRetain, attributesToRemove, read1BasesTrimmed,
                read2BasesTrimmed, expectedOrientations, sortOrder, primaryAlignmentSelectionStrategy, addMateCigar,
                unmapContaminantReads, unmappingReadStrategy);

        if ((alignedSamFile == null || alignedSamFile.isEmpty()) &&
                (read1AlignedSamFile == null || read1AlignedSamFile.isEmpty() ||
                        read2AlignedSamFile == null || read2AlignedSamFile.isEmpty())) {
            throw new IllegalArgumentException("Either alignedSamFile or BOTH of read1AlignedSamFile and " +
                    "read2AlignedSamFile must be specified.");
        }

        if (alignedSamFile != null) {
            alignedSamFile.forEach(IOUtil::assertFileIsReadable);
        } else {
            read1AlignedSamFile.forEach(IOUtil::assertFileIsReadable);
            read2AlignedSamFile.forEach(IOUtil::assertFileIsReadable);
        }

        this.alignedSamFile = alignedSamFile;
        this.read1AlignedSamFile = read1AlignedSamFile;
        this.read2AlignedSamFile = read2AlignedSamFile;
        this.maxGaps = maxGaps;
        this.minUnclippedBases = minUnclippedBases;
        this.contaminationFilter = new OverclippedReadFilter(minUnclippedBases, false);
        this.requiredMatchingDictionaryTags = requiredMatchingDictionaryTags;

        log.info("Processing SAM file(s): " + ((alignedSamFile != null) ? alignedSamFile : (read1AlignedSamFile + "," + read2AlignedSamFile)));
    }

    /**
     * Merges the alignment from the map file with the non-aligned records from the source BAM file.
     * Overrides mergeAlignment in AbstractAlignmentMerger.  Tries first to proceed on the assumption
     * that the alignment records are pre-sorted.  If not, catches the exception, forces a sort, and
     * tries again.
     */
    public void mergeAlignment(final File referenceFasta) {
        try {
            super.mergeAlignment(referenceFasta);
        } catch (final IllegalStateException ise) {
            log.warn("Exception merging bam alignment - attempting to sort aligned reads and try again: ", ise.getMessage());
            forceSort = true;
            resetRefSeqFileWalker();
            super.mergeAlignment(referenceFasta);
        }
    }

    @Override
    protected SAMSequenceDictionary getDictionaryForMergedBam() {
        SAMSequenceDictionary referenceDict = SAMSequenceDictionaryExtractor.extractDictionary(referenceFasta);
        if (referenceDict == null) {
            throw new PicardException("No sequence dictionary found for " + referenceFasta.getAbsolutePath() +
                    ".  Use Picard's CreateSequenceDictionary to create a sequence dictionary.");
        }
        return SAMSequenceDictionary.mergeDictionaries(alignedSamDictionary, referenceDict, requiredMatchingDictionaryTags);
    }

    /**
     * Reads the aligned SAM records into a SortingCollection and returns an iterator over that collection
     */
    protected CloseableIterator getQuerynameSortedAlignedRecords() {

        final CloseableIterator mergingIterator;
        final SAMFileHeader header;

        // When the alignment records, including both ends of a pair, are in SAM files
        if (alignedSamFile != null && !alignedSamFile.isEmpty()) {
            final List headers = new ArrayList<>(alignedSamFile.size());
            final List readers = new ArrayList<>(alignedSamFile.size());
            for (final File f : this.alignedSamFile) {
                final SamReader r = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(f);
                headers.add(r.getFileHeader());
                readers.add(r);

                // As we're going through and opening the aligned files, if we don't have a @PG yet
                // and there is only a single one in the input file, use that!
                if (getProgramRecord() == null && r.getFileHeader().getProgramRecords().size() == 1) {
                    setProgramRecord(r.getFileHeader().getProgramRecords().iterator().next());
                }
            }

            // assert that all the dictionaries are the same before merging the headers.
            alignedSamDictionary = headers.get(0).getSequenceDictionary();
            headers.stream()
                    .map(SAMFileHeader::getSequenceDictionary)
                    .forEach(alignedSamDictionary::assertSameDictionary);

            final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SortOrder.queryname, headers, false);

            mergingIterator = new MergingSamRecordIterator(headerMerger, readers, true);
            header = headerMerger.getMergedHeader();

        }
        // When the ends are aligned separately and don't have firstOfPair information correctly
        // set we use this branch.
        else {
            // this merging iterator already asserts that all the headers are the same
            mergingIterator = new SeparateEndAlignmentIterator(this.read1AlignedSamFile, this.read2AlignedSamFile, referenceFasta);
            header = ((SeparateEndAlignmentIterator) mergingIterator).getHeader();

            alignedSamDictionary = header.getSequenceDictionary();

            // As we're going through and opening the aligned files, if we don't have a @PG yet
            // and there is only a single one in the input file, use that!
            if (getProgramRecord() == null && header.getProgramRecords().size() == 1) {
                setProgramRecord(header.getProgramRecords().iterator().next());
            }
        }

        if (!forceSort) {
            return mergingIterator;
        }

        final SortingCollection alignmentSorter = SortingCollection.newInstance(SAMRecord.class,
                new BAMRecordCodec(header), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);

        int count = 0;
        while (mergingIterator.hasNext()) {
            alignmentSorter.add(mergingIterator.next());
            count++;
            if (count > 0 && count % 1000000 == 0) {
                log.info("Read " + count + " records from alignment SAM/BAM.");
            }
        }
        log.info("Finished reading " + count + " total records from alignment SAM/BAM.");

        mergingIterator.close();
        return new DelegatingIterator(alignmentSorter.iterator()) {
            @Override
            public void close() {
                super.close();
                alignmentSorter.cleanup();
            }
        };
    }

    private final class SuffixTrimingSamRecordIterator implements CloseableIterator {
        private final CloseableIterator underlyingIterator;
        private final String suffixToTrim;

        private SuffixTrimingSamRecordIterator(final CloseableIterator underlyingIterator, final String suffixToTrim) {
            this.underlyingIterator = underlyingIterator;
            this.suffixToTrim = suffixToTrim;
        }

        @Override
        public void close() {
            underlyingIterator.close();
        }

        @Override
        public boolean hasNext() {
            return underlyingIterator.hasNext();
        }

        @Override
        public SAMRecord next() {
            final SAMRecord rec = underlyingIterator.next();
            final String readName = rec.getReadName();
            if (readName.endsWith(suffixToTrim)) {
                rec.setReadName(readName.substring(0, readName.length() - suffixToTrim.length()));
            }
            return rec;
        }

        @Override
        public void remove() {
            underlyingIterator.remove();
        }
    }

    private class SeparateEndAlignmentIterator implements CloseableIterator {

        private final PeekableIterator read1Iterator;
        private final PeekableIterator read2Iterator;
        private final SAMFileHeader header;

        public SeparateEndAlignmentIterator(final List read1Alignments, final List read2Alignments, File referenceFasta) {
            final List headers = new ArrayList<>();
            final List read1 = new ArrayList<>(read1Alignments.size());
            final List read2 = new ArrayList<>(read2Alignments.size());
            for (final File f : read1Alignments) {
                final SamReader r = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(f);
                headers.add(r.getFileHeader());
                read1.add(r);
            }
            for (final File f : read2Alignments) {
                final SamReader r = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(f);
                headers.add(r.getFileHeader());
                read2.add(r);
            }

            final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, false);
            read1Iterator = new PeekableIterator<>(
                    new SuffixTrimingSamRecordIterator(new MergingSamRecordIterator(headerMerger, read1, true), "/1"));
            read2Iterator = new PeekableIterator<>(
                    new SuffixTrimingSamRecordIterator(new MergingSamRecordIterator(headerMerger, read2, true), "/2"));

            header = headerMerger.getMergedHeader();
        }

        public void close() {
            read1Iterator.close();
            read2Iterator.close();
        }

        public boolean hasNext() {
            return read1Iterator.hasNext() || read2Iterator.hasNext();
        }

        public SAMRecord next() {
            if (read1Iterator.hasNext()) {
                if (read2Iterator.hasNext()) {
                    return (read1Iterator.peek().getReadName().compareTo(read2Iterator.peek().getReadName()) <= 0)
                            ? setPairFlags(read1Iterator.next(), true)
                            : setPairFlags(read2Iterator.next(), false);
                } else {
                    return setPairFlags(read1Iterator.next(), true);
                }
            } else {
                return setPairFlags(read2Iterator.next(), false);
            }
        }

        public void remove() {
            throw new UnsupportedOperationException("remove() not supported");
        }

        public SAMFileHeader getHeader() { return this.header; }

        private SAMRecord setPairFlags(final SAMRecord sam, final boolean firstOfPair) {
            sam.setReadPairedFlag(true);
            sam.setFirstOfPairFlag(firstOfPair);
            sam.setSecondOfPairFlag(!firstOfPair);
            return sam;
        }
    }

    /**
     * For now, we only ignore those alignments that have more than {@link maxGaps} insertions
     * or deletions.
     */
    protected boolean ignoreAlignment(final SAMRecord sam) {
        if (maxGaps == -1) return false;
        int gaps = 0;
        for (final CigarElement el : sam.getCigar().getCigarElements()) {
            if (el.getOperator() == CigarOperator.I || el.getOperator() == CigarOperator.D) {
                gaps++;
            }
        }
        return gaps > maxGaps;
    }

    /**
     * Criteria for contaminant reads:
     * 1. primary alignment has fewer than minUnclippedBases unclipped bases
     * 2. primary alignment has both ends clipped
     */
    protected boolean isContaminant(final HitsForInsert hits) {
        boolean isContaminant = false;
        if (hits.numHits() > 0) {
            final int primaryIndex = hits.getIndexOfEarliestPrimary();
            if (primaryIndex < 0) throw new IllegalStateException("No primary alignment was found, despite having nonzero hits.");
            final SAMRecord primaryRead1 = hits.getFirstOfPair(primaryIndex);
            final SAMRecord primaryRead2 = hits.getSecondOfPair(primaryIndex);
            if (primaryRead1 != null && primaryRead2 != null) isContaminant = contaminationFilter.filterOut(primaryRead1, primaryRead2);
            else if (primaryRead1 != null) isContaminant = contaminationFilter.filterOut(primaryRead1);
            else if (primaryRead2 != null) isContaminant = contaminationFilter.filterOut(primaryRead2);
            else throw new IllegalStateException("Neither read1 or read2 exist for chosen primary alignment");
        }
        return isContaminant;
    }

    // Accessor for testing
    public boolean getForceSort() {return this.forceSort; }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy