All Downloads are FREE. Search and download functionalities are using the official Maven repository.

htsjdk.samtools.SamReader Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2016 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package htsjdk.samtools;

import htsjdk.samtools.util.CloseableIterator;

import java.io.Closeable;
import java.text.MessageFormat;

/**
 * Describes functionality for objects that produce {@link SAMRecord}s and associated information.
 *
 * Currently, only deprecated readers implement this directly; actual readers implement this
 * via {@link ReaderImplementation} and {@link PrimitiveSamReader}, which {@link SamReaderFactory}
 * converts into full readers by using {@link PrimitiveSamReaderToSamReaderAdapter}.
 *
 * @author mccowan
 */
public interface SamReader extends Iterable<SAMRecord>, Closeable {

    /** Describes a type of SAM file. */
    public abstract class Type {
        /** A string representation of this type. */
        public abstract String name();

        /** The recommended file extension for SAMs of this type, without a period. */
        public abstract String fileExtension();

        /** The recommended file extension for SAM indexes of this type, without a period, or null if this type is not associated with indexes. */
        public abstract String indexExtension();

        /** Simple value-holding implementation of {@link Type} backing the predefined constants below. */
        static class TypeImpl extends Type {
            final String name, fileExtension, indexExtension;

            TypeImpl(final String name, final String fileExtension, final String indexExtension) {
                this.name = name;
                this.fileExtension = fileExtension;
                this.indexExtension = indexExtension;
            }

            @Override
            public String name() {
                return name;
            }

            @Override
            public String fileExtension() {
                return fileExtension;
            }

            @Override
            public String indexExtension() {
                return indexExtension;
            }

            @Override
            public String toString() {
                return String.format("TypeImpl{name='%s', fileExtension='%s', indexExtension='%s'}", name, fileExtension, indexExtension);
            }
        }

        public static final Type SRA_TYPE = new TypeImpl("SRA", "sra", null);
        public static final Type CRAM_TYPE = new TypeImpl("CRAM", "cram", "crai");
        public static final Type BAM_TYPE = new TypeImpl("BAM", "bam", "bai");
        public static final Type SAM_TYPE = new TypeImpl("SAM", "sam", null);
        /** Same name and file extension as {@link #BAM_TYPE}, but paired with the "csi" index extension. */
        public static final Type BAM_CSI_TYPE = new TypeImpl("BAM", "bam", "csi");

        /**
         * Checks whether a file name carries this type's recommended extension.
         * The comparison is case-sensitive.
         *
         * @param fileName the file name to test; may be null.
         * @return true if {@code fileName} is non-null and ends with a period followed by {@link #fileExtension()}.
         */
        public boolean hasValidFileExtension(final String fileName) {
            return fileName != null && fileName.endsWith("." + fileExtension());
        }
    }

    /**
     * Facet for index-related operations.
     */
    public interface Indexing {
        /**
         * Retrieves the index for the given file type.  Ensure that the index is of the specified type.
         *
         * @return An index of the given type.
         */
        public BAMIndex getIndex();

        /**
         * Returns true if the supported index is browseable, meaning the bins in it can be traversed
         * and chunk data inspected and retrieved.
         *
         * @return True if the index supports the BrowseableBAMIndex interface.  False otherwise.
         */
        public boolean hasBrowseableIndex();

        /**
         * Gets an index tagged with the BrowseableBAMIndex interface.  Throws an exception if no such
         * index is available.
         *
         * @return An index with a browseable interface, if possible.
         * @throws SAMException if no such index is available.
         */
        public BrowseableBAMIndex getBrowseableIndex();

        /**
         * Iterate through the given chunks in the file.
         *
         * @param chunks List of chunks for which to retrieve data.
         * @return An iterator over the given chunks.
         */
        public SAMRecordIterator iterator(final SAMFileSpan chunks);

        /**
         * Gets a pointer spanning all reads in the BAM file.
         *
         * @return Unbounded pointer to the first record, in chunk format.
         */
        public SAMFileSpan getFilePointerSpanningReads();

    }

    /**
     * @return the {@link SAMFileHeader} associated with this {@link SamReader}
     */
    public SAMFileHeader getFileHeader();

    /**
     * @return the {@link htsjdk.samtools.SamReader.Type} of this {@link htsjdk.samtools.SamReader}
     */
    public Type type();

    /**
     * @return a human readable description of the resource backing this sam reader
     */
    public String getResourceDescription();

    /**
     * @return true if this is a BAM file, and has an index
     */
    public boolean hasIndex();

    /**
     * Exposes the {@link SamReader.Indexing} facet of this {@link SamReader}.
     *
     * @throws java.lang.UnsupportedOperationException If {@link #hasIndex()} returns false.
     */
    public Indexing indexing();

    /**
     * Iterate through file in order.  For a SamReader constructed from an InputStream, and for any SAM file,
     * a 2nd iteration starts where the 1st one left off.  For a BAM constructed from a SeekableStream or File, each new iteration
     * starts at the first record.
     * <p>
     * Only a single open iterator on a SAM or BAM file may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     */
    @Override
    public SAMRecordIterator iterator();

    /**
     * Iterate over records that match the given interval.  Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SamReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.  You can use a second SamReader to iterate
     * in parallel over the same underlying file.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param sequence  Reference sequence of interest.
     * @param start     1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
     * @param end       1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
     * @param contained If true, each SAMRecord returned will have its alignment completely contained in the
     *                  interval of interest.  If false, the alignment of the returned SAMRecords need only overlap the interval of interest.
     * @return Iterator over the SAMRecords matching the interval.
     */
    public SAMRecordIterator query(final String sequence, final int start, final int end, final boolean contained);

    /**
     * Iterate over records that overlap the given interval.  Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SamReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param sequence Reference sequence of interest.
     * @param start    1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
     * @param end      1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
     * @return Iterator over the SAMRecords overlapping the interval.
     */
    public SAMRecordIterator queryOverlapping(final String sequence, final int start, final int end);

    /**
     * Iterate over records that are contained in the given interval.  Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SamReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param sequence Reference sequence of interest.
     * @param start    1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
     * @param end      1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
     * @return Iterator over the SAMRecords contained in the interval.
     */
    public SAMRecordIterator queryContained(final String sequence, final int start, final int end);

    /**
     * Iterate over records that match one of the given intervals.  This may be more efficient than querying
     * each interval separately, because multiple reads of the same SAMRecords is avoided.
     * <p>
     * Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SamReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.  You can use a second SamReader to iterate
     * in parallel over the same underlying file.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match an interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param intervals Intervals to be queried.  The intervals must be optimized, i.e. in order, with overlapping
     *                  and abutting intervals merged.  This can be done with {@link htsjdk.samtools.QueryInterval#optimizeIntervals}
     * @param contained If true, each SAMRecord returned will have its alignment completely contained in one of the
     *                  intervals of interest.  If false, the alignment of the returned SAMRecords need only overlap one of
     *                  the intervals of interest.
     * @return Iterator over the SAMRecords matching the interval.
     */
    public SAMRecordIterator query(final QueryInterval[] intervals, final boolean contained);

    /**
     * Iterate over records that overlap any of the given intervals.  This may be more efficient than querying
     * each interval separately, because multiple reads of the same SAMRecords is avoided.
     * <p>
     * Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SamReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param intervals Intervals to be queried.  The intervals must be optimized, i.e. in order, with overlapping
     *                  and abutting intervals merged.  This can be done with {@link htsjdk.samtools.QueryInterval#optimizeIntervals}
     * @return Iterator over the SAMRecords overlapping any of the intervals.
     */
    public SAMRecordIterator queryOverlapping(final QueryInterval[] intervals);

    /**
     * Iterate over records that are contained in the given interval.  This may be more efficient than querying
     * each interval separately, because multiple reads of the same SAMRecords is avoided.
     * <p>
     * Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SamReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * is in the query region.
     *
     * @param intervals Intervals to be queried.  The intervals must be optimized, i.e. in order, with overlapping
     *                  and abutting intervals merged.  This can be done with {@link htsjdk.samtools.QueryInterval#optimizeIntervals}
     * @return Iterator over the SAMRecords contained in any of the intervals.
     */
    public SAMRecordIterator queryContained(final QueryInterval[] intervals);

    /**
     * Iterate over the records served by the reader's unmapped query.  {@link #queryMate(SAMRecord)} uses this
     * to look for a mate whose reference index is {@link SAMRecord#NO_ALIGNMENT_REFERENCE_INDEX}.
     *
     * @return Iterator over the unmapped SAMRecords.
     */
    public SAMRecordIterator queryUnmapped();

    /**
     * Iterate over records that map to the given sequence and start at the given position.  Only valid to call this if hasIndex() == true.
     * <p>
     * Only a single open iterator on a given SamReader may be extant at any one time.  If you want to start
     * a second iteration, the first one must be closed first.
     * <p>
     * Note that indexed lookup is not perfectly efficient in terms of disk I/O.  I.e. some SAMRecords may be read
     * and then discarded because they do not match the interval of interest.
     * <p>
     * Note that an unmapped read will be returned by this call if it has a coordinate for the purpose of sorting that
     * matches the arguments.
     *
     * @param sequence Reference sequence of interest.
     * @param start    Alignment start of interest.
     * @return Iterator over the SAMRecords with the given alignment start.
     */
    public SAMRecordIterator queryAlignmentStart(final String sequence, final int start);

    /**
     * Fetch the mate for the given read.  Only valid to call this if hasIndex() == true.
     * This will work whether the mate has a coordinate or not, so long as the given read has correct
     * mate information.  This method iterates over the SAM file, so there may not be an unclosed
     * iterator on the SAM file when this method is called.
     * <p>
     * Note that it is not possible to call queryMate when iterating over the SamReader, because queryMate
     * requires its own iteration, and there cannot be two simultaneous iterations on the same SamReader.  The
     * work-around is to open a second SamReader on the same input file, and call queryMate on the second
     * reader.
     *
     * @param rec Record for which mate is sought.  Must be a paired read.
     * @return rec's mate, or null if it cannot be found.
     */
    public SAMRecord queryMate(final SAMRecord rec);

    /**
     * The minimal subset of functionality needed for a {@link SAMRecord} data source.
     * {@link SamReader} itself is somewhat large and bulky, but the core functionality can be captured in
     * relatively few methods, which are included here. For documentation, see the corresponding methods
     * in {@link SamReader}.
     *
     * See also: {@link PrimitiveSamReaderToSamReaderAdapter}, {@link ReaderImplementation}
     *
     */
    public interface PrimitiveSamReader {
        Type type();

        boolean hasIndex();

        BAMIndex getIndex();

        SAMFileHeader getFileHeader();

        CloseableIterator<SAMRecord> getIterator();

        CloseableIterator<SAMRecord> getIterator(SAMFileSpan fileSpan);

        SAMFileSpan getFilePointerSpanningReads();

        CloseableIterator<SAMRecord> query(QueryInterval[] intervals, boolean contained);

        CloseableIterator<SAMRecord> queryAlignmentStart(String sequence, int start);

        CloseableIterator<SAMRecord> queryUnmapped();

        void close();

        ValidationStringency getValidationStringency();
    }

    /**
     * Decorator for a {@link SamReader.PrimitiveSamReader} that expands its functionality into a {@link SamReader},
     * given the backing {@link SamInputResource}.
     *
     * Wraps the {@link Indexing} interface as well, which was originally separate from {@link SamReader} but in practice
     * the two are always implemented by the same class.
     *
     */
    class PrimitiveSamReaderToSamReaderAdapter implements SamReader, Indexing {
        final PrimitiveSamReader p;
        final SamInputResource resource;

        public PrimitiveSamReaderToSamReaderAdapter(final PrimitiveSamReader p, final SamInputResource resource) {
            this.p = p;
            this.resource = resource;
        }

        /**
         * Access the underlying {@link PrimitiveSamReader} used by this adapter.
         * @return the {@link PrimitiveSamReader} used by this adapter.
         */
        public PrimitiveSamReader underlyingReader() {
            return p;
        }

        @Override
        public SAMRecordIterator queryOverlapping(final String sequence, final int start, final int end) {
            return query(sequence, start, end, false);
        }

        @Override
        public SAMRecordIterator queryOverlapping(final QueryInterval[] intervals) {
            return query(intervals, false);
        }

        @Override
        public SAMRecordIterator queryContained(final String sequence, final int start, final int end) {
            return query(sequence, start, end, true);
        }

        @Override
        public SAMRecordIterator queryContained(final QueryInterval[] intervals) {
            return query(intervals, true);
        }

        /**
         * Wraps the boilerplate code for querying a record's mate, which is common across many implementations.
         * <p>
         * Candidates come from {@link #queryUnmapped()} when the mate has no reference index, otherwise from
         * {@link #queryAlignmentStart(String, int)} at the mate's recorded position.  The whole candidate set is
         * scanned so that duplicate mates are detected, and the iterator is always closed before returning.
         *
         * @param rec Record for which mate is sought.  Must be a paired read.
         * @return the record with the same read name and the opposite end of the pair, or null if none was found.
         * @throws IllegalArgumentException if {@code rec} is unpaired, or is flagged as both or neither end of its pair.
         * @throws SAMFormatException if an unpaired read shares {@code rec}'s name, or more than one mate candidate matches.
         */
        @Override
        public SAMRecord queryMate(final SAMRecord rec) {
            if (!rec.getReadPairedFlag()) {
                throw new IllegalArgumentException("queryMate called for unpaired read.");
            }
            if (rec.getFirstOfPairFlag() == rec.getSecondOfPairFlag()) {
                throw new IllegalArgumentException("SAMRecord must be either first and second of pair, but not both.");
            }
            final boolean firstOfPair = rec.getFirstOfPairFlag();
            final CloseableIterator<SAMRecord> it;
            if (rec.getMateReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
                it = queryUnmapped();
            } else {
                it = queryAlignmentStart(rec.getMateReferenceName(), rec.getMateAlignmentStart());
            }
            try {
                SAMRecord mateRec = null;
                while (it.hasNext()) {
                    final SAMRecord next = it.next();
                    if (!next.getReadPairedFlag()) {
                        if (rec.getReadName().equals(next.getReadName())) {
                            throw new SAMFormatException("Paired and unpaired reads with same name: " + rec.getReadName());
                        }
                        continue;
                    }
                    // Skip candidates that are the same end of the pair as rec; the mate is the other end.
                    if (firstOfPair) {
                        if (next.getFirstOfPairFlag()) {
                            continue;
                        }
                    } else {
                        if (next.getSecondOfPairFlag()) {
                            continue;
                        }
                    }
                    if (rec.getReadName().equals(next.getReadName())) {
                        if (mateRec != null) {
                            throw new SAMFormatException("Multiple SAMRecord with read name " + rec.getReadName() +
                                    " for " + (firstOfPair ? "second" : "first") + " end.");
                        }
                        mateRec = next;
                    }
                }
                return mateRec;
            } finally {
                it.close();
            }
        }

        @Override
        public boolean hasBrowseableIndex() {
            return hasIndex() && getIndex() instanceof BrowseableBAMIndex;
        }

        @Override
        public BrowseableBAMIndex getBrowseableIndex() {
            final BAMIndex index = getIndex();
            if (!(index instanceof BrowseableBAMIndex)) {
                throw new SAMException("Cannot return index: index created by BAM is not browseable.");
            }
            return (BrowseableBAMIndex) index;
        }

        @Override
        public SAMRecordIterator iterator() {
            return AssertingIterator.of(p.getIterator());
        }

        @Override
        public SAMRecordIterator iterator(final SAMFileSpan chunks) {
            return AssertingIterator.of(p.getIterator(chunks));
        }

        @Override
        public void close() {
            p.close();
        }

        @Override
        public SAMFileSpan getFilePointerSpanningReads() {
            return p.getFilePointerSpanningReads();
        }

        @Override
        public SAMFileHeader getFileHeader() {
            return p.getFileHeader();
        }

        @Override
        public Type type() {
            return p.type();
        }

        @Override
        public String getResourceDescription() {
            return this.resource.toString();
        }

        @Override
        public boolean hasIndex() {
            return p.hasIndex();
        }

        @Override
        public Indexing indexing() {
            return this;
        }

        @Override
        public BAMIndex getIndex() {
            return p.getIndex();
        }

        @Override
        public SAMRecordIterator query(final QueryInterval[] intervals, final boolean contained) {
            return AssertingIterator.of(p.query(intervals, contained));
        }

        @Override
        public SAMRecordIterator query(final String sequence, final int start, final int end, final boolean contained) {
            // Translate the sequence name into its header index and delegate to the interval-based query.
            return query(new QueryInterval[]{new QueryInterval(getFileHeader().getSequenceIndex(sequence), start, end)}, contained);
        }

        @Override
        public SAMRecordIterator queryUnmapped() {
            return AssertingIterator.of(p.queryUnmapped());
        }

        @Override
        public SAMRecordIterator queryAlignmentStart(final String sequence, final int start) {
            return AssertingIterator.of(p.queryAlignmentStart(sequence, start));
        }
    }

    /**
     * Wraps a {@link CloseableIterator} of {@link SAMRecord}s as a {@link SAMRecordIterator}.  Once
     * {@link #assertSorted(SAMFileHeader.SortOrder)} has been called, every record returned by {@link #next()}
     * is checked against the requested sort order.
     */
    static class AssertingIterator implements SAMRecordIterator {

        /** Factory equivalent of {@link #AssertingIterator(CloseableIterator)}. */
        static AssertingIterator of(final CloseableIterator<SAMRecord> iterator) {
            return new AssertingIterator(iterator);
        }

        private final CloseableIterator<SAMRecord> wrappedIterator;
        /** Null until {@link #assertSorted(SAMFileHeader.SortOrder)} is called; no checking happens while null. */
        private SAMSortOrderChecker checker;

        public AssertingIterator(final CloseableIterator<SAMRecord> iterator) {
            wrappedIterator = iterator;
        }

        @Override
        public SAMRecordIterator assertSorted(final SAMFileHeader.SortOrder sortOrder) {
            checker = new SAMSortOrderChecker(sortOrder);
            return this;
        }

        /**
         * Returns the next record from the wrapped iterator.
         *
         * @throws IllegalStateException if sort checking is enabled and the checker reports the record as out of order.
         */
        @Override
        public SAMRecord next() {
            final SAMRecord result = wrappedIterator.next();
            if (checker != null) {
                // Capture the previous record before isSorted() is invoked, as in the original statement order.
                final SAMRecord previous = checker.getPreviousRecord();
                if (!checker.isSorted(result)) {
                    throw new IllegalStateException(String.format(
                            "Record %s should come after %s when sorting with %s ordering.",
                            previous.getSAMString().trim(),
                            result.getSAMString().trim(), checker.getSortOrder()));
                }
            }
            return result;
        }

        @Override
        public void close() {
            wrappedIterator.close();
        }

        @Override
        public boolean hasNext() {
            return wrappedIterator.hasNext();
        }

        @Override
        public void remove() {
            wrappedIterator.remove();
        }
    }

    /**
     * Internal interface for SAM/BAM/CRAM file reader implementations,
     * as distinct from non-file-based readers.
     *
     * Implemented as an abstract class to enforce better access control.
     *
     * TODO -- Many of these methods only apply for a subset of implementations,
     * TODO -- and either no-op or throw an exception for the others.
     * TODO -- We should consider refactoring things to avoid this;
     * TODO -- perhaps we can get away with not having this class at all.
     */
    abstract class ReaderImplementation implements PrimitiveSamReader {
        abstract void enableFileSource(final SamReader reader, final boolean enabled);

        abstract void enableIndexCaching(final boolean enabled);

        abstract void enableIndexMemoryMapping(final boolean enabled);

        abstract void enableCrcChecking(final boolean enabled);

        abstract void setSAMRecordFactory(final SAMRecordFactory factory);

        abstract void setValidationStringency(final ValidationStringency validationStringency);
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy