
htsjdk.samtools.cram.structure.Slice

/**
 * ****************************************************************************
 * Copyright 2013 EMBL-EBI
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ****************************************************************************
 */
package htsjdk.samtools.cram.structure;

import htsjdk.samtools.*;
import htsjdk.samtools.cram.BAIEntry;
import htsjdk.samtools.cram.CRAIEntry;
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.build.CRAMReferenceRegion;
import htsjdk.samtools.cram.common.CRAMVersion;
import htsjdk.samtools.cram.common.CramVersions;
import htsjdk.samtools.cram.digest.ContentDigests;
import htsjdk.samtools.cram.encoding.reader.CramRecordReader;
import htsjdk.samtools.cram.encoding.writer.CramRecordWriter;
import htsjdk.samtools.cram.io.CramIntArray;
import htsjdk.samtools.cram.io.ITF8;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.cram.io.LTF8;
import htsjdk.samtools.cram.ref.ReferenceContext;
import htsjdk.samtools.cram.ref.ReferenceContextType;
import htsjdk.samtools.cram.structure.block.Block;
import htsjdk.samtools.cram.structure.block.BlockContentType;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.utils.ValidationUtils;

import java.io.*;
import java.math.BigInteger;
import java.util.*;
import java.util.stream.Collectors;

/**
 * A CRAM slice is a logical construct that is just a subset of the blocks in a Container.
 *
 * NOTE: Every Slice has a reference context (it is either single-reference (mapped), multi-reference, or
 * unmapped), depending on the records it contains. Single-ref mapped doesn't mean that the records are
 * necessarily mapped (that is, that their getMappedRead flag is true), only that the records in that slice
 * are PLACED on the corresponding reference contig.
 */
public class Slice {
    private static final Log log = Log.getInstance(Slice.class);
    private static final int MD5_BYTE_SIZE = 16;

    // for indexing purposes
    public static final int UNINITIALIZED_INDEXING_PARAMETER = -1;
    // the spec defines a special sentinel to indicate the absence of an embedded reference block
    public static final int EMBEDDED_REFERENCE_ABSENT_CONTENT_ID = -1;

    ////////////////////////////////
    // Slice header components as defined in the spec
    private final AlignmentContext alignmentContext; // ref sequence, alignment start and span
    private final int nRecords;
    private final long globalRecordCounter;
    private final int nSliceBlocks; // includes the core block and external blocks, but not the header block
    private List<Integer> contentIDs;
    private int embeddedReferenceBlockContentID = EMBEDDED_REFERENCE_ABSENT_CONTENT_ID;
    private byte[] referenceMD5 = new byte[MD5_BYTE_SIZE];
    private SAMBinaryTagAndValue sliceTags;
    // End Slice header components
    ////////////////////////////////

    private final CompressionHeader compressionHeader;
    private final SliceBlocks sliceBlocks;
    private final long byteOffsetOfContainer;

    private Block sliceHeaderBlock;

    // Modeling the embedded reference block here is somewhat redundant, since it can be retrieved from the
    // external blocks that are managed by the {@link SliceBlocks} object by using the externalBlockContentId,
    // but we retain it here to use for validation purposes.
    private Block embeddedReferenceBlock;
    private long baseCount;

    // Values used for indexing. We don't use an AlignmentSpan object to model these values because
    // AlignmentSpan contains an AlignmentContext, but the AlignmentContext for a Slice is maintained as part
    // of the Slice header, so using AlignmentSpan here would result in redundant AlignmentContext values.
    //
    // These values are only maintained for slices that are created from SAM/CRAMCompressionRecords. Slices that
    // are created by deserializing a stream do not have recorded values for these because those values are
    // not part of the stream, and the individual records are not decoded until they're requested (they are
    // not decoded during indexing, with the exception of MULTI_REF slices, where it is required that the slice
    // be resolved into individual reference contexts for inclusion in the index).
    private int mappedReadsCount = 0;   // mapped (rec.getReadUnmappedFlag() != true)
    private int unmappedReadsCount = 0; // unmapped (rec.getReadUnmappedFlag() == true)
    private int unplacedReadsCount = 0; // nocoord (alignmentStart == SAMRecord.NO_ALIGNMENT_START)

    private int byteOffsetOfSliceHeaderBlock = UNINITIALIZED_INDEXING_PARAMETER;
    private int byteSizeOfSliceBlocks = UNINITIALIZED_INDEXING_PARAMETER;
    private int landmarkIndex = UNINITIALIZED_INDEXING_PARAMETER;

    /**
     * Create a slice by reading a serialized Slice from an input stream.
     *
     * @param cramVersion the version of the CRAM stream being read
     * @param compressionHeader the compression header for the container in which the Slice resides
     * @param inputStream the input stream to be read
     * @param containerByteOffset the stream byte offset of the start of the container in which this Slice resides
     */
    public Slice(
            final CRAMVersion cramVersion,
            final CompressionHeader compressionHeader,
            final InputStream inputStream,
            final long containerByteOffset) {
        sliceHeaderBlock = Block.read(cramVersion, inputStream);
        if (sliceHeaderBlock.getContentType() != BlockContentType.MAPPED_SLICE) {
            throw new RuntimeException("Slice Header Block expected, found: " +
                    sliceHeaderBlock.getContentType().name());
        }

        final InputStream parseInputStream = new ByteArrayInputStream(sliceHeaderBlock.getRawContent());
        this.compressionHeader = compressionHeader;
        this.byteOffsetOfContainer = containerByteOffset;

        final ReferenceContext refContext = new ReferenceContext(ITF8.readUnsignedITF8(parseInputStream));
        final int alignmentStart = ITF8.readUnsignedITF8(parseInputStream);
        final int alignmentSpan = ITF8.readUnsignedITF8(parseInputStream);
        this.alignmentContext = new AlignmentContext(refContext, alignmentStart, alignmentSpan);

        this.nRecords = ITF8.readUnsignedITF8(parseInputStream);
        this.globalRecordCounter = LTF8.readUnsignedLTF8(parseInputStream);
        this.nSliceBlocks = ITF8.readUnsignedITF8(parseInputStream);

        setContentIDs(CramIntArray.arrayAsList(parseInputStream));
        embeddedReferenceBlockContentID = ITF8.readUnsignedITF8(parseInputStream);
        referenceMD5 = new byte[MD5_BYTE_SIZE];
        InputStreamUtils.readFully(parseInputStream, referenceMD5, 0, referenceMD5.length);

        final byte[] readTagBytes = InputStreamUtils.readFully(parseInputStream);
        if (cramVersion.getMajor() >= CramVersions.CRAM_v3.getMajor()) {
            setSliceTags(BinaryTagCodec.readTags(
                    readTagBytes, 0, readTagBytes.length, ValidationStringency.DEFAULT_STRINGENCY));
        }

        // NOTE: this reads the underlying blocks from the stream, but doesn't decode them. We don't want to
        // do that automatically, since there are cases where we want to iterate through containers or slices
        // (i.e., during indexing, or when satisfying index queries) and consume the underlying blocks,
        // but not actually decode them.
        sliceBlocks = new SliceBlocks(cramVersion, nSliceBlocks, inputStream);
        if (embeddedReferenceBlockContentID != EMBEDDED_REFERENCE_ABSENT_CONTENT_ID) {
            // also adds this block to the external list
            setEmbeddedReferenceBlock(sliceBlocks.getExternalBlock(embeddedReferenceBlockContentID));
        }
    }

    /**
     * Create a single Slice from CRAM Compression Records and a Compression Header. The caller is
     * responsible for appropriate subdivision of records into containers and slices (see {@link ContainerFactory}).
     *
     * Determines whether the slice is single-reference, unmapped, or multi-reference, and derives alignment
     * boundaries for the slice if single-reference.
     *
     * Valid Slice states, by individual record contents:
     *
     * Single Reference: all records have valid placements/alignments on the same reference sequence
     * - records can be unmapped-but-placed
     * - reference can be external or embedded
     *
     * Multiple Reference: records may be placed or not, and may have differing reference sequences
     * - reference must not be embedded (not checked here)
     *
     * Unmapped: all records are unmapped and unplaced
     * - note however that we do not actually check mapping flags for unplaced reads
     *
     * @param records input CRAM Compression Records
     * @param compressionHeader the enclosing {@link Container}'s Compression Header
     * @param containerByteOffset the byte offset of the start of the enclosing container
     * @param globalRecordCounter the global record counter for the first record in this slice
     *
     * @see CRAMCompressionRecord#isPlaced()
     * @see ReferenceContextType
     */
    public Slice(
            final List<CRAMCompressionRecord> records,
            final CompressionHeader compressionHeader,
            final long containerByteOffset,
            final long globalRecordCounter) {
        ValidationUtils.validateArg(globalRecordCounter >= 0, "record counter must be >= 0");
        this.compressionHeader = compressionHeader;
        this.byteOffsetOfContainer = containerByteOffset;

        final ContentDigests hasher = ContentDigests.create(ContentDigests.ALL);
        final Set<ReferenceContext> referenceContexts = new HashSet<>();
        // ignore these values if we later determine this Slice is not single-ref
        int singleRefAlignmentStart = Integer.MAX_VALUE;
        int singleRefAlignmentEnd = SAMRecord.NO_ALIGNMENT_START;
        int baseCount = 0;
        for (final CRAMCompressionRecord record : records) {
            hasher.add(record);
            baseCount += record.getReadLength();
            if (record.isPlaced()) {
                referenceContexts.add(new ReferenceContext(record.getReferenceIndex()));
                singleRefAlignmentStart = Math.min(record.getAlignmentStart(), singleRefAlignmentStart);
                singleRefAlignmentEnd = Math.max(record.getAlignmentEnd(), singleRefAlignmentEnd);
                if (record.isSegmentUnmapped()) {
                    unmappedReadsCount++;
                } else {
                    mappedReadsCount++;
                }
            } else {
                referenceContexts.add(ReferenceContext.UNMAPPED_UNPLACED_CONTEXT);
            }
            // This matches the logic of BAMIndexMetadata.recordMetaData(SAMRecord) and counts all reads
            // that are not placed, which will be a subset of the reads included in unmappedReadsCount,
            // which counts all unmapped reads, whether placed or not.
            if (record.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
                unplacedReadsCount++;
            }
        }

        this.alignmentContext = getDerivedAlignmentContext(
                referenceContexts, singleRefAlignmentStart, singleRefAlignmentEnd);

        sliceTags = hasher.getAsTags();
        nRecords = records.size();
        this.baseCount = baseCount;
        this.globalRecordCounter = globalRecordCounter;

        final CramRecordWriter writer = new CramRecordWriter(this);
        sliceBlocks = writer.writeToSliceBlocks(records, alignmentContext.getAlignmentStart());

        // we can't calculate the number of blocks until after the record writer has written everything out
        nSliceBlocks = calculateNumberOfBlocks();
    }

    // May be null
    public Block getSliceHeaderBlock() { return sliceHeaderBlock; }

    public AlignmentContext getAlignmentContext() { return alignmentContext; }

    public SliceBlocks getSliceBlocks() { return sliceBlocks; }

    public int getNumberOfRecords() { return nRecords; }

    public long getGlobalRecordCounter() { return globalRecordCounter; }

    /**
     * @return the number of blocks as defined by the CRAM spec; this is 1 for the
     * core block, plus the number of external blocks (does not include the slice header block)
     */
    public int getNumberOfBlocks() { return nSliceBlocks; }

    public List<Integer> getContentIDs() { return contentIDs; }

    private void setContentIDs(final List<Integer> contentIDs) { this.contentIDs = contentIDs; }

    public byte[] getReferenceMD5() { return referenceMD5; }

    /**
     * The Slice's offset in bytes from the beginning of the Container's Compression Header
     * (or the end of the Container Header), equal to {@link ContainerHeader#getLandmarks()}.
     *
     * Used by BAI and CRAI indexing.
     */
    public int getByteOffsetOfSliceHeaderBlock() { return byteOffsetOfSliceHeaderBlock; }

    public void setByteOffsetOfSliceHeaderBlock(int byteOffsetOfSliceHeaderBlock) {
        this.byteOffsetOfSliceHeaderBlock = byteOffsetOfSliceHeaderBlock;
    }

    /**
     * The Slice's size in bytes.
     *
     * Used by CRAI indexing only.
     */
    public int getByteSizeOfSliceBlocks() { return byteSizeOfSliceBlocks; }

    public void setByteSizeOfSliceBlocks(int byteSizeOfSliceBlocks) {
        this.byteSizeOfSliceBlocks = byteSizeOfSliceBlocks;
    }

    public void setLandmarkIndex(int landmarkIndex) { this.landmarkIndex = landmarkIndex; }

    public long getBaseCount() { return baseCount; }

    public SAMBinaryTagAndValue getSliceTags() { return sliceTags; }

    private void setSliceTags(SAMBinaryTagAndValue sliceTags) { this.sliceTags = sliceTags; }

    private int getMappedReadsCount() { return mappedReadsCount; }

    private int getUnmappedReadsCount() { return unmappedReadsCount; }

    private int getUnplacedReadsCount() { return unplacedReadsCount; }

    /**
     * Set the content ID of the embedded reference block. Per the CRAM spec, the value can be
     * -1 ({@link #EMBEDDED_REFERENCE_ABSENT_CONTENT_ID}) to indicate no embedded reference block is
     * present. If the reference block content ID already has a non-{@link #EMBEDDED_REFERENCE_ABSENT_CONTENT_ID}
     * value, it cannot be reset. If the embedded reference block has already been set, the provided
     * reference block content ID must agree with the content ID of the existing block.
     *
     * @param embeddedReferenceBlockContentID the content ID of the embedded reference block
     */
    public void setEmbeddedReferenceContentID(final int embeddedReferenceBlockContentID) {
        if (this.embeddedReferenceBlockContentID != EMBEDDED_REFERENCE_ABSENT_CONTENT_ID &&
                this.embeddedReferenceBlockContentID != embeddedReferenceBlockContentID) {
            throw new CRAMException(
                    String.format("Can't reset embedded reference content ID (old %d new %d)",
                            this.embeddedReferenceBlockContentID, embeddedReferenceBlockContentID));
        }
        if (this.embeddedReferenceBlock != null &&
                this.embeddedReferenceBlock.getContentId() != embeddedReferenceBlockContentID) {
            throw new CRAMException(
                    String.format("Attempt to set embedded reference block content ID (%d) that is in conflict " +
                                    "with the content ID (%d) of the existing reference block",
                            embeddedReferenceBlockContentID, this.embeddedReferenceBlock.getContentId()));
        }
        this.embeddedReferenceBlockContentID = embeddedReferenceBlockContentID;
    }

    /**
     * Get the content ID of the embedded reference block. Per the CRAM spec, the value
     * can be {@link #EMBEDDED_REFERENCE_ABSENT_CONTENT_ID} (-1) to indicate no embedded reference block is
     * present.
     *
     * @return id of the embedded reference block if present, otherwise {@link #EMBEDDED_REFERENCE_ABSENT_CONTENT_ID}
     */
    public int getEmbeddedReferenceContentID() { return embeddedReferenceBlockContentID; }

    public void setEmbeddedReferenceBlock(final Block embeddedReferenceBlock) {
        ValidationUtils.nonNull(embeddedReferenceBlock, "Embedded reference block must be non-null");
        ValidationUtils.validateArg(embeddedReferenceBlock.getContentId() != EMBEDDED_REFERENCE_ABSENT_CONTENT_ID,
                String.format("Invalid content ID (%d) for embedded reference block",
                        embeddedReferenceBlock.getContentId()));
        ValidationUtils.validateArg(embeddedReferenceBlock.getContentType() == BlockContentType.EXTERNAL,
                String.format("Invalid embedded reference block type (%s)",
                        embeddedReferenceBlock.getContentType()));
        if (this.embeddedReferenceBlock != null) {
            throw new CRAMException("Can't reset the slice embedded reference block");
        } else if (this.embeddedReferenceBlockContentID != EMBEDDED_REFERENCE_ABSENT_CONTENT_ID &&
                embeddedReferenceBlock.getContentId() != this.embeddedReferenceBlockContentID) {
            throw new CRAMException(
                    String.format(
                            "Embedded reference block content ID (%d) conflicts with the existing block ID (%d)",
                            embeddedReferenceBlock.getContentId(), this.embeddedReferenceBlockContentID));
        }
        setEmbeddedReferenceContentID(embeddedReferenceBlock.getContentId());
        this.embeddedReferenceBlock = embeddedReferenceBlock;
    }

    /**
     * Return the embedded reference block, if any.
     *
     * @return embedded reference block. May be null.
     */
    // Unused because embedded reference isn't implemented for write
    public Block getEmbeddedReferenceBlock() { return embeddedReferenceBlock; }

    public CompressionHeader getCompressionHeader() { return compressionHeader; }

    /**
     * Reads and decodes the underlying blocks and returns a list of CRAMCompressionRecords. This isn't done
     * initially, when the blocks are read from the underlying stream, since there are cases where we want to
     * iterate through containers or slices and consume the underlying blocks, but not actually pay the price
     * to decode the records (i.e., during indexing, or when satisfying index queries).
     *
     * The CRAMRecords returned from this are not normalized (read bases, quality scores and mates have not
     * been resolved). See {@link #normalizeCRAMRecords} for more information about normalization.
     *
     * @param compressorCache cached compressor objects to use to decode the streams
     * @param validationStringency validation stringency to use
     * @return list of raw (not normalized) CRAMCompressionRecords for this Slice ({@link #normalizeCRAMRecords})
     */
    public ArrayList<CRAMCompressionRecord> deserializeCRAMRecords(
            final CompressorCache compressorCache,
            final ValidationStringency validationStringency) {
        final CramRecordReader cramRecordReader = new CramRecordReader(this, compressorCache, validationStringency);
        final ArrayList<CRAMCompressionRecord> cramCompressionRecords = new ArrayList<>(nRecords);

        // in the case where APDelta = true, the first record in the slice has a 0 position delta, so initialize
        // prevAlignmentStart using the slice alignment start
        int prevAlignmentStart = alignmentContext.getAlignmentStart();
        for (int i = 0; i < nRecords; i++) {
            // read the new record and update the running prevAlignmentStart
            final CRAMCompressionRecord cramCompressionRecord =
                    cramRecordReader.readCRAMRecord(globalRecordCounter + i, prevAlignmentStart);
            prevAlignmentStart = cramCompressionRecord.getAlignmentStart();
            cramCompressionRecords.add(cramCompressionRecord);
        }
        return cramCompressionRecords;
    }

    /**
     * Normalize a list of CRAMCompressionRecords that have been read in from a CRAM stream. Normalization
     * converts raw CRAM records to a state suitable for conversion to SAMRecords, resolving read bases
     * against the reference, as well as quality scores and mates.
     *
     * The records in the list being normalized should be the records from a Slice, not an entire Container,
     * since the relative positions of mate records are determined relative to the Slice (downstream)
     * offsets.
     *
     * NOTE: This mutates (normalizes) the CRAM records in place.
     *
     * @param cramCompressionRecords CRAMCompressionRecords to normalize
     * @param cramReferenceRegion the reference region for this slice
     */
    public void normalizeCRAMRecords(final List<CRAMCompressionRecord> cramCompressionRecords,
                                     final CRAMReferenceRegion cramReferenceRegion) {
        boolean hasEmbeddedReference = false;
        if (compressionHeader.isReferenceRequired()) {
            // get the reference bases required for the entire slice and validate the reference MD5
            final AlignmentContext sliceAlignmentContext = getAlignmentContext();
            if (sliceAlignmentContext.getReferenceContext().isMappedSingleRef()) {
                cramReferenceRegion.fetchReferenceBasesByRegion(sliceAlignmentContext);
                validateReferenceBases(cramReferenceRegion);
            }
        } else {
            // RR = false might mean that no reference compression was used, or that an embedded reference
            // was used, so if there is an embedded ref block, use it, and either way, skip MD5 validation
            final Block embeddedReferenceBlock = getEmbeddedReferenceBlock();
            if (embeddedReferenceBlock != null) {
                hasEmbeddedReference = true;
                cramReferenceRegion.setEmbeddedReferenceBases(
                        embeddedReferenceBlock.getUncompressedContent(new CompressorCache()),
                        getAlignmentContext().getReferenceContext().getReferenceSequenceID(),
                        alignmentContext.getAlignmentStart() - 1);
            }
        }

        // restore mate pairing first:
        for (final CRAMCompressionRecord record : cramCompressionRecords) {
            if (record.isReadPaired() && !record.isDetached() && record.isHasMateDownStream()) {
                final CRAMCompressionRecord downMate = cramCompressionRecords.get(
                        // getRecordsToNextFragment returns the value from the NF ("next fragment") data series,
                        // which is interpreted as the number of records to skip within this slice to find the
                        // next mate for this fragment
                        (int) (record.getSequentialIndex() + record.getRecordsToNextFragment() + 1L - globalRecordCounter));
                record.setNextSegment(downMate);
                downMate.setPreviousSegment(record);
            }
        }
        for (final CRAMCompressionRecord record : cramCompressionRecords) {
            if (record.getPreviousSegment() == null && record.getNextSegment() != null) {
                record.restoreMateInfo();
            }
        }

        // assign read names if needed (should be called after mate resolution so generated names
        // can be propagated to mates):
        for (final CRAMCompressionRecord record : cramCompressionRecords) {
            record.assignReadName();
        }

        // resolve bases:
        for (final CRAMCompressionRecord record : cramCompressionRecords) {
            if (!record.isSegmentUnmapped()) { // read bases for unmapped are restored directly from the input stream
                if (compressionHeader.isReferenceRequired() &&
                        getAlignmentContext().getReferenceContext().isMultiRef() &&
                        !record.isUnknownBases() &&
                        !hasEmbeddedReference) {
                    // if the slice is a multi-ref slice because the reads are not coordinate-sorted, this code
                    // can wind up needing to re-resolve the reference bases for *each* record in the slice, which
                    // can in turn be pathologically slow, especially if the reference source is remote
                    cramReferenceRegion.fetchReferenceBasesByRegion(
                            record.getReferenceIndex(),
                            record.getAlignmentStart() - 1, // 1-based to 0-based
                            record.getAlignmentEnd() - record.getAlignmentStart() + 1);
                }
                record.restoreReadBases(
                        cramReferenceRegion,
                        getCompressionHeader().getSubstitutionMatrix());
            }
        }

        for (final CRAMCompressionRecord record : cramCompressionRecords) {
            // resolve quality scores:
            record.resolveQualityScores();
            // in this last pass, set all records as normalized
            record.setIsNormalized();
        }
    }

    private int calculateNumberOfBlocks() {
        // Each Slice has 1 core data block, plus zero or more external data blocks.
        // Since an embedded reference block is just stored as an external block, it is included in
        // the external block count, and does not need to be counted separately.
        return 1 + getSliceBlocks().getNumberOfExternalBlocks();
    }

    public void write(final CRAMVersion cramVersion, final OutputStream outputStream) {
        // establish our header block, then write it out
        sliceHeaderBlock = Block.createRawSliceHeaderBlock(createSliceHeaderBlockContent(cramVersion));
        sliceHeaderBlock.write(cramVersion, outputStream);

        // write the core and external blocks
        getSliceBlocks().writeBlocks(cramVersion, outputStream);
    }

    private byte[] createSliceHeaderBlockContent(final CRAMVersion cramVersion) {
        final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ITF8.writeUnsignedITF8(getAlignmentContext().getReferenceContext().getReferenceContextID(),
                byteArrayOutputStream);
        ITF8.writeUnsignedITF8(getAlignmentContext().getAlignmentStart(), byteArrayOutputStream);
        ITF8.writeUnsignedITF8(getAlignmentContext().getAlignmentSpan(), byteArrayOutputStream);
        ITF8.writeUnsignedITF8(getNumberOfRecords(), byteArrayOutputStream);
        LTF8.writeUnsignedLTF8(getGlobalRecordCounter(), byteArrayOutputStream);
        ITF8.writeUnsignedITF8(getNumberOfBlocks(), byteArrayOutputStream);

        setContentIDs(getSliceBlocks().getExternalContentIDs());
        CramIntArray.write(getContentIDs(), byteArrayOutputStream);
        ITF8.writeUnsignedITF8(getEmbeddedReferenceContentID(), byteArrayOutputStream);
        try {
            byteArrayOutputStream.write(getReferenceMD5() == null ?
                    new byte[MD5_BYTE_SIZE] :
                    getReferenceMD5());
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }

        if (cramVersion.getMajor() >= CramVersions.CRAM_v3.getMajor()) {
            SAMBinaryTagAndValue samBinaryTagAndValue = getSliceTags();
            if (samBinaryTagAndValue != null) {
                try (final BinaryCodec binaryCodec = new BinaryCodec(byteArrayOutputStream)) {
                    final BinaryTagCodec binaryTagCodec = new BinaryTagCodec(binaryCodec);
                    while (samBinaryTagAndValue != null) {
                        binaryTagCodec.writeTag(
                                samBinaryTagAndValue.tag,
                                samBinaryTagAndValue.value,
                                samBinaryTagAndValue.isUnsignedArray());
                        samBinaryTagAndValue = samBinaryTagAndValue.getNext();
                    }
                }
            }
        }
        return byteArrayOutputStream.toByteArray();
    }

    /**
     * Confirm that we have initialized the 3 BAI index parameters:
     * byteOffsetOfSliceHeaderBlock, containerByteOffset (always set at construction), and landmarkIndex.
     */
    private void baiIndexInitializationCheck() {
        final StringBuilder error = new StringBuilder();
        if (byteOffsetOfSliceHeaderBlock == UNINITIALIZED_INDEXING_PARAMETER) {
            error.append("Cannot index this Slice for BAI because its byteOffsetOfSliceHeaderBlock is unknown.")
                    .append(System.lineSeparator());
        }
        if (landmarkIndex == UNINITIALIZED_INDEXING_PARAMETER) {
            error.append("Cannot index this Slice for BAI because its landmarkIndex is unknown.")
                    .append(System.lineSeparator());
        }
        if (error.length() > 0) {
            throw new CRAMException(error.toString());
        }
    }

    /**
     * Confirm that we have initialized the 3 CRAI index parameters:
     * byteOffsetOfSliceHeaderBlock, containerByteOffset (always set at construction), and byteSizeOfSliceBlocks.
     */
    private void craiIndexInitializationCheck() {
        final StringBuilder error = new StringBuilder();
        if (byteOffsetOfSliceHeaderBlock == UNINITIALIZED_INDEXING_PARAMETER) {
            error.append("Cannot index this Slice for CRAI because its byteOffsetOfSliceHeaderBlock is unknown.")
                    .append(System.lineSeparator());
        }
        if (byteSizeOfSliceBlocks == UNINITIALIZED_INDEXING_PARAMETER) {
            error.append("Cannot index this Slice for CRAI because its byteSizeOfSliceBlocks is unknown.")
                    .append(System.lineSeparator());
        }
        if (error.length() > 0) {
            throw new CRAMException(error.toString());
        }
    }

    private static AlignmentContext getDerivedAlignmentContext(
            final Set<ReferenceContext> sliceReferenceContexts,
            final int singleRefAlignmentStart,
            final int singleRefAlignmentEnd) {
        ReferenceContext referenceContext;
        switch (sliceReferenceContexts.size()) {
            case 0:
                referenceContext = ReferenceContext.UNMAPPED_UNPLACED_CONTEXT;
                break;
            case 1:
                // there is only one reference context: all placed reads are on the same reference,
                // or all reads are unmapped
                referenceContext = sliceReferenceContexts.iterator().next();
                break;
            default:
                // placed reads on multiple references, and/or a combination of placed and unplaced reads
                referenceContext = ReferenceContext.MULTIPLE_REFERENCE_CONTEXT;
        }

        if (referenceContext.isMappedSingleRef()) {
            AlignmentContext.validateAlignmentContext(
                    true, referenceContext,
                    singleRefAlignmentStart,
                    singleRefAlignmentEnd - singleRefAlignmentStart + 1);
            return new AlignmentContext(
                    referenceContext,
                    singleRefAlignmentStart,
                    singleRefAlignmentEnd - singleRefAlignmentStart + 1);
        } else if (referenceContext.isUnmappedUnplaced()) {
            return AlignmentContext.UNMAPPED_UNPLACED_CONTEXT;
        } else {
            return AlignmentContext.MULTIPLE_REFERENCE_CONTEXT;
        }
    }

    private void validateAlignmentSpanForReference(final CRAMReferenceRegion cramReferenceRegion) {
        final byte[] referenceBases = cramReferenceRegion.getCurrentReferenceBases();
        if (alignmentContext.getReferenceContext().isUnmappedUnplaced()) {
            return;
        }
        if (referenceBases == null &&
                alignmentContext.getAlignmentStart() > 0 &&
                alignmentContext.getReferenceContext().isMappedSingleRef()) {
            throw new CRAMException("No reference bases found for mapped slice.");
        }
        // TODO: CRAMComplianceTest/c1#bounds triggers this (the reads are mapped beyond the reference length),
        // and CRAMEdgeCasesTest.testNullsAndBeyondRef seems to deliberately test that reads that extend
        // beyond the reference length should be ok?
        if (((alignmentContext.getAlignmentStart() - 1) < cramReferenceRegion.getRegionStart()) ||
                (alignmentContext.getAlignmentSpan() > cramReferenceRegion.getRegionLength())) {
            log.warn(String.format(
                    "Slice mapped outside of reference bases length %d: slice reference context=%s, start=%d, span=%d, counter=%d.",
                    cramReferenceRegion.getFullContigLength(),
                    alignmentContext.getReferenceContext(),
                    alignmentContext.getAlignmentStart(),
                    alignmentContext.getAlignmentSpan(),
                    globalRecordCounter));
        }
    }

    // VisibleForTesting
    void validateReferenceBases(final CRAMReferenceRegion cramReferenceRegion) {
        if (alignmentContext.getReferenceContext().isMappedSingleRef() &&
                compressionHeader.isReferenceRequired()) {
            validateAlignmentSpanForReference(cramReferenceRegion);
            if (!referenceMD5IsValid(
                    cramReferenceRegion,
                    alignmentContext.getAlignmentSpan(),
                    referenceMD5)) {
                throw new CRAMException(
                        String.format(
                                "The MD5 for the reference failed to validate against the expected value %032x. %s.",
                                new BigInteger(1, referenceMD5),
                                "This indicates that the supplied reference is not the one originally used to create the CRAM"));
            }
        }
    }

    private static boolean referenceMD5IsValid(
            final CRAMReferenceRegion cramReferenceRegion,
            final int alignmentSpan,
            final byte[] expectedMD5) {
        final byte[] referenceBases = cramReferenceRegion.getCurrentReferenceBases();
        final int span = Math.min(alignmentSpan, referenceBases.length);
        // use offset 0 here, based on the assumption that we're always using a CRAMReferenceRegion that
        // has bases that start exactly at the start of the span of the slice alignment context
        final byte[] md5 = SequenceUtil.calculateMD5(referenceBases, 0, span);
        return Arrays.equals(md5, expectedMD5);
    }

    @Override
    public String toString() {
        return String.format(
                "slice: %s globalRecordCounter=%d, nRecords=%d, sliceHeaderOffset=%d, sizeOfBlocks=%d, landmark=%d, mapped/unmapped/unplaced: %d/%d/%d, md5=%s",
                alignmentContext,
                globalRecordCounter,
                nRecords,
                getByteOffsetOfSliceHeaderBlock(),
                getByteSizeOfSliceBlocks(),
                landmarkIndex,
                mappedReadsCount,
                unmappedReadsCount,
                unplacedReadsCount,
                String.format("%032x", new BigInteger(1, getReferenceMD5())));
    }

    // *calculate* the MD5 for this reference
    public void setReferenceMD5(final CRAMReferenceRegion cramReferenceRegion) {
        validateAlignmentSpanForReference(cramReferenceRegion);

        final byte[] referenceBases = cramReferenceRegion.getCurrentReferenceBases();
        // TODO: how can an alignment context have a start < 1 ?
        if (!alignmentContext.getReferenceContext().isMappedSingleRef() &&
                alignmentContext.getAlignmentStart() < 1) {
            referenceMD5 = new byte[MD5_BYTE_SIZE];
        } else {
            final int span = Math.min(alignmentContext.getAlignmentSpan(), referenceBases.length);
            // use offset 0, which is correct given the assumption that we are ALWAYS using
            // a region that has bases that reflect the span of the alignment context
            referenceMD5 = SequenceUtil.calculateMD5(referenceBases, 0, span);
        }
    }

    /**
     * Uses a Multiple Reference Slice Alignment Reader to determine the reference spans of a MULTI_REF Slice.
     * Used for creating CRAI/BAI index entries.
     *
     * @param compressorCache cached compressor objects to use to decode the streams
     * @param validationStringency how strict to be when reading CRAM records
     */
    // VisibleForTesting
    public Map<ReferenceContext, AlignmentSpan> getMultiRefAlignmentSpans(
            final CompressorCache compressorCache,
            final ValidationStringency validationStringency) {
        if (!getAlignmentContext().getReferenceContext().isMultiRef()) {
            throw new IllegalStateException("can only create a multi-ref span reader for a multi-ref context slice");
        }

        // Ideally, we wouldn't have to decode the records just to get the slice spans; multi-ref
        // slices use the RI series for the reference index, so in theory we could just decode that; but we also
        // need the alignment start and span for indexing. Is it possible to do this more efficiently?
        // See https://github.com/samtools/htsjdk/issues/1347.
        // Note that this doesn't normalize the CRAMCompressionRecords, which bypasses resolution of bases
        // against the reference.
        final List<CRAMCompressionRecord> cramCompressionRecords =
                deserializeCRAMRecords(compressorCache, validationStringency);

        final Map<ReferenceContext, AlignmentSpan> spans = new HashMap<>();
        cramCompressionRecords.forEach(r -> mergeRecordSpan(r, spans));
        return Collections.unmodifiableMap(spans);
    }

    private void mergeRecordSpan(final CRAMCompressionRecord cramCompressionRecord,
                                 final Map<ReferenceContext, AlignmentSpan> spans) {
        // we need to combine the records' spans for counting and span calculation
        if (cramCompressionRecord.isSegmentUnmapped()) {
            if (cramCompressionRecord.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
                // count it as both unmapped *and* unplaced, since for BAI we distinguish between them
                final AlignmentSpan span = new AlignmentSpan(
                        SAMRecord.NO_ALIGNMENT_START, 0, 0, 1, 1);
                spans.merge(ReferenceContext.UNMAPPED_UNPLACED_CONTEXT, span, AlignmentSpan::combine);
            } else {
                // merge it in with the reference context it's placed on
                final AlignmentSpan span = new AlignmentSpan(
                        cramCompressionRecord.getAlignmentStart(),
                        cramCompressionRecord.getReadLength(),
                        0, 1, 0);
                final int refIndex = cramCompressionRecord.getReferenceIndex();
                spans.merge(new ReferenceContext(refIndex), span, AlignmentSpan::combine);
            }
        } else {
            // 1 mapped, 0 unmapped, 0 unplaced
            final AlignmentSpan span = new AlignmentSpan(
                    cramCompressionRecord.getAlignmentStart(),
                    cramCompressionRecord.getAlignmentEnd() - cramCompressionRecord.getAlignmentStart(),
                    1, 0, 0);
            final ReferenceContext recordContext = new ReferenceContext(cramCompressionRecord.getReferenceIndex());
            spans.merge(recordContext, span, AlignmentSpan::combine);
        }
    }

    /**
     * Generate a CRAI index entry from this Slice and other container parameters,
     * splitting Multiple Reference slices into constituent reference sequence entries.
     *
     * @return a list of CRAI index entries derived from this Slice
     */
    // Each entry represents a slice in the CRAM file. Please note that all slices must be listed in the index
    // file. Multi-reference slices may need to have multiple entries for the same slice; one for each reference
    // contained within that slice. In this case the index reference sequence ID will be the actual reference ID
    // (from the "RI" data series) and not -2.
    //
    // Slices containing solely unmapped unplaced data (reference ID -1) still require values for all columns,
    // although the alignment start and span will be ignored. It is recommended that they are both set to zero.
    public List<CRAIEntry> getCRAIEntries(final CompressorCache compressorCache) {
        craiIndexInitializationCheck();

        if (alignmentContext.getReferenceContext().isMultiRef()) {
            final Map<ReferenceContext, AlignmentSpan> spans = getMultiRefAlignmentSpans(
                    compressorCache,
                    ValidationStringency.DEFAULT_STRINGENCY);

            return spans.entrySet().stream()
                    .map(e -> new CRAIEntry(
                            e.getKey().getReferenceContextID(),
                            e.getValue().getAlignmentStart(),
                            e.getValue().getAlignmentSpan(),
                            byteOffsetOfContainer,
                            byteOffsetOfSliceHeaderBlock,
                            byteSizeOfSliceBlocks))
                    .sorted()
                    .collect(Collectors.toList());
        } else {
            // single ref or unmapped
            final int sequenceId = alignmentContext.getReferenceContext().getReferenceContextID();
            return Collections.singletonList(
                    new CRAIEntry(
                            sequenceId,
                            alignmentContext.getAlignmentStart(),
                            alignmentContext.getAlignmentSpan(),
                            byteOffsetOfContainer,
                            byteOffsetOfSliceHeaderBlock,
                            byteSizeOfSliceBlocks));
        }
    }

    /**
     * Generate BAIEntry index entries from this Slice and other container parameters,
     * splitting Multiple Reference slices into constituent reference sequence entries.
     *
     * @return a list of BAIEntry index entries derived from this Slice
     */
    public List<BAIEntry> getBAIEntries(final CompressorCache compressorCache) {
        baiIndexInitializationCheck();

        final List<BAIEntry> baiEntries = new ArrayList<>();
        switch (getAlignmentContext().getReferenceContext().getType()) {
            case UNMAPPED_UNPLACED_TYPE:
                baiEntries.add(
                        new BAIEntry(
                                getAlignmentContext().getReferenceContext(),
                                new AlignmentSpan(
                                        0,
                                        0,
                                        mappedReadsCount, // aligned
                                        unmappedReadsCount,
                                        unplacedReadsCount),
                                byteOffsetOfContainer,
                                byteOffsetOfSliceHeaderBlock,
                                landmarkIndex));
                break;

            case MULTIPLE_REFERENCE_TYPE:
                // NOTE: it's possible that there are several different reference contexts embedded in this slice
                // (i.e., there might be only one record per reference context, and thus not enough of any one
                // to warrant a separate slice); the unmapped span needs to go last
                final Map<ReferenceContext, AlignmentSpan> sliceSpanMap = getMultiRefAlignmentSpans(
                        compressorCache,
                        ValidationStringency.LENIENT);
                sliceSpanMap.entrySet().stream()
                        .filter(as -> !as.getKey().equals(ReferenceContext.UNMAPPED_UNPLACED_CONTEXT))
                        .forEach(entry -> baiEntries.add(
                                new BAIEntry(
                                        entry.getKey(),
                                        new AlignmentSpan(
                                                entry.getValue().getAlignmentStart(),
                                                entry.getValue().getAlignmentSpan(),
                                                entry.getValue().getMappedCount(),
                                                entry.getValue().getUnmappedCount(),
                                                entry.getValue().getUnmappedUnplacedCount()),
                                        byteOffsetOfContainer,
                                        byteOffsetOfSliceHeaderBlock,
                                        landmarkIndex)));
                final AlignmentSpan unmappedSpan = sliceSpanMap.get(ReferenceContext.UNMAPPED_UNPLACED_CONTEXT);
                if (unmappedSpan != null) {
                    baiEntries.add(
                            new BAIEntry(
                                    ReferenceContext.UNMAPPED_UNPLACED_CONTEXT,
                                    unmappedSpan,
                                    byteOffsetOfContainer,
                                    byteOffsetOfSliceHeaderBlock,
                                    landmarkIndex));
                }
                break;

            default:
                baiEntries.add(
                        new BAIEntry(
                                getAlignmentContext().getReferenceContext(),
                                new AlignmentSpan(
                                        getAlignmentContext().getAlignmentStart(),
                                        getAlignmentContext().getAlignmentSpan(),
                                        getMappedReadsCount(),
                                        getUnmappedReadsCount(),
                                        getUnplacedReadsCount()),
                                byteOffsetOfContainer,
                                byteOffsetOfSliceHeaderBlock,
                                landmarkIndex));
                break;
        }
        return baiEntries;
    }
}
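For orientation, here is a minimal sketch of how the deserializing constructor and the deferred record decoding described above might be driven. It assumes the input stream is already positioned at the first slice of a container (i.e., the container header and the compression header block have already been consumed), and that the import paths match your htsjdk version; SliceReadSketch and dumpSlice are hypothetical names, not htsjdk API.

import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.cram.common.CramVersions;
import htsjdk.samtools.cram.structure.CompressionHeader;
import htsjdk.samtools.cram.structure.CompressorCache;
import htsjdk.samtools.cram.structure.CRAMCompressionRecord;
import htsjdk.samtools.cram.structure.Slice;

import java.io.InputStream;
import java.util.List;

public class SliceReadSketch {
    // containerStream must be positioned at the first slice of the container,
    // i.e. just past the container header and compression header blocks (assumption).
    static void dumpSlice(final InputStream containerStream,
                          final CompressionHeader compressionHeader,
                          final long containerByteOffset) {
        // Reads the slice header block and the core/external blocks, but does NOT
        // decode the records yet; decoding is deferred, per the class comments above.
        final Slice slice = new Slice(
                CramVersions.CRAM_v3, compressionHeader, containerStream, containerByteOffset);
        System.out.printf("slice context=%s, records=%d%n",
                slice.getAlignmentContext().getReferenceContext(),
                slice.getNumberOfRecords());

        // Explicitly decode the raw (not yet normalized) records.
        final List<CRAMCompressionRecord> records = slice.deserializeCRAMRecords(
                new CompressorCache(), ValidationStringency.DEFAULT_STRINGENCY);
        records.forEach(r -> System.out.println(r.getAlignmentStart()));
    }
}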
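Similarly, a sketch of producing CRAI index entries, matching the indexing-parameter checks in craiIndexInitializationCheck(). In htsjdk the slice header offset and block size come from the container's landmarks and the measured size of the slice's blocks; here they are hypothetical caller-supplied values, and indexSlice is a hypothetical helper.

import htsjdk.samtools.cram.CRAIEntry;
import htsjdk.samtools.cram.structure.CompressorCache;
import htsjdk.samtools.cram.structure.Slice;

import java.util.List;

public class SliceIndexSketch {
    // sliceHeaderOffset and sliceBlocksByteSize are assumed inputs recorded while
    // writing or scanning the container (see ContainerHeader#getLandmarks()).
    static List<CRAIEntry> indexSlice(final Slice slice,
                                      final int sliceHeaderOffset,
                                      final int sliceBlocksByteSize) {
        // Both values must be set before indexing, or craiIndexInitializationCheck() throws.
        slice.setByteOffsetOfSliceHeaderBlock(sliceHeaderOffset);
        slice.setByteSizeOfSliceBlocks(sliceBlocksByteSize);
        // MULTI_REF slices are split into one entry per constituent reference context
        // (which requires decoding the records); other slices yield exactly one entry.
        return slice.getCRAIEntries(new CompressorCache());
    }
}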




