
htsjdk.samtools.cram.structure.Slice

/**
 * ****************************************************************************
 * Copyright 2013 EMBL-EBI
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ****************************************************************************
 */
package htsjdk.samtools.cram.structure;

import htsjdk.samtools.*;
import htsjdk.samtools.cram.BAIEntry;
import htsjdk.samtools.cram.CRAIEntry;
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.build.CRAMReferenceRegion;
import htsjdk.samtools.cram.common.CRAMVersion;
import htsjdk.samtools.cram.common.CramVersions;
import htsjdk.samtools.cram.digest.ContentDigests;
import htsjdk.samtools.cram.encoding.reader.CramRecordReader;
import htsjdk.samtools.cram.encoding.writer.CramRecordWriter;
import htsjdk.samtools.cram.io.CramIntArray;
import htsjdk.samtools.cram.io.ITF8;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.cram.io.LTF8;
import htsjdk.samtools.cram.ref.ReferenceContext;
import htsjdk.samtools.cram.ref.ReferenceContextType;
import htsjdk.samtools.cram.structure.block.Block;
import htsjdk.samtools.cram.structure.block.BlockContentType;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.utils.ValidationUtils;

import java.io.*;
import java.math.BigInteger;
import java.util.*;
import java.util.stream.Collectors;

/**
 * A CRAM slice is a logical construct that is just a subset of the blocks in a Container.
 *
 * NOTE: Every Slice has a reference context (it is either single-reference (mapped), multi-reference, or
 * unmapped), depending on the records it contains. Single-ref mapped doesn't mean that the records are
 * necessarily mapped (that is, that their getMappedRead flag is true), only that the records in that slice
 * are PLACED on the corresponding reference contig.
 */
public class Slice {
    private static final Log log = Log.getInstance(Slice.class);
    private static final int MD5_BYTE_SIZE = 16;

    // for indexing purposes
    public static final int UNINITIALIZED_INDEXING_PARAMETER = -1;
    // the spec defines a special sentinel to indicate the absence of an embedded reference block
    public static final int EMBEDDED_REFERENCE_ABSENT_CONTENT_ID = -1;

    ////////////////////////////////
    // Slice header components as defined in the spec
    private final AlignmentContext alignmentContext; // ref sequence, alignment start and span
    private final int nRecords;
    private final long globalRecordCounter;
    private final int nSliceBlocks; // includes the core block and external blocks, but not the header block
    private List<Integer> contentIDs;
    private int embeddedReferenceBlockContentID = EMBEDDED_REFERENCE_ABSENT_CONTENT_ID;
    private byte[] referenceMD5 = new byte[MD5_BYTE_SIZE];
    private SAMBinaryTagAndValue sliceTags;
    // End Slice header components
    ////////////////////////////////

    private final CompressionHeader compressionHeader;
    private final SliceBlocks sliceBlocks;
    private final long byteOffsetOfContainer;

    private Block sliceHeaderBlock;

    // Modeling the embedded reference block here is somewhat redundant, since it can be retrieved from the
    // external blocks that are managed by the {@link SliceBlocks} object by using the externalBlockContentId,
    // but we retain it here to use for validation purposes.
    private Block embeddedReferenceBlock;
    private long baseCount;

    // Values used for indexing. We don't use an AlignmentSpan object to model these values because
    // AlignmentSpan contains an AlignmentContext, but the AlignmentContext for a Slice is maintained as part
    // of the Slice header, so using AlignmentSpan here would result in redundant AlignmentContext values.
    //
    // These values are only maintained for slices that are created from SAM/CRAMCompressionRecords. Slices that
    // are created by deserializing a stream do not have recorded values for these because those values are
    // not part of the stream, and the individual records are not decoded until they're requested (they are
    // not decoded during indexing, with the exception of MULTI_REF slices, where it is required that the slice
    // be resolved into individual reference contexts for inclusion in the index).
    private int mappedReadsCount = 0;   // mapped (rec.getReadUnmappedFlag() != true)
    private int unmappedReadsCount = 0; // unmapped (rec.getReadUnmappedFlag() == true)
    private int unplacedReadsCount = 0; // nocoord (alignmentStart == SAMRecord.NO_ALIGNMENT_START)

    private int byteOffsetOfSliceHeaderBlock = UNINITIALIZED_INDEXING_PARAMETER;
    private int byteSizeOfSliceBlocks = UNINITIALIZED_INDEXING_PARAMETER;
    private int landmarkIndex = UNINITIALIZED_INDEXING_PARAMETER;

    /**
     * Create a slice by reading a serialized Slice from an input stream.
     *
     * @param cramVersion the version of the CRAM stream being read
     * @param compressionHeader the compression header for the container in which the Slice resides
     * @param inputStream the input stream to be read
     * @param containerByteOffset the stream byte offset of the start of the container in which this Slice resides
     */
    public Slice(
            final CRAMVersion cramVersion,
            final CompressionHeader compressionHeader,
            final InputStream inputStream,
            final long containerByteOffset) {
        sliceHeaderBlock = Block.read(cramVersion, inputStream);
        if (sliceHeaderBlock.getContentType() != BlockContentType.MAPPED_SLICE) {
            throw new RuntimeException("Slice Header Block expected, found: " +
                    sliceHeaderBlock.getContentType().name());
        }

        final InputStream parseInputStream = new ByteArrayInputStream(sliceHeaderBlock.getRawContent());
        this.compressionHeader = compressionHeader;
        this.byteOffsetOfContainer = containerByteOffset;

        final ReferenceContext refContext = new ReferenceContext(ITF8.readUnsignedITF8(parseInputStream));
        final int alignmentStart = ITF8.readUnsignedITF8(parseInputStream);
        final int alignmentSpan = ITF8.readUnsignedITF8(parseInputStream);
        this.alignmentContext = new AlignmentContext(refContext, alignmentStart, alignmentSpan);

        this.nRecords = ITF8.readUnsignedITF8(parseInputStream);
        this.globalRecordCounter = LTF8.readUnsignedLTF8(parseInputStream);
        this.nSliceBlocks = ITF8.readUnsignedITF8(parseInputStream);

        setContentIDs(CramIntArray.arrayAsList(parseInputStream));
        embeddedReferenceBlockContentID = ITF8.readUnsignedITF8(parseInputStream);
        referenceMD5 = new byte[MD5_BYTE_SIZE];
        InputStreamUtils.readFully(parseInputStream, referenceMD5, 0, referenceMD5.length);

        final byte[] readTagBytes = InputStreamUtils.readFully(parseInputStream);
        if (cramVersion.getMajor() >= CramVersions.CRAM_v3.getMajor()) {
            setSliceTags(BinaryTagCodec.readTags(
                    readTagBytes, 0, readTagBytes.length, ValidationStringency.DEFAULT_STRINGENCY));
        }

        // NOTE: this reads the underlying blocks from the stream, but doesn't decode them. We don't want to
        // do that automatically, since there are cases where we want to iterate through containers or slices
        // (i.e., during indexing, or when satisfying index queries) and consume the underlying blocks,
        // but not actually decode them.
        sliceBlocks = new SliceBlocks(cramVersion, nSliceBlocks, inputStream);
        if (embeddedReferenceBlockContentID != EMBEDDED_REFERENCE_ABSENT_CONTENT_ID) {
            // also adds this block to the external list
            setEmbeddedReferenceBlock(sliceBlocks.getExternalBlock(embeddedReferenceBlockContentID));
        }
    }

    /**
     * Create a single Slice from CRAM Compression Records and a Compression Header. The caller is
     * responsible for appropriate subdivision of records into containers and slices (see {@link ContainerFactory}).
     *
     * Determines whether the slice is single-reference, unmapped, or multi-reference, and derives alignment
     * boundaries for the slice if single-reference.
     *
     * Valid Slice states, by individual record contents:
     *
     * Single Reference: all records have valid placements/alignments on the same reference sequence
     * - records can be unmapped-but-placed
     * - reference can be external or embedded
     *
     * Multiple Reference: records may be placed or not, and may have differing reference sequences
     * - reference must not be embedded (not checked here)
     *
     * Unmapped: all records are unmapped and unplaced
     * - note however that we do not actually check mapping flags for unplaced reads
     *
     * @param records input CRAM Compression Records
     * @param compressionHeader the enclosing {@link Container}'s Compression Header
     * @param containerByteOffset the byte offset of the start of the enclosing container
     * @param globalRecordCounter the global record counter for the first record in this slice
     *
     * @see CRAMCompressionRecord#isPlaced()
     * @see ReferenceContextType
     */
    public Slice(
            final List<CRAMCompressionRecord> records,
            final CompressionHeader compressionHeader,
            final long containerByteOffset,
            final long globalRecordCounter) {
        ValidationUtils.validateArg(globalRecordCounter >= 0, "record counter must be >= 0");
        this.compressionHeader = compressionHeader;
        this.byteOffsetOfContainer = containerByteOffset;

        final ContentDigests hasher = ContentDigests.create(ContentDigests.ALL);
        final Set<ReferenceContext> referenceContexts = new HashSet<>();
        // ignore these values if we later determine this Slice is not single-ref
        int singleRefAlignmentStart = Integer.MAX_VALUE;
        int singleRefAlignmentEnd = SAMRecord.NO_ALIGNMENT_START;
        int baseCount = 0;
        for (final CRAMCompressionRecord record : records) {
            hasher.add(record);
            baseCount += record.getReadLength();
            if (record.isPlaced()) {
                referenceContexts.add(new ReferenceContext(record.getReferenceIndex()));
                singleRefAlignmentStart = Math.min(record.getAlignmentStart(), singleRefAlignmentStart);
                singleRefAlignmentEnd = Math.max(record.getAlignmentEnd(), singleRefAlignmentEnd);
                if (record.isSegmentUnmapped()) {
                    unmappedReadsCount++;
                } else {
                    mappedReadsCount++;
                }
            } else {
                referenceContexts.add(ReferenceContext.UNMAPPED_UNPLACED_CONTEXT);
            }
            // This matches the logic of BAMIndexMetadata.recordMetaData(SAMRecord) and counts all reads
            // that are not placed, which will be a subset of the reads included in unmappedReadsCount,
            // which counts all unmapped reads, whether placed or not.
            if (record.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
                unplacedReadsCount++;
            }
        }

        this.alignmentContext = getDerivedAlignmentContext(
                referenceContexts, singleRefAlignmentStart, singleRefAlignmentEnd);

        sliceTags = hasher.getAsTags();
        nRecords = records.size();
        this.baseCount = baseCount;
        this.globalRecordCounter = globalRecordCounter;

        final CramRecordWriter writer = new CramRecordWriter(this);
        sliceBlocks = writer.writeToSliceBlocks(records, alignmentContext.getAlignmentStart());

        // we can't calculate the number of blocks until after the record writer has written everything out
        nSliceBlocks = calculateNumberOfBlocks();
    }

    // May be null
    public Block getSliceHeaderBlock() { return sliceHeaderBlock; }

    public AlignmentContext getAlignmentContext() { return alignmentContext; }

    public SliceBlocks getSliceBlocks() { return sliceBlocks; }

    public int getNumberOfRecords() { return nRecords; }

    public long getGlobalRecordCounter() { return globalRecordCounter; }

    /**
     * @return the number of blocks as defined by the CRAM spec; this is 1 for the
     * core block, plus the number of external blocks (does not include the slice header block)
     */
    public int getNumberOfBlocks() { return nSliceBlocks; }

    public List<Integer> getContentIDs() { return contentIDs; }

    private void setContentIDs(final List<Integer> contentIDs) { this.contentIDs = contentIDs; }

    public byte[] getReferenceMD5() { return referenceMD5; }

    /**
     * The Slice's offset in bytes from the beginning of the Container's Compression Header
     * (or the end of the Container Header), equal to {@link ContainerHeader#getLandmarks()}.
     *
     * Used by BAI and CRAI indexing.
     */
    public int getByteOffsetOfSliceHeaderBlock() { return byteOffsetOfSliceHeaderBlock; }

    public void setByteOffsetOfSliceHeaderBlock(int byteOffsetOfSliceHeaderBlock) {
        this.byteOffsetOfSliceHeaderBlock = byteOffsetOfSliceHeaderBlock;
    }

    /**
     * The Slice's size in bytes.
     *
     * Used by CRAI indexing only.
     */
    public int getByteSizeOfSliceBlocks() { return byteSizeOfSliceBlocks; }

    public void setByteSizeOfSliceBlocks(int byteSizeOfSliceBlocks) {
        this.byteSizeOfSliceBlocks = byteSizeOfSliceBlocks;
    }

    public void setLandmarkIndex(int landmarkIndex) { this.landmarkIndex = landmarkIndex; }

    public long getBaseCount() { return baseCount; }

    public SAMBinaryTagAndValue getSliceTags() { return sliceTags; }

    private void setSliceTags(SAMBinaryTagAndValue sliceTags) { this.sliceTags = sliceTags; }

    private int getMappedReadsCount() { return mappedReadsCount; }

    private int getUnmappedReadsCount() { return unmappedReadsCount; }

    private int getUnplacedReadsCount() { return unplacedReadsCount; }

    /**
     * Set the content ID of the embedded reference block. Per the CRAM spec, the value can be
     * -1 ({@link #EMBEDDED_REFERENCE_ABSENT_CONTENT_ID}) to indicate no embedded reference block is
     * present. If the reference block content ID already has a non-{@link #EMBEDDED_REFERENCE_ABSENT_CONTENT_ID}
     * value, it cannot be reset. If the embedded reference block has already been set, the provided
     * reference block content ID must agree with the content ID of the existing block.
     *
     * @param embeddedReferenceBlockContentID the content ID of the embedded reference block
     */
    public void setEmbeddedReferenceContentID(final int embeddedReferenceBlockContentID) {
        if (this.embeddedReferenceBlockContentID != EMBEDDED_REFERENCE_ABSENT_CONTENT_ID &&
                this.embeddedReferenceBlockContentID != embeddedReferenceBlockContentID) {
            throw new CRAMException(
                    String.format("Can't reset embedded reference content ID (old %d new %d)",
                            this.embeddedReferenceBlockContentID, embeddedReferenceBlockContentID));
        }
        if (this.embeddedReferenceBlock != null &&
                this.embeddedReferenceBlock.getContentId() != embeddedReferenceBlockContentID) {
            throw new CRAMException(
                    String.format("Attempt to set embedded reference block content ID (%d) that is in conflict " +
                                    "with the content ID (%d) of the existing reference block",
                            embeddedReferenceBlockContentID, this.embeddedReferenceBlock.getContentId()));
        }
        this.embeddedReferenceBlockContentID = embeddedReferenceBlockContentID;
    }

    /**
     * Get the content ID of the embedded reference block. Per the CRAM spec, the value
     * can be {@link #EMBEDDED_REFERENCE_ABSENT_CONTENT_ID} (-1) to indicate no embedded reference block is
     * present.
     *
     * @return id of the embedded reference block if present, otherwise {@link #EMBEDDED_REFERENCE_ABSENT_CONTENT_ID}
     */
    public int getEmbeddedReferenceContentID() { return embeddedReferenceBlockContentID; }

    public void setEmbeddedReferenceBlock(final Block embeddedReferenceBlock) {
        ValidationUtils.nonNull(embeddedReferenceBlock, "Embedded reference block must be non-null");
        ValidationUtils.validateArg(embeddedReferenceBlock.getContentId() != EMBEDDED_REFERENCE_ABSENT_CONTENT_ID,
                String.format("Invalid content ID (%d) for embedded reference block",
                        embeddedReferenceBlock.getContentId()));
        ValidationUtils.validateArg(embeddedReferenceBlock.getContentType() == BlockContentType.EXTERNAL,
                String.format("Invalid embedded reference block type (%s)",
                        embeddedReferenceBlock.getContentType()));
        if (this.embeddedReferenceBlock != null) {
            throw new CRAMException("Can't reset the slice embedded reference block");
        } else if (this.embeddedReferenceBlockContentID != EMBEDDED_REFERENCE_ABSENT_CONTENT_ID &&
                embeddedReferenceBlock.getContentId() != this.embeddedReferenceBlockContentID) {
            throw new CRAMException(
                    String.format(
                            "Embedded reference block content ID (%d) conflicts with the existing block ID (%d)",
                            embeddedReferenceBlock.getContentId(), this.embeddedReferenceBlockContentID));
        }
        setEmbeddedReferenceContentID(embeddedReferenceBlock.getContentId());
        this.embeddedReferenceBlock = embeddedReferenceBlock;
    }

    /**
     * Return the embedded reference block, if any.
     *
     * @return embedded reference block. May be null.
     */
    // Unused because embedded reference isn't implemented for write
    public Block getEmbeddedReferenceBlock() { return embeddedReferenceBlock; }

    public CompressionHeader getCompressionHeader() { return compressionHeader; }

    /**
     * Reads and decodes the underlying blocks and returns a list of CRAMCompressionRecords. This isn't done
     * initially, when the blocks are read from the underlying stream, since there are cases where we want to
     * iterate through containers or slices and consume the underlying blocks, but not actually pay the price
     * to decode the records (i.e., during indexing, or when satisfying index queries).
     *
     * The CRAMRecords returned from this are not normalized (read bases, quality scores and mates have not
     * been resolved). See {@link #normalizeCRAMRecords} for more information about normalization.
     *
     * @param compressorCache cached compressor objects to use to decode the streams
     * @param validationStringency validation stringency to use
     * @return list of raw (not normalized) CRAMCompressionRecords for this Slice ({@link #normalizeCRAMRecords})
     */
    public ArrayList<CRAMCompressionRecord> deserializeCRAMRecords(
            final CompressorCache compressorCache,
            final ValidationStringency validationStringency) {
        final CramRecordReader cramRecordReader = new CramRecordReader(this, compressorCache, validationStringency);
        final ArrayList<CRAMCompressionRecord> cramCompressionRecords = new ArrayList<>(nRecords);

        // in the case where APDelta = true, the first record in the slice has a 0 position delta, so initialize
        // prevAlignmentStart using the slice alignment start
        int prevAlignmentStart = alignmentContext.getAlignmentStart();
        for (int i = 0; i < nRecords; i++) {
            // read the new record and update the running prevAlignmentStart
            final CRAMCompressionRecord cramCompressionRecord =
                    cramRecordReader.readCRAMRecord(globalRecordCounter + i, prevAlignmentStart);
            prevAlignmentStart = cramCompressionRecord.getAlignmentStart();
            cramCompressionRecords.add(cramCompressionRecord);
        }
        return cramCompressionRecords;
    }

    /**
     * Normalize a list of CRAMCompressionRecords that have been read in from a CRAM stream. Normalization
     * converts raw CRAM records to a state suitable for conversion to SAMRecords, resolving read bases
     * against the reference, as well as quality scores and mates.
     *
     * The records in the list being normalized should be the records from a Slice, not an entire Container,
     * since the relative positions of mate records are determined relative to the Slice (downstream)
     * offsets.
     *
     * NOTE: This mutates (normalizes) the CRAM records in place.
     *
     * @param cramCompressionRecords CRAMCompressionRecords to normalize
     * @param cramReferenceRegion the reference region for this slice
     */
    public void normalizeCRAMRecords(final List<CRAMCompressionRecord> cramCompressionRecords,
                                     final CRAMReferenceRegion cramReferenceRegion) {
        boolean hasEmbeddedReference = false;
        if (compressionHeader.isReferenceRequired()) {
            // get the reference bases required for the entire slice and validate the reference MD5
            final AlignmentContext sliceAlignmentContext = getAlignmentContext();
            if (sliceAlignmentContext.getReferenceContext().isMappedSingleRef()) {
                cramReferenceRegion.fetchReferenceBasesByRegion(sliceAlignmentContext);
                validateReferenceBases(cramReferenceRegion);
            }
        } else {
            // RR = false might mean that no reference compression was used, or that an embedded reference
            // was used, so if there is an embedded ref block, use it, and either way, skip MD5 validation
            final Block embeddedReferenceBlock = getEmbeddedReferenceBlock();
            if (embeddedReferenceBlock != null) {
                hasEmbeddedReference = true;
                cramReferenceRegion.setEmbeddedReferenceBases(
                        embeddedReferenceBlock.getUncompressedContent(new CompressorCache()),
                        getAlignmentContext().getReferenceContext().getReferenceSequenceID(),
                        alignmentContext.getAlignmentStart() - 1);
            }
        }

        // restore mate pairing first:
        for (final CRAMCompressionRecord record : cramCompressionRecords) {
            if (record.isReadPaired() && !record.isDetached() && record.isHasMateDownStream()) {
                final CRAMCompressionRecord downMate = cramCompressionRecords.get(
                        // getRecordsToNextFragment returns the value from the NF ("next fragment") data series,
                        // which is interpreted as the number of records to skip within this slice to find the
                        // next mate for this fragment
                        (int) (record.getSequentialIndex() + record.getRecordsToNextFragment() + 1L - globalRecordCounter));
                record.setNextSegment(downMate);
                downMate.setPreviousSegment(record);
            }
        }
        for (final CRAMCompressionRecord record : cramCompressionRecords) {
            if (record.getPreviousSegment() == null && record.getNextSegment() != null) {
                record.restoreMateInfo();
            }
        }

        // assign read names if needed (should be called after mate resolution so generated names
        // can be propagated to mates):
        for (final CRAMCompressionRecord record : cramCompressionRecords) {
            record.assignReadName();
        }

        // resolve bases:
        for (final CRAMCompressionRecord record : cramCompressionRecords) {
            if (!record.isSegmentUnmapped()) { // read bases for unmapped are restored directly from the input stream
                if (compressionHeader.isReferenceRequired() &&
                        getAlignmentContext().getReferenceContext().isMultiRef() &&
                        !record.isUnknownBases() &&
                        !hasEmbeddedReference) {
                    // if the slice is a multi-ref slice because the reads are not coordinate-sorted, this code
                    // can wind up needing to re-resolve the reference bases for *each* record in the slice, which
                    // can in turn be pathologically slow, especially if the reference source is remote
                    cramReferenceRegion.fetchReferenceBasesByRegion(
                            record.getReferenceIndex(),
                            record.getAlignmentStart() - 1, // 1-based to 0-based
                            record.getAlignmentEnd() - record.getAlignmentStart() + 1);
                }
                record.restoreReadBases(
                        cramReferenceRegion,
                        getCompressionHeader().getSubstitutionMatrix());
            }
        }

        for (final CRAMCompressionRecord record : cramCompressionRecords) {
            // resolve quality scores:
            record.resolveQualityScores();
            // in this last pass, set all records as normalized
            record.setIsNormalized();
        }
    }

    private int calculateNumberOfBlocks() {
        // Each Slice has 1 core data block, plus zero or more external data blocks.
        // Since an embedded reference block is just stored as an external block, it is included in
        // the external block count, and does not need to be counted separately.
        return 1 + getSliceBlocks().getNumberOfExternalBlocks();
    }

    public void write(final CRAMVersion cramVersion, final OutputStream outputStream) {
        // establish our header block, then write it out
        sliceHeaderBlock = Block.createRawSliceHeaderBlock(createSliceHeaderBlockContent(cramVersion));
        sliceHeaderBlock.write(cramVersion, outputStream);

        // write the core and external blocks
        getSliceBlocks().writeBlocks(cramVersion, outputStream);
    }

    private byte[] createSliceHeaderBlockContent(final CRAMVersion cramVersion) {
        final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ITF8.writeUnsignedITF8(getAlignmentContext().getReferenceContext().getReferenceContextID(),
                byteArrayOutputStream);
        ITF8.writeUnsignedITF8(getAlignmentContext().getAlignmentStart(), byteArrayOutputStream);
        ITF8.writeUnsignedITF8(getAlignmentContext().getAlignmentSpan(), byteArrayOutputStream);
        ITF8.writeUnsignedITF8(getNumberOfRecords(), byteArrayOutputStream);
        LTF8.writeUnsignedLTF8(getGlobalRecordCounter(), byteArrayOutputStream);
        ITF8.writeUnsignedITF8(getNumberOfBlocks(), byteArrayOutputStream);

        setContentIDs(getSliceBlocks().getExternalContentIDs());
        CramIntArray.write(getContentIDs(), byteArrayOutputStream);
        ITF8.writeUnsignedITF8(getEmbeddedReferenceContentID(), byteArrayOutputStream);
        try {
            byteArrayOutputStream.write(getReferenceMD5() == null ?
                    new byte[MD5_BYTE_SIZE] :
                    getReferenceMD5());
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }

        if (cramVersion.getMajor() >= CramVersions.CRAM_v3.getMajor()) {
            SAMBinaryTagAndValue samBinaryTagAndValue = getSliceTags();
            if (samBinaryTagAndValue != null) {
                try (final BinaryCodec binaryCodec = new BinaryCodec(byteArrayOutputStream)) {
                    final BinaryTagCodec binaryTagCodec = new BinaryTagCodec(binaryCodec);
                    while (samBinaryTagAndValue != null) {
                        binaryTagCodec.writeTag(
                                samBinaryTagAndValue.tag,
                                samBinaryTagAndValue.value,
                                samBinaryTagAndValue.isUnsignedArray());
                        samBinaryTagAndValue = samBinaryTagAndValue.getNext();
                    }
                }
            }
        }
        return byteArrayOutputStream.toByteArray();
    }

    /**
     * Confirm that we have initialized the 3 BAI index parameters:
     * byteOffsetOfSliceHeaderBlock, containerByteOffset (always set at construction), and landmarkIndex.
     */
    private void baiIndexInitializationCheck() {
        final StringBuilder error = new StringBuilder();
        if (byteOffsetOfSliceHeaderBlock == UNINITIALIZED_INDEXING_PARAMETER) {
            error.append("Cannot index this Slice for BAI because its byteOffsetOfSliceHeaderBlock is unknown.")
                    .append(System.lineSeparator());
        }
        if (landmarkIndex == UNINITIALIZED_INDEXING_PARAMETER) {
            error.append("Cannot index this Slice for BAI because its landmarkIndex is unknown.")
                    .append(System.lineSeparator());
        }
        if (error.length() > 0) {
            throw new CRAMException(error.toString());
        }
    }

    /**
     * Confirm that we have initialized the 3 CRAI index parameters:
     * byteOffsetOfSliceHeaderBlock, containerByteOffset (always set at construction), and byteSizeOfSliceBlocks.
     */
    private void craiIndexInitializationCheck() {
        final StringBuilder error = new StringBuilder();
        if (byteOffsetOfSliceHeaderBlock == UNINITIALIZED_INDEXING_PARAMETER) {
            error.append("Cannot index this Slice for CRAI because its byteOffsetOfSliceHeaderBlock is unknown.")
                    .append(System.lineSeparator());
        }
        if (byteSizeOfSliceBlocks == UNINITIALIZED_INDEXING_PARAMETER) {
            error.append("Cannot index this Slice for CRAI because its byteSizeOfSliceBlocks is unknown.")
                    .append(System.lineSeparator());
        }
        if (error.length() > 0) {
            throw new CRAMException(error.toString());
        }
    }

    private static AlignmentContext getDerivedAlignmentContext(
            final Set<ReferenceContext> sliceReferenceContexts,
            final int singleRefAlignmentStart,
            final int singleRefAlignmentEnd) {
        ReferenceContext referenceContext;
        switch (sliceReferenceContexts.size()) {
            case 0:
                referenceContext = ReferenceContext.UNMAPPED_UNPLACED_CONTEXT;
                break;
            case 1:
                // there is only one reference context: all placed reads are on the same reference,
                // or all reads are unmapped
                referenceContext = sliceReferenceContexts.iterator().next();
                break;
            default:
                // placed reads on multiple references, and/or a combination of placed and unplaced reads
                referenceContext = ReferenceContext.MULTIPLE_REFERENCE_CONTEXT;
        }

        if (referenceContext.isMappedSingleRef()) {
            AlignmentContext.validateAlignmentContext(
                    true, referenceContext,
                    singleRefAlignmentStart,
                    singleRefAlignmentEnd - singleRefAlignmentStart + 1);
            return new AlignmentContext(
                    referenceContext,
                    singleRefAlignmentStart,
                    singleRefAlignmentEnd - singleRefAlignmentStart + 1);
        } else if (referenceContext.isUnmappedUnplaced()) {
            return AlignmentContext.UNMAPPED_UNPLACED_CONTEXT;
        } else {
            return AlignmentContext.MULTIPLE_REFERENCE_CONTEXT;
        }
    }

    private void validateAlignmentSpanForReference(final CRAMReferenceRegion cramReferenceRegion) {
        final byte[] referenceBases = cramReferenceRegion.getCurrentReferenceBases();
        if (alignmentContext.getReferenceContext().isUnmappedUnplaced()) {
            return;
        }
        if (referenceBases == null &&
                alignmentContext.getAlignmentStart() > 0 &&
                alignmentContext.getReferenceContext().isMappedSingleRef()) {
            throw new CRAMException("No reference bases found for mapped slice.");
        }
        // TODO: CRAMComplianceTest/c1#bounds triggers this (the reads are mapped beyond the reference length),
        // and CRAMEdgeCasesTest.testNullsAndBeyondRef seems to deliberately test that reads that extend
        // beyond the reference length should be ok?
        if (((alignmentContext.getAlignmentStart() - 1) < cramReferenceRegion.getRegionStart()) ||
                (alignmentContext.getAlignmentSpan() > cramReferenceRegion.getRegionLength())) {
            log.warn(String.format(
                    "Slice mapped outside of reference bases length %d: slice reference context=%s, start=%d, span=%d, counter=%d.",
                    cramReferenceRegion.getFullContigLength(),
                    alignmentContext.getReferenceContext(),
                    alignmentContext.getAlignmentStart(),
                    alignmentContext.getAlignmentSpan(),
                    globalRecordCounter));
        }
    }

    // VisibleForTesting
    void validateReferenceBases(final CRAMReferenceRegion cramReferenceRegion) {
        if (alignmentContext.getReferenceContext().isMappedSingleRef() &&
                compressionHeader.isReferenceRequired()) {
            validateAlignmentSpanForReference(cramReferenceRegion);
            if (!referenceMD5IsValid(
                    cramReferenceRegion,
                    alignmentContext.getAlignmentSpan(),
                    referenceMD5)) {
                throw new CRAMException(
                        String.format(
                                "The MD5 for the reference failed to validate against the expected value %032x. %s.",
                                new BigInteger(1, referenceMD5),
                                "This indicates that the supplied reference is not the one originally used to create the CRAM"));
            }
        }
    }

    private static boolean referenceMD5IsValid(
            final CRAMReferenceRegion cramReferenceRegion,
            final int alignmentSpan,
            final byte[] expectedMD5) {
        final byte[] referenceBases = cramReferenceRegion.getCurrentReferenceBases();
        final int span = Math.min(alignmentSpan, referenceBases.length);
        // use offset 0 here, based on the assumption that we're always using a CRAMReferenceRegion that
        // has bases that start exactly at the start of the span of the slice alignment context
        final byte[] md5 = SequenceUtil.calculateMD5(referenceBases, 0, span);
        return Arrays.equals(md5, expectedMD5);
    }

    @Override
    public String toString() {
        return String.format(
                "slice: %s globalRecordCounter=%d, nRecords=%d, sliceHeaderOffset=%d, sizeOfBlocks=%d, landmark=%d, mapped/unmapped/unplaced: %d/%d/%d, md5=%s",
                alignmentContext,
                globalRecordCounter,
                nRecords,
                getByteOffsetOfSliceHeaderBlock(),
                getByteSizeOfSliceBlocks(),
                landmarkIndex,
                mappedReadsCount,
                unmappedReadsCount,
                unplacedReadsCount,
                String.format("%032x", new BigInteger(1, getReferenceMD5())));
    }

    // *calculate* the MD5 for this reference
    public void setReferenceMD5(final CRAMReferenceRegion cramReferenceRegion) {
        validateAlignmentSpanForReference(cramReferenceRegion);

        final byte[] referenceBases = cramReferenceRegion.getCurrentReferenceBases();
        // TODO: how can an alignment context have a start < 1 ?
        if (!alignmentContext.getReferenceContext().isMappedSingleRef() &&
                alignmentContext.getAlignmentStart() < 1) {
            referenceMD5 = new byte[MD5_BYTE_SIZE];
        } else {
            final int span = Math.min(alignmentContext.getAlignmentSpan(), referenceBases.length);
            // use offset 0, which is correct given the assumption that we are ALWAYS using
            // a region that has bases that reflect the span of the alignment context
            referenceMD5 = SequenceUtil.calculateMD5(referenceBases, 0, span);
        }
    }

    /**
     * Uses a Multiple Reference Slice Alignment Reader to determine the reference spans of a MULTI_REF Slice.
     * Used for creating CRAI/BAI index entries.
     *
     * @param compressorCache cached compressor objects to use to decode the streams
     * @param validationStringency how strict to be when reading CRAM records
     */
    // VisibleForTesting
    public Map<ReferenceContext, AlignmentSpan> getMultiRefAlignmentSpans(
            final CompressorCache compressorCache,
            final ValidationStringency validationStringency) {
        if (!getAlignmentContext().getReferenceContext().isMultiRef()) {
            throw new IllegalStateException("can only create a multi-ref span reader for a multi-ref context slice");
        }

        // Ideally, we wouldn't have to decode the records just to get the slice spans; multi-ref
        // slices use the RI series for the reference index, so in theory we could just decode that; but we also
        // need the alignment start and span for indexing. Is it possible to do this more efficiently?
        // See https://github.com/samtools/htsjdk/issues/1347.
        // Note that this doesn't normalize the CRAMCompressionRecords, which bypasses resolution of bases
        // against the reference.
        final List<CRAMCompressionRecord> cramCompressionRecords =
                deserializeCRAMRecords(compressorCache, validationStringency);

        final Map<ReferenceContext, AlignmentSpan> spans = new HashMap<>();
        cramCompressionRecords.forEach(r -> mergeRecordSpan(r, spans));
        return Collections.unmodifiableMap(spans);
    }

    private void mergeRecordSpan(final CRAMCompressionRecord cramCompressionRecord,
                                 final Map<ReferenceContext, AlignmentSpan> spans) {
        // we need to combine the records' spans for counting and span calculation
        if (cramCompressionRecord.isSegmentUnmapped()) {
            if (cramCompressionRecord.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
                // count it as both unmapped *and* unplaced, since for BAI we distinguish between them
                final AlignmentSpan span = new AlignmentSpan(
                        SAMRecord.NO_ALIGNMENT_START, 0, 0, 1, 1);
                spans.merge(ReferenceContext.UNMAPPED_UNPLACED_CONTEXT, span, AlignmentSpan::combine);
            } else {
                // merge it in with the reference context it's placed on
                final AlignmentSpan span = new AlignmentSpan(
                        cramCompressionRecord.getAlignmentStart(),
                        cramCompressionRecord.getReadLength(),
                        0, 1, 0);
                final int refIndex = cramCompressionRecord.getReferenceIndex();
                spans.merge(new ReferenceContext(refIndex), span, AlignmentSpan::combine);
            }
        } else {
            // 1 mapped, 0 unmapped, 0 unplaced
            final AlignmentSpan span = new AlignmentSpan(
                    cramCompressionRecord.getAlignmentStart(),
                    cramCompressionRecord.getAlignmentEnd() - cramCompressionRecord.getAlignmentStart(),
                    1, 0, 0);
            final ReferenceContext recordContext = new ReferenceContext(cramCompressionRecord.getReferenceIndex());
            spans.merge(recordContext, span, AlignmentSpan::combine);
        }
    }

    /**
     * Generate a CRAI index entry from this Slice and other container parameters,
     * splitting Multiple Reference slices into constituent reference sequence entries.
     *
     * @return a list of CRAI index entries derived from this Slice
     */
    // Each entry represents a slice in the CRAM file. Please note that all slices must be listed in the index
    // file. Multi-reference slices may need to have multiple entries for the same slice; one for each reference
    // contained within that slice. In this case the index reference sequence ID will be the actual reference ID
    // (from the "RI" data series) and not -2.
    //
    // Slices containing solely unmapped unplaced data (reference ID -1) still require values for all columns,
    // although the alignment start and span will be ignored. It is recommended that they are both set to zero.
    public List<CRAIEntry> getCRAIEntries(final CompressorCache compressorCache) {
        craiIndexInitializationCheck();

        if (alignmentContext.getReferenceContext().isMultiRef()) {
            final Map<ReferenceContext, AlignmentSpan> spans = getMultiRefAlignmentSpans(
                    compressorCache,
                    ValidationStringency.DEFAULT_STRINGENCY);

            return spans.entrySet().stream()
                    .map(e -> new CRAIEntry(
                            e.getKey().getReferenceContextID(),
                            e.getValue().getAlignmentStart(),
                            e.getValue().getAlignmentSpan(),
                            byteOffsetOfContainer,
                            byteOffsetOfSliceHeaderBlock,
                            byteSizeOfSliceBlocks))
                    .sorted()
                    .collect(Collectors.toList());
        } else {
            // single ref or unmapped
            final int sequenceId = alignmentContext.getReferenceContext().getReferenceContextID();
            return Collections.singletonList(
                    new CRAIEntry(
                            sequenceId,
                            alignmentContext.getAlignmentStart(),
                            alignmentContext.getAlignmentSpan(),
                            byteOffsetOfContainer,
                            byteOffsetOfSliceHeaderBlock,
                            byteSizeOfSliceBlocks));
        }
    }

    /**
     * Generate BAIEntry index entries from this Slice and other container parameters,
     * splitting Multiple Reference slices into constituent reference sequence entries.
     *
     * @return a list of BAIEntry index entries derived from this Slice
     */
    public List<BAIEntry> getBAIEntries(final CompressorCache compressorCache) {
        baiIndexInitializationCheck();

        final List<BAIEntry> baiEntries = new ArrayList<>();
        switch (getAlignmentContext().getReferenceContext().getType()) {
            case UNMAPPED_UNPLACED_TYPE:
                baiEntries.add(
                        new BAIEntry(
                                getAlignmentContext().getReferenceContext(),
                                new AlignmentSpan(
                                        0,
                                        0,
                                        mappedReadsCount, // aligned
                                        unmappedReadsCount,
                                        unplacedReadsCount),
                                byteOffsetOfContainer,
                                byteOffsetOfSliceHeaderBlock,
                                landmarkIndex));
                break;

            case MULTIPLE_REFERENCE_TYPE:
                // NOTE: it's possible that there are several different reference contexts embedded in this slice
                // (i.e., there might be only one record per reference context, and thus not enough of any one
                // to warrant a separate slice); the unmapped span needs to go last
                final Map<ReferenceContext, AlignmentSpan> sliceSpanMap = getMultiRefAlignmentSpans(
                        compressorCache,
                        ValidationStringency.LENIENT);
                sliceSpanMap.entrySet().stream()
                        .filter(as -> !as.getKey().equals(ReferenceContext.UNMAPPED_UNPLACED_CONTEXT))
                        .forEach(entry -> baiEntries.add(
                                new BAIEntry(
                                        entry.getKey(),
                                        new AlignmentSpan(
                                                entry.getValue().getAlignmentStart(),
                                                entry.getValue().getAlignmentSpan(),
                                                entry.getValue().getMappedCount(),
                                                entry.getValue().getUnmappedCount(),
                                                entry.getValue().getUnmappedUnplacedCount()),
                                        byteOffsetOfContainer,
                                        byteOffsetOfSliceHeaderBlock,
                                        landmarkIndex)));
                final AlignmentSpan unmappedSpan = sliceSpanMap.get(ReferenceContext.UNMAPPED_UNPLACED_CONTEXT);
                if (unmappedSpan != null) {
                    baiEntries.add(
                            new BAIEntry(
                                    ReferenceContext.UNMAPPED_UNPLACED_CONTEXT,
                                    unmappedSpan,
                                    byteOffsetOfContainer,
                                    byteOffsetOfSliceHeaderBlock,
                                    landmarkIndex));
                }
                break;

            default:
                baiEntries.add(
                        new BAIEntry(
                                getAlignmentContext().getReferenceContext(),
                                new AlignmentSpan(
                                        getAlignmentContext().getAlignmentStart(),
                                        getAlignmentContext().getAlignmentSpan(),
                                        getMappedReadsCount(),
                                        getUnmappedReadsCount(),
                                        getUnplacedReadsCount()),
                                byteOffsetOfContainer,
                                byteOffsetOfSliceHeaderBlock,
                                landmarkIndex));
                break;
        }
        return baiEntries;
    }
}
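For orientation, here is a minimal sketch of how the deserializing constructor and the deferred record decoding described above might be driven. It assumes the input stream is already positioned at the first slice of a container (i.e., the container header and the compression header block have already been consumed), and that the import paths match your htsjdk version; SliceReadSketch and dumpSlice are hypothetical names, not htsjdk API.

import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.cram.common.CramVersions;
import htsjdk.samtools.cram.structure.CompressionHeader;
import htsjdk.samtools.cram.structure.CompressorCache;
import htsjdk.samtools.cram.structure.CRAMCompressionRecord;
import htsjdk.samtools.cram.structure.Slice;

import java.io.InputStream;
import java.util.List;

public class SliceReadSketch {
    // containerStream must be positioned at the first slice of the container,
    // i.e. just past the container header and compression header blocks (assumption).
    static void dumpSlice(final InputStream containerStream,
                          final CompressionHeader compressionHeader,
                          final long containerByteOffset) {
        // Reads the slice header block and the core/external blocks, but does NOT
        // decode the records yet; decoding is deferred, per the class comments above.
        final Slice slice = new Slice(
                CramVersions.CRAM_v3, compressionHeader, containerStream, containerByteOffset);
        System.out.printf("slice context=%s, records=%d%n",
                slice.getAlignmentContext().getReferenceContext(),
                slice.getNumberOfRecords());

        // Explicitly decode the raw (not yet normalized) records.
        final List<CRAMCompressionRecord> records = slice.deserializeCRAMRecords(
                new CompressorCache(), ValidationStringency.DEFAULT_STRINGENCY);
        records.forEach(r -> System.out.println(r.getAlignmentStart()));
    }
}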
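Similarly, a sketch of producing CRAI index entries, matching the indexing-parameter checks in craiIndexInitializationCheck(). In htsjdk the slice header offset and block size come from the container's landmarks and the measured size of the slice's blocks; here they are hypothetical caller-supplied values, and indexSlice is a hypothetical helper.

import htsjdk.samtools.cram.CRAIEntry;
import htsjdk.samtools.cram.structure.CompressorCache;
import htsjdk.samtools.cram.structure.Slice;

import java.util.List;

public class SliceIndexSketch {
    // sliceHeaderOffset and sliceBlocksByteSize are assumed inputs recorded while
    // writing or scanning the container (see ContainerHeader#getLandmarks()).
    static List<CRAIEntry> indexSlice(final Slice slice,
                                      final int sliceHeaderOffset,
                                      final int sliceBlocksByteSize) {
        // Both values must be set before indexing, or craiIndexInitializationCheck() throws.
        slice.setByteOffsetOfSliceHeaderBlock(sliceHeaderOffset);
        slice.setByteSizeOfSliceBlocks(sliceBlocksByteSize);
        // MULTI_REF slices are split into one entry per constituent reference context
        // (which requires decoding the records); other slices yield exactly one entry.
        return slice.getCRAIEntries(new CompressorCache());
    }
}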




