All Downloads are FREE. Search and download functionalities are using the official Maven repository.

htsjdk.samtools.cram.encoding.reader.CramRecordReader Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/**
 * ****************************************************************************
 * Copyright 2013 EMBL-EBI
 * 

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at *

* http://www.apache.org/licenses/LICENSE-2.0 *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * **************************************************************************** */ package htsjdk.samtools.cram.encoding.reader; import htsjdk.samtools.SAMFormatException; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.cram.encoding.readfeatures.*; import htsjdk.samtools.cram.ref.ReferenceContext; import htsjdk.samtools.cram.structure.*; import htsjdk.samtools.cram.io.BitInputStream; import htsjdk.samtools.util.RuntimeIOException; import java.io.ByteArrayInputStream; import java.nio.charset.Charset; import java.util.LinkedList; import java.util.Map; import java.util.stream.Collectors; public class CramRecordReader { private final DataSeriesReader bitFlagsCodec; private final DataSeriesReader compressionBitFlagsCodec; private final DataSeriesReader readLengthCodec; private final DataSeriesReader alignmentStartCodec; private final DataSeriesReader readGroupCodec; private final DataSeriesReader readNameCodec; private final DataSeriesReader distanceToNextFragmentCodec; private final Map> tagValueCodecs; private final DataSeriesReader numberOfReadFeaturesCodec; private final DataSeriesReader readFeaturePositionCodec; private final DataSeriesReader readFeatureCodeCodec; private final DataSeriesReader baseCodec; private final DataSeriesReader qualityScoreCodec; private final DataSeriesReader qualityScoreArrayCodec; private final DataSeriesReader baseSubstitutionCodec; private final DataSeriesReader insertionCodec; private final DataSeriesReader softClipCodec; private final DataSeriesReader hardClipCodec; private final DataSeriesReader paddingCodec; private final DataSeriesReader deletionLengthCodec; private final DataSeriesReader mappingScoreCodec; private final DataSeriesReader mateBitFlagCodec; private final DataSeriesReader mateReferenceIdCodec; private final DataSeriesReader mateAlignmentStartCodec; private final DataSeriesReader insertSizeCodec; private final DataSeriesReader tagIdListCodec; private final DataSeriesReader refIdCodec; private final DataSeriesReader refSkipCodec; private final DataSeriesReader basesCodec; private final DataSeriesReader scoresCodec; private final Charset charset = Charset.forName("UTF8"); private final boolean captureReadNames; private final byte[][][] tagIdDictionary; private final ReferenceContext refContext; protected final ValidationStringency validationStringency; protected final boolean APDelta; private final Map encodingMap; private final BitInputStream coreBlockInputStream; private final Map externalBlockInputMap; private CramCompressionRecord prevRecord; private int recordCounter = 0; /** * Initialize a Cram Record Reader * * @param coreInputStream Core data block bit stream, to be read by non-external Encodings * @param externalInputMap External data block byte stream map, to be read by external Encodings * @param header the associated Cram Compression Header * @param refContext the reference context to assign to these records * @param validationStringency how strict to be when reading this CRAM record */ public CramRecordReader(final BitInputStream coreInputStream, final Map externalInputMap, final CompressionHeader header, final ReferenceContext refContext, final ValidationStringency validationStringency) { this.captureReadNames = header.readNamesIncluded; this.tagIdDictionary = header.dictionary; this.refContext = refContext; this.validationStringency = validationStringency; this.APDelta = header.APDelta; this.encodingMap = header.encodingMap; this.coreBlockInputStream = coreInputStream; this.externalBlockInputMap = externalInputMap; bitFlagsCodec = createDataReader(DataSeries.BF_BitFlags); compressionBitFlagsCodec = createDataReader(DataSeries.CF_CompressionBitFlags); readLengthCodec = createDataReader(DataSeries.RL_ReadLength); alignmentStartCodec = createDataReader(DataSeries.AP_AlignmentPositionOffset); readGroupCodec = createDataReader(DataSeries.RG_ReadGroup); readNameCodec = createDataReader(DataSeries.RN_ReadName); distanceToNextFragmentCodec = createDataReader(DataSeries.NF_RecordsToNextFragment); numberOfReadFeaturesCodec = createDataReader(DataSeries.FN_NumberOfReadFeatures); readFeaturePositionCodec = createDataReader(DataSeries.FP_FeaturePosition); readFeatureCodeCodec = createDataReader(DataSeries.FC_FeatureCode); baseCodec = createDataReader(DataSeries.BA_Base); qualityScoreCodec = createDataReader(DataSeries.QS_QualityScore); baseSubstitutionCodec = createDataReader(DataSeries.BS_BaseSubstitutionCode); insertionCodec = createDataReader(DataSeries.IN_Insertion); softClipCodec = createDataReader(DataSeries.SC_SoftClip); hardClipCodec = createDataReader(DataSeries.HC_HardClip); paddingCodec = createDataReader(DataSeries.PD_padding); deletionLengthCodec = createDataReader(DataSeries.DL_DeletionLength); mappingScoreCodec = createDataReader(DataSeries.MQ_MappingQualityScore); mateBitFlagCodec = createDataReader(DataSeries.MF_MateBitFlags); mateReferenceIdCodec = createDataReader(DataSeries.NS_NextFragmentReferenceSequenceID); mateAlignmentStartCodec = createDataReader(DataSeries.NP_NextFragmentAlignmentStart); insertSizeCodec = createDataReader(DataSeries.TS_InsertSize); tagIdListCodec = createDataReader(DataSeries.TL_TagIdList); refIdCodec = createDataReader(DataSeries.RI_RefId); refSkipCodec = createDataReader(DataSeries.RS_RefSkip); basesCodec = createDataReader(DataSeries.BB_bases); scoresCodec = createDataReader(DataSeries.QQ_scores); // special case: re-encodes QS as a byte array qualityScoreArrayCodec = new DataSeriesReader<>(DataSeriesType.BYTE_ARRAY, header.encodingMap.get(DataSeries.QS_QualityScore), coreInputStream, externalInputMap); tagValueCodecs = header.tMap.entrySet() .stream() .collect(Collectors.toMap(Map.Entry::getKey, mapEntry -> new DataSeriesReader<>(DataSeriesType.BYTE_ARRAY, mapEntry.getValue(), coreInputStream, externalInputMap))); } /** * Look up a Data Series in the Cram Compression Header's Encoding Map. If found, create a Data Reader * * @param dataSeries Which Data Series to write * @param The Java data type associated with the Data Series * @return a Data Reader for the given Data Series, or null if it's not in the encoding map */ private DataSeriesReader createDataReader(DataSeries dataSeries) { if (encodingMap.containsKey(dataSeries)) { return new DataSeriesReader<>(dataSeries.getType(), encodingMap.get(dataSeries), coreBlockInputStream, externalBlockInputMap); } else { return null; } } /** * Read a Cram Compression Record, using this class's Encodings * * @param cramRecord the Cram Compression Record to read into * @param prevAlignmentStart the alignmentStart of the previous record, for delta calculation * @return the alignmentStart of the newly-read record */ public int read(final CramCompressionRecord cramRecord, final int prevAlignmentStart) { try { // int mark = testCodec.readData(); // if (Writer.TEST_MARK != mark) { // System.err.println("Record counter=" + recordCount); // System.err.println(cramRecord.toString()); // throw new RuntimeException("Test mark not found."); // } cramRecord.flags = bitFlagsCodec.readData(); cramRecord.compressionFlags = compressionBitFlagsCodec.readData(); if (refContext.isMultiRef()) { cramRecord.sequenceId = refIdCodec.readData(); } else { // either unmapped (-1) or a valid ref cramRecord.sequenceId = refContext.getSerializableId(); } cramRecord.readLength = readLengthCodec.readData(); if (APDelta) { cramRecord.alignmentStart = prevAlignmentStart + alignmentStartCodec.readData(); } else { cramRecord.alignmentStart = alignmentStartCodec.readData(); } cramRecord.readGroupID = readGroupCodec.readData(); if (captureReadNames) { cramRecord.readName = new String(readNameCodec.readData(), charset); } // mate record: if (cramRecord.isDetached()) { cramRecord.mateFlags = mateBitFlagCodec.readData(); if (!captureReadNames) { cramRecord.readName = new String(readNameCodec.readData(), charset); } cramRecord.mateSequenceID = mateReferenceIdCodec.readData(); cramRecord.mateAlignmentStart = mateAlignmentStartCodec.readData(); cramRecord.templateSize = insertSizeCodec.readData(); } else if (cramRecord.isHasMateDownStream()) { cramRecord.recordsToNextFragment = distanceToNextFragmentCodec.readData(); } final Integer tagIdList = tagIdListCodec.readData(); final byte[][] ids = tagIdDictionary[tagIdList]; if (ids.length > 0) { final int tagCount = ids.length; cramRecord.tags = new ReadTag[tagCount]; for (int i = 0; i < ids.length; i++) { final int id = ReadTag.name3BytesToInt(ids[i]); final DataSeriesReader dataSeriesReader = tagValueCodecs.get(id); final ReadTag tag = new ReadTag(id, dataSeriesReader.readData(), validationStringency); cramRecord.tags[i] = tag; } } if (!cramRecord.isSegmentUnmapped()) { // reading read features: final int size = numberOfReadFeaturesCodec.readData(); int prevPos = 0; final java.util.List readFeatures = new LinkedList<>(); cramRecord.readFeatures = readFeatures; for (int i = 0; i < size; i++) { final Byte operator = readFeatureCodeCodec.readData(); final int pos = prevPos + readFeaturePositionCodec.readData(); prevPos = pos; switch (operator) { case ReadBase.operator: final ReadBase readBase = new ReadBase(pos, baseCodec.readData(), qualityScoreCodec.readData()); readFeatures.add(readBase); break; case Substitution.operator: final Substitution substitution = new Substitution(); substitution.setPosition(pos); final byte code = baseSubstitutionCodec.readData(); substitution.setCode(code); readFeatures.add(substitution); break; case Insertion.operator: final Insertion insertion = new Insertion(pos, insertionCodec.readData()); readFeatures.add(insertion); break; case SoftClip.operator: final SoftClip softClip = new SoftClip(pos, softClipCodec.readData()); readFeatures.add(softClip); break; case HardClip.operator: final HardClip hardCLip = new HardClip(pos, hardClipCodec.readData()); readFeatures.add(hardCLip); break; case Padding.operator: final Padding padding = new Padding(pos, paddingCodec.readData()); readFeatures.add(padding); break; case Deletion.operator: final Deletion deletion = new Deletion(pos, deletionLengthCodec.readData()); readFeatures.add(deletion); break; case RefSkip.operator: final RefSkip refSkip = new RefSkip(pos, refSkipCodec.readData()); readFeatures.add(refSkip); break; case InsertBase.operator: final InsertBase insertBase = new InsertBase(pos, baseCodec.readData()); readFeatures.add(insertBase); break; case BaseQualityScore.operator: final BaseQualityScore baseQualityScore = new BaseQualityScore(pos, qualityScoreCodec.readData()); readFeatures.add(baseQualityScore); break; case Bases.operator: final Bases bases = new Bases(pos, basesCodec.readData()); readFeatures.add(bases); break; case Scores.operator: final Scores scores = new Scores(pos, scoresCodec.readData()); readFeatures.add(scores); break; default: throw new RuntimeException("Unknown read feature operator: " + operator); } } // mapping quality: cramRecord.mappingQuality = mappingScoreCodec.readData(); if (cramRecord.isForcePreserveQualityScores()) { cramRecord.qualityScores = qualityScoreArrayCodec.readDataArray(cramRecord.readLength); } } else { if (cramRecord.isUnknownBases()) { cramRecord.readBases = SAMRecord.NULL_SEQUENCE; cramRecord.qualityScores = SAMRecord.NULL_QUALS; } else { final byte[] bases = new byte[cramRecord.readLength]; for (int i = 0; i < bases.length; i++) { bases[i] = baseCodec.readData(); } cramRecord.readBases = bases; if (cramRecord.isForcePreserveQualityScores()) { cramRecord.qualityScores = qualityScoreArrayCodec.readDataArray(cramRecord.readLength); } } } recordCounter++; prevRecord = cramRecord; } catch (final SAMFormatException e) { if (prevRecord != null) { System.err.printf("Failed at record %d. Here is the previously read record: %s\n", recordCounter, prevRecord.toString()); } throw e; } catch (final Exception e) { if (prevRecord != null) { System.err.printf("Failed at record %d. Here is the previously read record: %s\n", recordCounter, prevRecord.toString()); } throw new RuntimeIOException(e); } return cramRecord.alignmentStart; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy