All Downloads are FREE. Search and download functionalities are using the official Maven repository.

htsjdk.samtools.SAMTextHeaderCodec Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

import htsjdk.samtools.SAMFileHeader.SortOrder;
import htsjdk.samtools.SAMValidationError.Type;
import htsjdk.samtools.util.DateParser;
import htsjdk.samtools.util.LineReader;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.samtools.util.StringUtil;
import htsjdk.samtools.util.Log;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Parser for a SAM text header, and a generator of SAM text header.
 */
public class SAMTextHeaderCodec {
    /** Every SAM text header line begins with this prefix. */
    private static final String HEADER_LINE_START = "@";

    // These attributes are populated when parsing or generating
    private SAMFileHeader mFileHeader;
    private final TextTagCodec mTagCodec = new TextTagCodec();

    // These attributes are populated when parsing text
    /** The header line most recently read by advanceLine(), or null when no header line remains. */
    private String mCurrentLine;
    private LineReader mReader;
    /** Name of the input, used only in error messages; may be null. */
    private String mSource;
    /** Sequence records collected from @SQ lines during decode(). */
    private List<SAMSequenceRecord> sequences;
    /** Read group records collected from @RG lines during decode(). */
    private List<SAMReadGroupRecord> readGroups;

    // For error reporting when parsing
    private ValidationStringency validationStringency = ValidationStringency.SILENT;

    // These attributes are populated when generating text
    private Writer writer;

    private static final String TAG_KEY_VALUE_SEPARATOR = ":";
    private static final char TAG_KEY_VALUE_SEPARATOR_CHAR = ':';
    private static final String FIELD_SEPARATOR = "\t";
    private static final char FIELD_SEPARATOR_CHAR = '\t';
    private static final Pattern FIELD_SEPARATOR_RE = Pattern.compile(FIELD_SEPARATOR);

    /** Text that starts a comment header line: the line-start marker, the CO record type, then a tab. */
    public static final String COMMENT_PREFIX = HEADER_LINE_START + HeaderRecordType.CO.name() + FIELD_SEPARATOR;
    private static final Log log = Log.getInstance(SAMTextHeaderCodec.class);

    /**
     * Sets the writer used by the package-private encoding entry points
     * (encodeSequenceRecord and encodeHeaderLine), which refuse to run while it is null.
     * @param writer where header text will be written.
     */
    void setWriter(final Writer writer) {
        this.writer = writer;
    }

    /**
     * Sets the header whose attributes are read when the HD line is written.
     * @param header header object to encode from.
     */
    void setmFileHeader(final SAMFileHeader header) {
        this.mFileHeader = header;
    }

    /**
     * Parses a text SAM header into a freshly created SAMFileHeader.
     * Lines are consumed from the reader for as long as the next line starts with '@'.
     * Lines that fail to parse are skipped; how the failure is surfaced depends on the
     * configured validation stringency.
     * @param reader Where to get header text from.
     * @param source Name of the input file, for error messages.  May be null.
     * @return complete header object.
     */
    public SAMFileHeader decode(final LineReader reader, final String source) {
        // Reset all per-parse state.
        mFileHeader = new SAMFileHeader();
        mReader = reader;
        mSource = source;
        sequences = new ArrayList<>();
        readGroups = new ArrayList<>();

        for (String line = advanceLine(); line != null; line = advanceLine()) {
            final ParsedHeaderLine parsed = new ParsedHeaderLine(line);
            if (parsed.isLineValid()) {
                final HeaderRecordType recordType = parsed.getHeaderRecordType();
                switch (recordType) {
                    case HD:
                        parseHDLine(parsed);
                        break;
                    case SQ:
                        parseSQLine(parsed);
                        break;
                    case RG:
                        parseRGLine(parsed);
                        break;
                    case PG:
                        parsePGLine(parsed);
                        break;
                    case CO:
                        // Comment lines are stored verbatim, prefix included.
                        mFileHeader.addComment(line);
                        break;
                    default:
                        throw new IllegalStateException("Unrecognized header record type: " +
                                recordType);
                }
            }
        }

        // Sequences and read groups are accumulated while parsing and attached in one go.
        mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
        mFileHeader.setReadGroups(readGroups);

        SAMUtils.processValidationErrors(mFileHeader.getValidationErrors(), -1, validationStringency);
        return mFileHeader;
    }

    /**
     * Moves to the next header line, if there is one.
     * The next character is peeked first so that a non-header line is never consumed.
     * @return the line just read, or null if the next line does not start with '@'.
     */
    private String advanceLine() {
        if (mReader.peek() == '@') {
            mCurrentLine = mReader.readLine();
        } else {
            mCurrentLine = null;
        }
        return mCurrentLine;
    }

    /**
     * Copies every tag from the text representation into the in-memory record.
     * All values are stored as Strings, so standard and non-standard tags are treated alike.
     * The supplied map is only read; it is not modified.
     * @param record attributes get set into this object.
     * @param textAttributes Map of tag name to tag value.
     */
    private void transferAttributes(final AbstractSAMHeaderRecord record, final Map<String, String> textAttributes) {
        for (final Map.Entry<String, String> entry : textAttributes.entrySet()) {
            record.setAttribute(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Converts a parsed PG line into a SAMProgramRecord and adds it to the header.
     * A line without the program group ID tag is reported and otherwise ignored.
     * @param parsedHeaderLine a parsed line whose record type is PG.
     */
    private void parsePGLine(final ParsedHeaderLine parsedHeaderLine) {
        assert(HeaderRecordType.PG.equals(parsedHeaderLine.getHeaderRecordType()));
        if (parsedHeaderLine.requireTag(SAMProgramRecord.PROGRAM_GROUP_ID_TAG)) {
            // The ID is taken out of the map so it is not stored a second time as a plain attribute.
            final String programGroupId = parsedHeaderLine.removeValue(SAMProgramRecord.PROGRAM_GROUP_ID_TAG);
            final SAMProgramRecord program = new SAMProgramRecord(programGroupId);
            transferAttributes(program, parsedHeaderLine.mKeyValuePairs);
            mFileHeader.addProgramRecord(program);
        }
    }

    /**
     * Converts a parsed RG line into a SAMReadGroupRecord and queues it for the header.
     * A line without the read group ID tag is reported and otherwise ignored. A missing sample
     * tag, a non-numeric predicted median insert size and an unparseable run date are each
     * reported, but the read group is still kept unless the report itself throws.
     * @param parsedHeaderLine a parsed line whose record type is RG.
     */
    private void parseRGLine(final ParsedHeaderLine parsedHeaderLine) {
        assert(HeaderRecordType.RG.equals(parsedHeaderLine.getHeaderRecordType()));
        final boolean hasId = parsedHeaderLine.requireTag(SAMReadGroupRecord.READ_GROUP_ID_TAG);
        if (!hasId) {
            return;
        }
        // Called only for its side effect: a missing SM tag is reported (or throws, depending on
        // validation stringency), but parsing of this line continues either way.
        parsedHeaderLine.requireTag(SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG);

        final String readGroupId = parsedHeaderLine.removeValue(SAMReadGroupRecord.READ_GROUP_ID_TAG);
        final SAMReadGroupRecord group = new SAMReadGroupRecord(readGroupId);
        transferAttributes(group, parsedHeaderLine.mKeyValuePairs);

        // Check that the predicted median insert size, when present, is an integer.
        final String insertSizeText =
                (String)group.getAttribute(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG);
        if (insertSizeText != null) {
            try {
                Integer.parseInt(insertSizeText);
                group.setAttribute(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG, insertSizeText);
            } catch (NumberFormatException notANumber) {
                reportErrorParsingLine(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG +
                        " is not numeric: " + insertSizeText, SAMValidationError.Type.INVALID_PREDICTED_MEDIAN_INSERT_SIZE,
                        notANumber);
            }
        }

        // Re-store the run date as the string form of the decoded date; if it cannot be decoded,
        // the original text is kept.
        final String runDateText = (String)group.getAttribute(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG);
        if (runDateText != null) {
            Object runDate;
            try {
                runDate = mTagCodec.decodeDate(runDateText);
            } catch (DateParser.InvalidDateException badDate) {
                runDate = runDateText;
                reportErrorParsingLine(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG + " tag value '" +
                        runDateText + "' is not parseable as a date", SAMValidationError.Type.INVALID_DATE_STRING,
                        badDate);
            }
            group.setAttribute(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG, runDate.toString());
        }

        readGroups.add(group);
    }

    /**
     * Converts a parsed SQ line into a SAMSequenceRecord and queues it for the sequence dictionary.
     * A line missing the sequence name or sequence length tag is reported and otherwise ignored.
     * A sequence length that is not an integer is handled the same way: it is reported through
     * reportErrorParsingLine (so validation stringency decides whether that throws) and the line
     * is dropped, instead of letting a raw NumberFormatException escape the parser.
     * @param parsedHeaderLine a parsed line whose record type is SQ.
     */
    private void parseSQLine(final ParsedHeaderLine parsedHeaderLine) {
        assert(HeaderRecordType.SQ.equals(parsedHeaderLine.getHeaderRecordType()));
        if (!parsedHeaderLine.requireTag(SAMSequenceRecord.SEQUENCE_NAME_TAG) ||
                !parsedHeaderLine.requireTag(SAMSequenceRecord.SEQUENCE_LENGTH_TAG)) {
            return;
        }
        String sequenceName = parsedHeaderLine.removeValue(SAMSequenceRecord.SEQUENCE_NAME_TAG);
        sequenceName = SAMSequenceRecord.truncateSequenceName(sequenceName);

        final String sequenceLengthText = parsedHeaderLine.removeValue(SAMSequenceRecord.SEQUENCE_LENGTH_TAG);
        final int sequenceLength;
        try {
            sequenceLength = Integer.parseInt(sequenceLengthText);
        } catch (NumberFormatException e) {
            reportErrorParsingLine(SAMSequenceRecord.SEQUENCE_LENGTH_TAG + " is not numeric: " + sequenceLengthText,
                    SAMValidationError.Type.HEADER_TAG_NON_CONFORMING_VALUE, e);
            return;
        }

        final SAMSequenceRecord samSequenceRecord = new SAMSequenceRecord(sequenceName, sequenceLength);
        // Name and length were removed above, so only the remaining tags are copied as attributes.
        transferAttributes(samSequenceRecord, parsedHeaderLine.mKeyValuePairs);
        sequences.add(samSequenceRecord);
    }

    /**
     * Copies the tags of a parsed HD line onto the file header.
     * A line without the version tag is reported and otherwise ignored. Sort order and group order
     * values that do not name a known enum constant are reported, but the tags are still copied
     * unless the report itself throws.
     * @param parsedHeaderLine a parsed line whose record type is HD.
     */
    private void parseHDLine(final ParsedHeaderLine parsedHeaderLine) {
        assert(HeaderRecordType.HD.equals(parsedHeaderLine.getHeaderRecordType()));
        final boolean hasVersion = parsedHeaderLine.requireTag(SAMFileHeader.VERSION_TAG);
        if (!hasVersion) {
            return;
        }

        final String sortOrderText = parsedHeaderLine.getValue(SAMFileHeader.SORT_ORDER_TAG);
        if (sortOrderText != null) {
            try {
                SortOrder.valueOf(sortOrderText);
            } catch (IllegalArgumentException unknownSortOrder) {
                reportErrorParsingLine(HEADER_LINE_START + parsedHeaderLine.getHeaderRecordType() +
                                " line has non-conforming SO tag value: " + sortOrderText + ".",
                        SAMValidationError.Type.HEADER_TAG_NON_CONFORMING_VALUE, null);
            }
        }

        final String groupOrderText = parsedHeaderLine.getValue(SAMFileHeader.GROUP_ORDER_TAG);
        if (groupOrderText != null) {
            try {
                SAMFileHeader.GroupOrder.valueOf(groupOrderText);
            } catch (IllegalArgumentException unknownGroupOrder) {
                reportErrorParsingLine(HEADER_LINE_START + parsedHeaderLine.getHeaderRecordType() +
                                " line has non-conforming GO tag value: "+ groupOrderText + ".",
                        SAMValidationError.Type.HEADER_TAG_NON_CONFORMING_VALUE, null);
            }
        }

        transferAttributes(mFileHeader, parsedHeaderLine.mKeyValuePairs);
    }

    /**
     * Reports a problem with the current header line.
     * Under STRICT stringency a SAMFormatException is thrown; under any other stringency a
     * SAMValidationError is recorded on the header being built and parsing carries on.
     * @param reason description of the problem; the current line is appended to it.
     * @param type validation error category recorded when not throwing.
     * @param nestedException cause attached to the thrown exception; may be null.
     */
    private void reportErrorParsingLine(String reason, final SAMValidationError.Type type, final Throwable nestedException) {
        final String message = "Error parsing SAM header. " + reason + ". Line:\n" + mCurrentLine;

        if (validationStringency == ValidationStringency.STRICT) {
            final String fileMessage = (mSource == null) ? "" : "File " + mSource;
            throw new SAMFormatException(message + "; " + fileMessage +
                    "; Line number " + mReader.getLineNumber(), nestedException);
        }

        final SAMValidationError validationError =
                new SAMValidationError(type, message, null, mReader.getLineNumber());
        validationError.setSource(mSource);
        mFileHeader.addValidationError(validationError);
    }

    /**
     * Record types recognized at the start of a header line (the two letters following '@').
     * During decoding, HD, SQ, RG and PG lines are handed to their respective parse methods,
     * and CO lines are stored on the header as comments.
     */
    private enum HeaderRecordType {
        HD, SQ, RG, PG, CO
    }

    /**
     * Takes a header line as a String and converts it into a HeaderRecordType, and a map of key:value strings.
     * If the line does not contain a recognized HeaderRecordType, then the line is considered invalid, and will
     * not have any key:value pairs. Comment (CO) lines are valid but are never split into key:value pairs.
     * Tags are kept in the order in which they appeared on the line.
     */
    private class ParsedHeaderLine {
        private HeaderRecordType mHeaderRecordType;
        private final Map<String, String> mKeyValuePairs = new LinkedHashMap<>();
        private boolean lineValid = false;

        /**
         * Parses the given line. Problems are passed to reportErrorParsingLine, which either records
         * them or throws, depending on validation stringency. A malformed, clashing or wrongly sized
         * tag is skipped without invalidating the rest of the line.
         * @param line header line text, expected to start with '@'.
         */
        ParsedHeaderLine(final String line) {
            assert(line.startsWith(HEADER_LINE_START));

            // Tab-separate
            String[] fields = new String[1024];
            int numFields = StringUtil.split(line, fields, FIELD_SEPARATOR_CHAR);
            if (numFields == fields.length) {
                // The fixed-size buffer was filled, so there may be more fields than it can hold;
                // fall back to the regex split, which has no such limit.
                fields = FIELD_SEPARATOR_RE.split(line);
                numFields = fields.length;
            }

            // Parse the HeaderRecordType (the text after the leading '@' of the first field)
            try {
                mHeaderRecordType = HeaderRecordType.valueOf(fields[0].substring(1));
            } catch (IllegalArgumentException e) {
                reportErrorParsingLine("Unrecognized header record type", SAMValidationError.Type.UNRECOGNIZED_HEADER_TYPE, null);
                mHeaderRecordType = null;
                return;
            }

            // Do not parse key:value pairs for comment lines.
            if (mHeaderRecordType == HeaderRecordType.CO) {
                lineValid = true;
                return;
            }

            final String[] keyAndValue = new String[2];
            // Parse the key:value pairs
            for (int i = 1; i < numFields; ++i) {
                if (StringUtil.splitConcatenateExcessTokens(fields[i], keyAndValue, TAG_KEY_VALUE_SEPARATOR_CHAR) != 2) {
                    reportErrorParsingLine("Problem parsing " + HEADER_LINE_START + mHeaderRecordType +
                            " key:value pair", SAMValidationError.Type.POORLY_FORMATTED_HEADER_TAG, null);
                    continue;
                }
                // A repeated key is only an error when the repeated value differs from the first one.
                if (mKeyValuePairs.containsKey(keyAndValue[0]) &&
                        ! mKeyValuePairs.get(keyAndValue[0]).equals(keyAndValue[1])) {
                    reportErrorParsingLine("Problem parsing " + HEADER_LINE_START + mHeaderRecordType +
                            " key:value pair " + keyAndValue[0] + ":" + keyAndValue[1] +
                            " clashes with " + keyAndValue[0] + ":" + mKeyValuePairs.get(keyAndValue[0]),
                            SAMValidationError.Type.HEADER_TAG_MULTIPLY_DEFINED, null);
                    continue;
                }
                if (keyAndValue[0].length() != 2) {
                    reportErrorParsingLine("Problem parsing " + HEADER_LINE_START + mHeaderRecordType +
                            " key:value pair " + keyAndValue[0] + ": key is not two characters",
                            SAMValidationError.Type.HEADER_TAG_INVALID_KEY, null);
                    continue;
                }
                // May rewrite keyAndValue[1] in place before it is stored.
                validateSortOrderValue(keyAndValue);
                mKeyValuePairs.put(keyAndValue[0], keyAndValue[1]);
            }
            lineValid = true;
        }

        /**
         * If the pair is a sort-order tag whose value is not a known SortOrder, replaces the value
         * in place with 'unknown'. Under STRICT stringency an exception is thrown instead; under
         * LENIENT a warning is logged; otherwise the replacement is silent.
         * @param value two-element array holding the tag key at index 0 and the tag value at index 1.
         */
        private void validateSortOrderValue(String[] value) {
            if (SAMFileHeader.SORT_ORDER_TAG.equals(value[0])) {
                try {
                    SortOrder.valueOf(value[1]);
                } catch (IllegalArgumentException e) {
                    if (validationStringency == ValidationStringency.STRICT) {
                        // Keep the enum lookup failure as the cause.
                        throw new SAMFormatException("Found non-conforming header SO tag: "
                                                     + value[1]
                                                     + ", exiting because VALIDATION_STRINGENCY=STRICT", e);
                    } else if (validationStringency == ValidationStringency.LENIENT) {
                        log.warn("Found non-conforming header SO tag: "
                                 + value[1] + ". Treating as 'unknown'.");
                    }
                    value[1] = SortOrder.unknown.toString();
                }
            }
        }

        /**
         * True if the line is recognized as one of the valid HeaderRecordTypes.
         */
        public boolean isLineValid() {
            return lineValid;
        }

        /**
         * Handling depends on the validation stringency.  If the tag is not present, and stringency is strict,
         * an exception is thrown.  If stringency is not strict, the problem is recorded and false is returned.
         * @param tag Must be present for the line to be considered valid.
         * @return True if tag is present.
         */
        boolean requireTag(final String tag) {
            if (!mKeyValuePairs.containsKey(tag)) {
                reportErrorParsingLine(HEADER_LINE_START + mHeaderRecordType + " line missing " + tag + " tag",
                        SAMValidationError.Type.HEADER_RECORD_MISSING_REQUIRED_TAG, null);
                return false;
            }
            return true;
        }

        /**
         * @return null if line is invalid, otherwise the parsed HeaderRecordType
         */
        public HeaderRecordType getHeaderRecordType() {
            return mHeaderRecordType;
        }

        /**
         * @return true if the given tag was parsed from this line.
         */
        boolean containsKey(final String key) {
            return mKeyValuePairs.containsKey(key);
        }

        /**
         * @return the value parsed for the given tag, or null if the tag is absent.
         */
        String getValue(final String key) {
            return mKeyValuePairs.get(key);
        }

        /**
         * Removes the given tag from this line's key:value pairs.
         * @return the value the tag had, or null if the tag was absent.
         */
        String removeValue(final String key) {
            final String ret = mKeyValuePairs.get(key);
            mKeyValuePairs.remove(key);
            return ret;
        }

    }

    /**
     * Convert SAMFileHeader from in-memory representation to text representation. Always writes
     * SAMFileHeader.CURRENT_VERSION as the version in the header.
     * Equivalent to calling the three-argument overload with keepExistingVersionNumber set to false.
     * @param writer where to write the header text.
     * @param header object to be converted to text.
     */
    public void encode(final Writer writer, final SAMFileHeader header) {
        encode(writer, header, false);
    }

    /**
     * Writes the given header as SAM header text.
     * Lines are emitted in a fixed order: the HD line, one SQ line per sequence, one RG line per
     * read group, one PG line per program record, then the comments. Output goes through a
     * BufferedWriter wrapped around the supplied writer, which is flushed, but not closed, at the end.
     * @param writer where to write the header text.
     * @param header object to be converted to text.
     * @param keepExistingVersionNumber If true, writes whatever version # was in the header.  If false, writes
     *                                  SAMFileHeader.CURRENT_VERSION.
     */
    public void encode(final Writer writer, final SAMFileHeader header, final boolean keepExistingVersionNumber) {
        this.mFileHeader = header;
        this.writer = new BufferedWriter(writer);

        writeHDLine(keepExistingVersionNumber);

        for (final SAMSequenceRecord sequence : header.getSequenceDictionary().getSequences()) {
            writeSQLine(sequence);
        }
        for (final SAMReadGroupRecord group : header.getReadGroups()) {
            writeRGLine(group);
        }
        for (final SAMProgramRecord program : header.getProgramRecords()) {
            writePGLine(program);
        }
        // Comments are written as stored, one per line.
        for (final String commentLine : header.getComments()) {
            println(commentLine);
        }

        try {
            this.writer.flush();
        } catch (IOException flushFailure) {
            throw new RuntimeIOException(flushFailure);
        }
    }

    /**
     * Writes a single {@link SAMSequenceRecord} as an SQ line to the current writer.
     * Designed for use in {@link SAMSequenceDictionaryCodec}, so records can be written on the fly.
     * @param sequenceRecord the record to write.
     * @throws IllegalStateException if writer is null.
     */
    void encodeSequenceRecord(final SAMSequenceRecord sequenceRecord) {
        if (writer != null) {
            writeSQLine(sequenceRecord);
            return;
        }
        throw new IllegalStateException("writer couldn't be null");
    }

    /**
     * Writes the HD line to the current writer.
     * Designed for use in {@link SAMSequenceDictionaryCodec}, so the header can be written on the fly.
     * @param keepExistingVersionNumber passed through unchanged to the HD line writer.
     * @throws IllegalStateException if writer is null.
     */
    void encodeHeaderLine(final boolean keepExistingVersionNumber) {
        if (writer != null) {
            writeHDLine(keepExistingVersionNumber);
            return;
        }
        throw new IllegalStateException("writer couldn't be null");
    }

    /**
     * Appends the given text followed by a newline character to the writer.
     * @param s text of one line, without its line terminator.
     * @throws RuntimeIOException wrapping any IOException raised by the writer.
     */
    private void println(final String s) {
        try {
            writer.append(s).append("\n");
        } catch (IOException writeFailure) {
            throw new RuntimeIOException(writeFailure);
        }
    }

    /**
     * Writes the PG line for the given program record.
     * @param programRecord record to render and write.
     */
    private void writePGLine(final SAMProgramRecord programRecord) {
        final String line = getPGLine(programRecord);
        println(line);
    }

    /**
     * Renders a program record as PG line text, without a line terminator.
     * The record type comes first, then the program group ID, then the record's attributes.
     * @param programRecord record to render.
     * @return the tab-separated PG line.
     */
    protected String getPGLine(final SAMProgramRecord programRecord) {
        final int attributeCount = programRecord.getAttributes().size();
        final String[] columns = new String[2 + attributeCount];
        columns[0] = HEADER_LINE_START + HeaderRecordType.PG;
        columns[1] = SAMProgramRecord.PROGRAM_GROUP_ID_TAG
                + TAG_KEY_VALUE_SEPARATOR
                + programRecord.getProgramGroupId();
        encodeTags(programRecord, columns, 2);
        return StringUtil.join(FIELD_SEPARATOR, columns);
    }

    /**
     * Writes the RG line for the given read group.
     * @param readGroup record to render and write.
     */
    private void writeRGLine(final SAMReadGroupRecord readGroup) {
        final String line = getRGLine(readGroup);
        println(line);
    }

    /**
     * Renders a read group record as RG line text, without a line terminator.
     * The record type comes first, then the read group ID, then the record's attributes.
     * @param readGroup record to render.
     * @return the tab-separated RG line.
     */
    protected String getRGLine(final SAMReadGroupRecord readGroup) {
        final int attributeCount = readGroup.getAttributes().size();
        final String[] columns = new String[2 + attributeCount];
        columns[0] = HEADER_LINE_START + HeaderRecordType.RG;
        columns[1] = SAMReadGroupRecord.READ_GROUP_ID_TAG
                + TAG_KEY_VALUE_SEPARATOR
                + readGroup.getReadGroupId();
        encodeTags(readGroup, columns, 2);
        return StringUtil.join(FIELD_SEPARATOR, columns);
    }

    /**
     * Writes the HD line built from the attributes of the current file header.
     * @param keepExistingVersionNumber if true, the header's attributes are written as they are,
     *                                  including its version. If false, every attribute except the
     *                                  version is copied onto a fresh SAMFileHeader, and that copy
     *                                  is written instead.
     */
    private void writeHDLine(final boolean keepExistingVersionNumber) {
        final SAMFileHeader newHeader;
        if (keepExistingVersionNumber) {
            newHeader = mFileHeader;
        } else {
            // Make a copy of the header, excluding the version from the input header, so that
            // output get CURRENT_VERSION instead of whatever the version of the input header was.
            newHeader = new SAMFileHeader();

            for (final Map.Entry<String, String> entry : mFileHeader.getAttributes()) {
                if (!SAMFileHeader.VERSION_TAG.equals(entry.getKey())) {
                    newHeader.setAttribute(entry.getKey(), entry.getValue());
                }
            }
        }

        // One slot for the record type, plus one per attribute.
        final String[] fields = new String[1 + newHeader.getAttributes().size()];
        fields[0] = HEADER_LINE_START + HeaderRecordType.HD;
        encodeTags(newHeader, fields, 1);
        println(StringUtil.join(FIELD_SEPARATOR, fields));
    }

    /**
     * Writes the SQ line for the given sequence record.
     * @param sequenceRecord record to render and write.
     */
    private void writeSQLine(final SAMSequenceRecord sequenceRecord) {
        final String line = getSQLine(sequenceRecord);
        println(line);
    }

    /**
     * Renders a sequence record as SQ line text, without a line terminator.
     * The record type comes first, then the sequence name, then the sequence length, then the
     * record's attributes.
     * @param sequenceRecord record to render.
     * @return the tab-separated SQ line.
     */
    protected String getSQLine(final SAMSequenceRecord sequenceRecord) {
        // A null attribute collection is sized as if it were empty.
        final int attributeCount;
        if (sequenceRecord.getAttributes() == null) {
            attributeCount = 0;
        } else {
            attributeCount = sequenceRecord.getAttributes().size();
        }

        final String[] columns = new String[3 + attributeCount];
        columns[0] = HEADER_LINE_START + HeaderRecordType.SQ;
        columns[1] = SAMSequenceRecord.SEQUENCE_NAME_TAG
                + TAG_KEY_VALUE_SEPARATOR
                + sequenceRecord.getSequenceName();
        columns[2] = SAMSequenceRecord.SEQUENCE_LENGTH_TAG
                + TAG_KEY_VALUE_SEPARATOR
                + Integer.toString(sequenceRecord.getSequenceLength());
        encodeTags(sequenceRecord, columns, 3);
        return StringUtil.join(FIELD_SEPARATOR, columns);
    }

    /**
     * Encode all the attributes in the given object as text, one array slot per attribute,
     * in the order in which the record returns them.
     * @param rec object containing the attributes to encode.
     * @param fields where to put the text representation of the tags.  Must be big enough to hold all tags.
     * @param offset where to start putting text tag representations.
     */
    private void encodeTags(final AbstractSAMHeaderRecord rec, final String[] fields, int offset) {
        for (final Map.Entry<String, String> entry : rec.getAttributes()) {
            fields[offset++] = mTagCodec.encodeUntypedTag(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Sets how parsing problems are handled by this codec.
     * @param validationStringency the stringency to apply; must not be null.
     * @throws IllegalArgumentException if validationStringency is null.
     */
    public void setValidationStringency(final ValidationStringency validationStringency) {
        if (validationStringency != null) {
            this.validationStringency = validationStringency;
            return;
        }
        throw new IllegalArgumentException("null validationStringency not allowed");
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy