picard.illumina.ClusterDataToSamConverter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of picard Show documentation
picard
There is a newer version: 3.2.0
/*
 * The MIT License
 *
 * Copyright (c) 2011 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package picard.illumina;

import htsjdk.samtools.ReservedTagConstants;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.filter.SolexaNoiseFilter;
import picard.PicardException;
import picard.fastq.IlluminaReadNameEncoder;
import picard.fastq.ReadNameEncoder;
import picard.illumina.parser.ClusterData;
import picard.illumina.parser.ReadData;
import picard.illumina.parser.ReadStructure;
import picard.util.AdapterMarker;
import picard.util.AdapterPair;
import picard.util.IlluminaUtil;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Takes ClusterData provided by an IlluminaDataProvider into one or two SAMRecords,
 * as appropriate, and optionally marking adapter sequence.  There is one converter per
 * IlluminaBasecallsToSam run, and all the TileProcessors use the same converter.
 * 
 * @author [email protected]
 */
public class ClusterDataToSamConverter implements
        IlluminaBasecallsConverter.ClusterDataConverter {


    private final String readGroupId;
    private final SamRecordFilter filters = new SolexaNoiseFilter();
    private final boolean isPairedEnd;
    private final boolean hasSampleBarcode;
    private final boolean hasMolecularBarcode;
    private final int [] templateIndices;
    private final int [] sampleBarcodeIndices;
    private final int [] molecularBarcodeIndices;

    private final AdapterMarker adapterMarker;
    private final int outputRecordsPerCluster;
    private final ReadNameEncoder readNameEncoder;

    // TODO: add RX and QX to the list of SAMTags and change this. initial discussion
    // TODO: here:
    // TODO: - https://github.com/broadinstitute/picard/issues/287
    // TODO: - HTS-spec issue: https://github.com/samtools/hts-specs/issues/109
    // TODO: - https://github.com/samtools/hts-specs/pull/119
    private String molecularIndexTag             = "RX";
    private String molecularIndexQualityTag      = "QX";
    private final String molecularIndexDelimiter = "-";
    private List tagPerMolecularIndex    = Collections.emptyList();

    /**
     * Constructor
     *
     * @param runBarcode        Used to construct read names.
     * @param readGroupId       If non-null, set RG attribute on SAMRecord to this.
     * @param readStructure     The expected structure (number of reads and indexes,
     *                          and their length) in the read.
     * @param adapters          The list of adapters to check for in the read
     */
    public ClusterDataToSamConverter(final String runBarcode,
                                     final String readGroupId,
                                     final ReadStructure readStructure,
                                     final List adapters) {
        this.readGroupId = readGroupId;
        
        this.readNameEncoder = new IlluminaReadNameEncoder(runBarcode);

        this.isPairedEnd = readStructure.templates.length() == 2;
        this.hasSampleBarcode = !readStructure.sampleBarcodes.isEmpty();
        this.hasMolecularBarcode = !readStructure.molecularBarcode.isEmpty();

        if (adapters.isEmpty()) {
            this.adapterMarker = null;
        } else {
            this.adapterMarker = new AdapterMarker(adapters.toArray(new AdapterPair[adapters.size()]));
        }

        this.templateIndices = readStructure.templates.getIndices();
        this.sampleBarcodeIndices = readStructure.sampleBarcodes.getIndices();
        this.molecularBarcodeIndices = readStructure.molecularBarcode.getIndices();

        this.outputRecordsPerCluster = readStructure.templates.length();
    }

    /**
     *  Sets the SAM tag to use to store the molecular index bases.  If multiple molecular indexes exist, it will concatenate them
     * and store them in this tag.
     */
    public ClusterDataToSamConverter withMolecularIndexTag(final String molecularIndexTag) {
        if (molecularIndexTag == null) throw new IllegalArgumentException("Molecular index tag was null");
        this.molecularIndexTag = molecularIndexTag;
        return this;
    }

    /**
     * Sets the SAM tag to use to store the molecular index base qualities.  If multiple molecular indexes exist, it will concatenate them
     * and store them in this tag.
     */
    public ClusterDataToSamConverter withMolecularIndexQualityTag(final String molecularIndexQualityTag) {
        if (molecularIndexQualityTag == null) throw new IllegalArgumentException("Molecular index quality tag was null");
        this.molecularIndexQualityTag = molecularIndexQualityTag;
        return this;
    }

    /**
     * Sets the SAM tags to use to store the bases each molecular index.  This will only be used if there are more than one molecular
     * index. If fewer tags are given than molecular indexes found, then the remaining molecular indexes will be concatenated and stored
     * in the last tag.  If more tags are provided than molecular indexes found, the additional tags will not be used.
     */
    public ClusterDataToSamConverter withTagPerMolecularIndex(final List tagPerMolecularIndex) {
        if (tagPerMolecularIndex == null) throw new IllegalArgumentException("Null given for tagPerMolecularIndex");
        this.tagPerMolecularIndex = tagPerMolecularIndex;
        return this;
    }

    /**
     * Creates a new SAM record from the basecall data
     */
    private SAMRecord createSamRecord(final ReadData readData, final String readName, final boolean isPf, final boolean firstOfPair,
                                      final String unmatchedBarcode,
                                      final List molecularIndexes, final List molecularIndexQualities) {
        final SAMRecord sam = new SAMRecord(null);
        sam.setReadName(readName);
        sam.setReadBases(readData.getBases());
        sam.setBaseQualities(readData.getQualities());

        // Flag values
        sam.setReadPairedFlag(isPairedEnd);
        sam.setReadUnmappedFlag(true);
        sam.setReadFailsVendorQualityCheckFlag(!isPf);
        if (isPairedEnd) {
            sam.setMateUnmappedFlag(true);
            sam.setFirstOfPairFlag(firstOfPair);
            sam.setSecondOfPairFlag(!firstOfPair);
        }

        if (filters.filterOut(sam)) {
            sam.setAttribute(ReservedTagConstants.XN, 1);
        }

        if (this.readGroupId != null) {
            sam.setAttribute(SAMTag.RG.name(), readGroupId);
        }

        // If it's a barcoded run and the read isn't assigned to a barcode, then add the barcode
        // that was read as an optional tag
        if (unmatchedBarcode != null) {
            sam.setAttribute(SAMTag.BC.name(), unmatchedBarcode);
        }

        if (!molecularIndexes.isEmpty()) {
            if (!this.molecularIndexTag.isEmpty()) {
                sam.setAttribute(this.molecularIndexTag, String.join(molecularIndexDelimiter, molecularIndexes));
            }
            if (!this.molecularIndexQualityTag.isEmpty()) {
                sam.setAttribute(this.molecularIndexQualityTag, String.join(molecularIndexDelimiter, molecularIndexQualities));
            }
            if (!this.tagPerMolecularIndex.isEmpty()) {
                if (tagPerMolecularIndex.size() != molecularIndexes.size()) {
                    throw new PicardException("Found " + molecularIndexes.size() + " molecular indexes but only " + tagPerMolecularIndex.size() + " SAM tags given.");
                }
                for (int i = 0; i < this.tagPerMolecularIndex.size(); i++) {
                    sam.setAttribute(this.tagPerMolecularIndex.get(i), molecularIndexes.get(i));
                }
            }
        }

        return sam;
    }

    /**
     * Creates the SAMRecord for each read in the cluster
     */
    public IlluminaBasecallsToSam.SAMRecordsForCluster convertClusterToOutputRecord(final ClusterData cluster) {

        final IlluminaBasecallsToSam.SAMRecordsForCluster ret = new IlluminaBasecallsToSam.SAMRecordsForCluster(outputRecordsPerCluster);
        final String readName = readNameEncoder.generateReadName(cluster, null); // Use null here to prevent /1 or /2 suffixes on read name.

        // Get and transform the unmatched barcode, if any, to store with the reads
        String unmatchedBarcode = null;
        if (hasSampleBarcode && cluster.getMatchedBarcode() == null) {
            final byte[][] barcode = new byte[sampleBarcodeIndices.length][];
            for (int i = 0; i < sampleBarcodeIndices.length; i++) {
                barcode[i] = cluster.getRead(sampleBarcodeIndices[i]).getBases();
            }
            unmatchedBarcode = IlluminaUtil.barcodeSeqsToString(barcode).replace('.', 'N'); //TODO: This has a separator, where as in other places we do not use a separator
        }

        final List molecularIndexes;
        final List molecularIndexQualities;
        if (hasMolecularBarcode) {
            molecularIndexes        = new ArrayList<>();
            molecularIndexQualities = new ArrayList<>();
            for (int i = 0; i < molecularBarcodeIndices.length; i++) {
                molecularIndexes.add(new String(cluster.getRead(molecularBarcodeIndices[i]).getBases()).replace('.', 'N'));
                molecularIndexQualities.add(SAMUtils.phredToFastq(cluster.getRead(molecularBarcodeIndices[i]).getQualities()));
            }
        } else {
            molecularIndexes        = Collections.emptyList();
            molecularIndexQualities = Collections.emptyList();
        }

        final SAMRecord firstOfPair = createSamRecord(
            cluster.getRead(templateIndices[0]), readName, cluster.isPf(), true, unmatchedBarcode, molecularIndexes, molecularIndexQualities);
        ret.records[0] = firstOfPair;

        SAMRecord secondOfPair = null;

        if(isPairedEnd) {
            secondOfPair  = createSamRecord(
                cluster.getRead(templateIndices[1]), readName, cluster.isPf(), false, unmatchedBarcode, molecularIndexes, molecularIndexQualities);
            ret.records[1] = secondOfPair;
        }

        if (adapterMarker != null) {
            // Clip the read
            if (isPairedEnd) {
                adapterMarker.adapterTrimIlluminaPairedReads(firstOfPair, secondOfPair);
            }
            else {
                adapterMarker.adapterTrimIlluminaSingleRead(firstOfPair);
            }
        }
        return ret;
    }
}