/*
 * The MIT License
 *
 * Copyright (c) 2016 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package picard.sam.markduplicates;

import htsjdk.samtools.util.Histogram;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

import java.util.*;

import static picard.sam.markduplicates.EstimateLibraryComplexity.PairedReadSequence;

/**
 * Hash-based algorithm for searching duplicates, used only in EstimateLibraryComplexity. It splits each
 * PairedReadSequence into (maximum error count + 1) interleaved parts and hashes each part. Reads that
 * share a hash become duplicate candidates; to confirm a duplicate, hashes are compared part by part and,
 * wherever they differ, the underlying bases are compared to count mismatches.
 *
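 * <p>Why (maximum error count + 1) parts suffice, by the pigeonhole principle: if two reads differ in at
 * most maxErrors bases, the mismatches can touch at most maxErrors parts, so at least one part is
 * identical in both reads and its hash matches exactly. The hash index therefore never misses a true
 * duplicate; candidates only need base-level verification.
 *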
 * @author [email protected], EPAM Systems, Inc.
 */
class ElcHashBasedDuplicatesFinder extends ElcDuplicatesFinder {

    // We split the bases of each read into interspersed parts, so that each part spans the entire length
    // of the read. For example, breaking a read of length 12 into 4 parts assigns bases to parts like this:
    // 1 2 3 4 1 2 3 4 1 2 3 4
    // A C A T T A C G G A T T
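    // Part 1 thus covers positions 0, 4, 8; part 2 covers 1, 5, 9; and so on. In the actual hashing the
    // first minIdenticalBases bases are skipped, so part i starts at position minIdenticalBases + (i - 1)
    // (see compareHashes below).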

    // number of parts we split each read into (one hash per part)
    // initialized to -1 because we want the biggest number of hashes in this group
    // calculated per read as ((minReadLength - minIdenticalBases) * maxDiffRate) + 1
    // where minReadLength = Math.min(Math.min(prs.read1.length, prs.read2.length), maxReadLength)
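    // worked example with illustrative values: minReadLength = 100, minIdenticalBases = 5,
    // maxDiffRate = 0.03 gives (int) ((100 - 5) * 0.03) + 1 = 2 + 1 = 3 hashes per read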
    private int numberOfHashesInGroup = -1;

    // minimal read length in the group; needed for splitting reads into parts,
    // see EstimateLibraryComplexity.PairedReadSequence.getHashes()
    // initialized to Integer.MAX_VALUE so that the first read in the group always lowers it
    private int minReadLenInGroup = Integer.MAX_VALUE;

    private final Map<Integer, List<PairedReadSequence>> readsByHashInGroup;

    ElcHashBasedDuplicatesFinder(double maxDiffRate, int maxReadLength, int minIdenticalBases,
                                 OpticalDuplicateFinder opticalDuplicateFinder) {
        super(maxDiffRate, maxReadLength, minIdenticalBases, opticalDuplicateFinder);
        readsByHashInGroup = new HashMap<>();
    }

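    /**
     * Finds duplicates within one group of reads: computes the per-group hash parameters, hashes every
     * read, indexes reads by hash value, and then verifies each candidate pair base by base. A read
     * already claimed as a duplicate is skipped, so every read lands in at most one duplicate set.
     */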
    @Override
    void searchDuplicates(List<PairedReadSequence> sequences, Histogram<Integer> duplicationHisto,
                          Histogram<Integer> opticalHisto) {

        initHashLength(sequences);
        fillHashValues(sequences);
        populateDupCandidates(sequences);

        // reads already claimed as duplicates, used to filter treated PairedReadSequences out of later iterations
        final Set<PairedReadSequence> dupSet = new HashSet<>();

        for (PairedReadSequence lhs : sequences) {
            if (dupSet.contains(lhs)) continue;
            final List<PairedReadSequence> dupes = new ArrayList<>();
            for (PairedReadSequence rhs : getSimilarReads(lhs)) {
                if (dupSet.contains(rhs)) continue;
                if (isDuplicate(lhs, rhs)) {
                    dupes.add(rhs);
                }
            }
            dupSet.addAll(dupes);
            fillHistogram(duplicationHisto, opticalHisto, lhs, dupes);
        }
    }

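    /**
     * Collects every read that shares at least one hash with the given pattern read. The pattern itself
     * may be included in the result; isDuplicate rejects self-comparison, so this is harmless.
     */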
    private Set<PairedReadSequence> getSimilarReads(final PairedReadSequence pattern) {
        final Set<PairedReadSequence> toCheck = new HashSet<>();
        for (int[] hashesForRead : new int[][]{pattern.hashes1, pattern.hashes2}) {
            for (int hash : hashesForRead) {
                List<PairedReadSequence> readsWithSameHash = readsByHashInGroup.get(hash);
                // if size == 1, the list contains only the pattern read itself
                if (readsWithSameHash.size() > 1) {
                    toCheck.addAll(readsWithSameHash);
                }
            }
        }
        return toCheck;
    }

    /**
     * Populate readsByHashInGroup so that each hash value maps to every PairedReadSequence that
     * contains it; reads sharing a hash value are duplicate candidates for each other.
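     * <p>An illustrative sketch of the resulting index (hash values and read names invented for the
     * example):
     * <pre>
     *   readsByHashInGroup = { 721411 -> [prsA, prsC], -88123 -> [prsB], ... }
     * </pre>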
     */
    private void populateDupCandidates(List<PairedReadSequence> seqs) {
        // maps each hash value to the PairedReadSequences that contain it
        readsByHashInGroup.clear();

        // iterate over all PairedReadSequences and group them by hash value
        for (PairedReadSequence prs : seqs) {
            int[][] readHashValues = {prs.hashes1, prs.hashes2};
            for (int[] readHashValue : readHashValues) {
                for (int key : readHashValue) {
                final List<PairedReadSequence> dupCandidates
                        = readsByHashInGroup.computeIfAbsent(key, k -> new ArrayList<>());
                    dupCandidates.add(prs);
                }
            }
        }
    }

    /**
     * Populate hashes for each PairedReadSequence.
     */
    private void fillHashValues(List<PairedReadSequence> sequences) {
        for (PairedReadSequence prs : sequences) {
            prs.initHashes(numberOfHashesInGroup, minIdenticalBases, minReadLenInGroup);
        }
    }

    /**
     * Calculate numberOfHashesInGroup and minReadLenInGroup over all reads in the group; together they
     * determine how every read in the group is split into hashed parts.
     */
    private void initHashLength(List<PairedReadSequence> sequences) {
        for (PairedReadSequence prs : sequences) {
            int minReadLength = Math.min(Math.min(prs.read1.length, prs.read2.length), maxReadLength);

            // track the maximum number of hashes in the group (reads of similar length yield similar hash counts)
            int numberOfHashes = (int) ((minReadLength - minIdenticalBases) * maxDiffRate) + 1;
            if (numberOfHashes > numberOfHashesInGroup) {
                numberOfHashesInGroup = numberOfHashes;
            }

            // track the minimum read length, used when calculating the hash values
            if (minReadLenInGroup > minReadLength) {
                minReadLenInGroup = minReadLength;
            }
        }
    }

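    /**
     * Two reads are duplicates when their combined base mismatches do not exceed
     * floor((read1Length + read2Length) * maxDiffRate). As a worked example with illustrative values,
     * read1Length = read2Length = 100 and maxDiffRate = 0.03 give maxErrors = floor(200 * 0.03) = 6.
     */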
    private boolean isDuplicate(final PairedReadSequence lhs, final PairedReadSequence rhs) {
        // a read is never a duplicate of itself
        if (lhs == rhs) {
            return false;
        }

        final int read1Length = minLength(lhs.read1, rhs.read1);
        final int read2Length = minLength(lhs.read2, rhs.read2);
        final int maxErrors = (int) Math.floor((read1Length + read2Length) * maxDiffRate);

        int errors = compareReadToRead(
                lhs.read1, lhs.hashes1,
                rhs.read1, rhs.hashes1,
                maxErrors
        );

        if (errors > maxErrors) {
            return false;
        }

        errors += compareReadToRead(
                lhs.read2, lhs.hashes2,
                rhs.read2, rhs.hashes2,
                maxErrors
        );

        return errors <= maxErrors;
    }

    /**
     * Compare two reads hash part by hash part; wherever the hashes differ, compare the bases behind that
     * part and count mismatches. Bases beyond minReadLenInGroup are not covered by any hash and are
     * compared directly in compareTails.
     */
    private int compareReadToRead(byte[] read1, int[] hashes1, byte[] read2, int[] hashes2, int maxErrors) {
        int errors = 0;
        final int minReadLength = minLength(read1, read2);

        for (int hashNumber = 0; hashNumber < numberOfHashesInGroup; ++hashNumber) {
            if (hashes1[hashNumber] != hashes2[hashNumber]) {
                errors += compareHashes(read1, read2, hashNumber);
                if (errors > maxErrors) {
                    return errors;
                }
            }
        }

        if (minReadLength > minReadLenInGroup) {
            errors += compareTails(read1, read2, minReadLenInGroup, minReadLength);
        }

        return errors;
    }

    /**
     * Compare the bases belonging to one hash part of the two reads.
     *
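     * <p>Positional sketch with assumed values minIdenticalBases = 5 and numberOfHashesInGroup = 4:
     * hashNumber 1 compares positions 6, 10, 14, ... up to (but not including) minReadLenInGroup.
     *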
     * @return the number of mismatched bases in this part of the read
     */
    private int compareHashes(byte[] read1, byte[] read2, int hashNumber) {
        int errors = 0;
        // start at the first base belonging to this hash part (the first minIdenticalBases bases are never hashed)
        int position = minIdenticalBases + hashNumber;
        while (position < minReadLenInGroup) {
            if (read1[position] != read2[position]) {
                errors++;
            }
            // step to the next base belonging to this hash part
            position += numberOfHashesInGroup;
        }
        return errors;
    }

    /**
     * Compare the tail bases, i.e. positions from start (inclusive) to stop (exclusive) that are beyond
     * minReadLenInGroup and therefore not covered by any hash.
     *
     * @return the number of mismatched bases in the tail of the read
     */
    private int compareTails(byte[] read1, byte[] read2, int start, int stop) {
        int errors = 0;
        for (int i = start; i < stop; ++i) {
            if (read1[i] != read2[i]) {
                errors++;
            }
        }
        return errors;
    }
}