/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates;
import htsjdk.samtools.util.Histogram;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;
import java.util.*;
import static picard.sam.markduplicates.EstimateLibraryComplexity.PairedReadSequence;
/**
* Hash-based algorithm for finding duplicates, used by EstimateLibraryComplexity only. It splits each
* PairedReadSequence into (maximum error number + 1) interleaved parts and hashes each part. To find
* duplicates we compare the hashes; whenever two hashes differ, the bases covered by that hash are
* compared and the mismatches counted.
*
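* A minimal sketch of the intended use, assuming caller-side names similar to those in
* EstimateLibraryComplexity (the exact variable names are illustrative only):
* <pre>{@code
* // one group of PairedReadSequences that share the same read-start bases
* final ElcDuplicatesFinder finder = new ElcHashBasedDuplicatesFinder(
*         maxDiffRate, maxReadLength, minIdenticalBases, opticalDuplicateFinder);
* finder.searchDuplicates(groupOfReads, duplicationHisto, opticalHisto);
* }</pre>
*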
* @author [email protected], EPAM Systems, Inc.
*/
class ElcHashBasedDuplicatesFinder extends ElcDuplicatesFinder {
// We split the bases of each read into interleaved parts, so that each part spans the entire length of the read.
// E.g. breaking a read of length 12 into 4 parts assigns the bases to parts as follows:
// 1 2 3 4 1 2 3 4 1 2 3 4
// A C A T T A C G G A T T
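// Part 1 thus consists of bases A, T, G (positions 0, 4, 8), part 2 of C, A, A, part 3 of A, C, T and
// part 4 of T, G, T; each part is hashed independently, so reads that differ in a single base can
// disagree in at most one hash.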
// Number of parts we split each read into (equal to the number of hashes per read).
// Initialized to -1 so that the biggest number of hashes found in this group wins.
// Calculated as ((minReadLength - minIdenticalBases) * maxDiffRate) + 1,
// where minReadLength = Math.min(Math.min(prs.read1.length, prs.read2.length), maxReadLength).
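// For example, with maxDiffRate = 0.03, minIdenticalBases = 5 and minReadLength = 105 (illustrative
// values only) this gives (int) ((105 - 5) * 0.03) + 1 = 4 hashes per read.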
private int numberOfHashesInGroup = -1;
// Minimal read length in the group; this value is needed to split the reads into parts,
// see EstimateLibraryComplexity.PairedReadSequence.getHashes().
// Initialized to Integer.MAX_VALUE so that any real read length replaces it.
private int minReadLenInGroup = Integer.MAX_VALUE;
private final Map<Integer, List<PairedReadSequence>> readsByHashInGroup;
ElcHashBasedDuplicatesFinder(double maxDiffRate, int maxReadLength, int minIdenticalBases,
OpticalDuplicateFinder opticalDuplicateFinder) {
super(maxDiffRate, maxReadLength, minIdenticalBases, opticalDuplicateFinder);
readsByHashInGroup = new HashMap<>();
}
@Override
void searchDuplicates(List<PairedReadSequence> sequences, Histogram<Integer> duplicationHisto,
Histogram<Integer> opticalHisto) {
initHashLength(sequences);
fillHashValues(sequences);
populateDupCandidates(sequences);
// Set of already-handled duplicates, used to filter treated PairedReadSequences out
final Set<PairedReadSequence> dupSet = new HashSet<>();
for (PairedReadSequence lhs : sequences) {
if (dupSet.contains(lhs)) continue;
final List<PairedReadSequence> dupes = new ArrayList<>();
for (PairedReadSequence rhs : getSimilarReads(lhs)) {
if (dupSet.contains(rhs)) continue;
if (isDuplicate(lhs, rhs)) {
dupes.add(rhs);
}
}
dupSet.addAll(dupes);
fillHistogram(duplicationHisto, opticalHisto, lhs, dupes);
}
}
/**
* Return all PairedReadSequences in the group that share at least one hash value with the given
* pattern read (the pattern itself is included and filtered out later in isDuplicate).
*/
private Set<PairedReadSequence> getSimilarReads(final PairedReadSequence pattern) {
final Set<PairedReadSequence> toCheck = new HashSet<>();
for (int[] hashesForRead: new int[][]{pattern.hashes1, pattern.hashes2}) {
for (int hash : hashesForRead) {
List<PairedReadSequence> readsWithSameHash = readsByHashInGroup.get(hash);
// if size == 1, this list contains only the pattern read itself
if (readsWithSameHash.size() > 1) {
toCheck.addAll(readsWithSameHash);
}
}
}
return toCheck;
}
/**
* Populate readsByHashInGroup with duplicate candidates: every hash value is mapped to all
* PairedReadSequences that produced it, so reads sharing a hash become candidates for comparison.
*/
private void populateDupCandidates(List<PairedReadSequence> seqs) {
// Key: hash value; value: all PairedReadSequences that produced this hash
readsByHashInGroup.clear();
// Iterate over all PairedReadSequences and group them by the hash values they produce
for (PairedReadSequence prs : seqs) {
for (PairedReadSequence prs : seqs) {
int[][] readHashValues = {prs.hashes1, prs.hashes2};
for (int[] readHashValue : readHashValues) {
for (int key : readHashValue) {
final List<PairedReadSequence> dupCandidates
= readsByHashInGroup.computeIfAbsent(key, k -> new ArrayList<>());
dupCandidates.add(prs);
}
}
}
}
/**
* Populate hashes for each PairedReadSequence.
*/
private void fillHashValues(List<PairedReadSequence> sequences) {
for (PairedReadSequence prs : sequences) {
prs.initHashes(numberOfHashesInGroup, minIdenticalBases, minReadLenInGroup);
}
}
/**
* Calculate numberOfHashesInGroup and minReadLenInGroup over all reads in the group.
*/
private void initHashLength(List<PairedReadSequence> sequences) {
for (PairedReadSequence prs : sequences) {
int minReadLength = Math.min(Math.min(prs.read1.length, prs.read2.length), maxReadLength);
// Find the maximum number of hashes (if read lengths are similar, the numbers of hashes are similar too)
int numberOfHashes = (int) ((minReadLength - minIdenticalBases) * maxDiffRate) + 1;
if (numberOfHashes > numberOfHashesInGroup) {
numberOfHashesInGroup = numberOfHashes;
}
// Find the minimum read length, used when calculating the hash values
if (minReadLenInGroup > minReadLength) {
minReadLenInGroup = minReadLength;
}
}
}
private boolean isDuplicate(final PairedReadSequence lhs, final PairedReadSequence rhs) {
// a read is never a duplicate of itself
if (lhs == rhs) {
return false;
}
final int read1Length = minLength(lhs.read1, rhs.read1);
final int read2Length = minLength(lhs.read2, rhs.read2);
final int maxErrors = (int) Math.floor((read1Length + read2Length) * maxDiffRate);
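// e.g. if both compared read lengths are 100 and maxDiffRate = 0.03 (illustrative values),
// at most floor((100 + 100) * 0.03) = 6 mismatches are tolerated across the pair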
int errors = compareReadToRead(
lhs.read1, lhs.hashes1,
rhs.read1, rhs.hashes1,
maxErrors
);
if (errors > maxErrors) {
return false;
}
errors += compareReadToRead(
lhs.read2, lhs.hashes2,
rhs.read2, rhs.hashes2,
maxErrors
);
return errors <= maxErrors;
}
/**
* Compare the hashes of two reads; whenever a pair of hashes differs, compare the bases covered
* by that hash and count mismatches. Also compares the tails not covered by the hashes.
*
* @return the number of mismatches found (may return early once maxErrors is exceeded)
*/
private int compareReadToRead(byte[] read1, int[] hashes1, byte[] read2, int[] hashes2, int maxErrors) {
int errors = 0;
final int minReadLength = minLength(read1, read2);
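// Parts whose hashes match are counted as zero errors; only differing hashes force a base-by-base
// comparison (a hash collision could in principle hide mismatches, which appears to be the accepted
// trade-off of the hash-based search).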
for (int hashNumber = 0; hashNumber < numberOfHashesInGroup; ++hashNumber) {
if (hashes1[hashNumber] != hashes2[hashNumber]) {
errors += compareHashes(read1, read2, hashNumber);
if (errors > maxErrors) {
return errors;
}
}
}
if (minReadLength > minReadLenInGroup) {
errors += compareTails(read1, read2, minReadLenInGroup, minReadLength);
}
return errors;
}
/**
* Compare the bases that belong to the part of the read covered by the given hash.
*
* @return the number of mismatches in this part of the read
*/
private int compareHashes(byte[] read1, byte[] read2, int hashNumber) {
int errors = 0;
// start at the first base of the part covered by this hash, skipping the first minIdenticalBases bases
int position = minIdenticalBases + hashNumber;
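// e.g. with minIdenticalBases = 5, numberOfHashesInGroup = 4 and hashNumber = 1 (illustrative values)
// the positions inspected are 6, 10, 14, ... up to minReadLenInGroup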
while (position < minReadLenInGroup) {
if (read1[position] != read2[position]) {
errors++;
}
// shift position to next base in the hash
position += numberOfHashesInGroup;
}
return errors;
}
/**
* Compare the tails of the reads, i.e. the bases beyond minReadLenInGroup that are not covered by the hashes.
*
* @return the number of mismatches in the tails
*/
private int compareTails(byte[] read1, byte[] read2, int start, int stop) {
int errors = 0;
for (int i = start; i < stop; ++i) {
if (read1[i] != read2[i]) {
errors++;
}
}
return errors;
}
}