Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
package org.broadinstitute.hellbender.utils;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import java.util.*;
import java.util.stream.Collectors;
/**
*
* A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference,
* from BAMs, or from feature sources -- for consistency. The system supports two basic modes: get an enum state that
* describes at a high level the consistency between two dictionaries, or a validateDictionaries that will
* blow up with a UserException if the dicts are too incompatible.
*
* Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length,
* if available.
*/
public final class SequenceDictionaryUtils {
private SequenceDictionaryUtils(){}
/**
* Compares sequence records by their order
*/
private static final Comparator SEQUENCE_INDEX_ORDER = Comparator.comparing(SAMSequenceRecord::getSequenceIndex);
// The following sets of contig records are used to perform the non-canonical human ordering check.
// This check ensures that the order is 1,2,3... instead of 1, 10, 11, 12...2, 20, 21...
// hg18
protected static final SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719);
protected static final SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149);
protected static final SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737);
// hg19
protected static final SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621);
protected static final SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373);
protected static final SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747);
// b36
protected static final SAMSequenceRecord CHR1_B36 = new SAMSequenceRecord("1", 247249719);
protected static final SAMSequenceRecord CHR2_B36 = new SAMSequenceRecord("2", 242951149);
protected static final SAMSequenceRecord CHR10_B36 = new SAMSequenceRecord("10", 135374737);
// b37
protected static final SAMSequenceRecord CHR1_B37 = new SAMSequenceRecord("1", 249250621);
protected static final SAMSequenceRecord CHR2_B37 = new SAMSequenceRecord("2", 243199373);
protected static final SAMSequenceRecord CHR10_B37 = new SAMSequenceRecord("10", 135534747);
public enum SequenceDictionaryCompatibility {
IDENTICAL, // the dictionaries are identical
COMMON_SUBSET, // there exists a common subset of equivalent contigs
SUPERSET, // the first dict's set of contigs supersets the second dict's set
NO_COMMON_CONTIGS, // no overlap between dictionaries
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for example)
OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different
// orders with respect to each other
DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same
// order with respect to each other, but one or more of them have different
// indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 }
}
/**
* Tests for compatibility between two sequence dictionaries, using standard validation settings appropriate
* for the GATK. If the dictionaries are incompatible, then UserExceptions are thrown with detailed error messages.
*
* The standard validation settings used by this method are:
*
* -Require the dictionaries to share a common subset of equivalent contigs
*
* -Do not require dict1 to be a superset of dict2.
*
* -Do not perform checks related to contig ordering: don't throw if the common contigs are in
* different orders with respect to each other, occur at different absolute indices, or are
* lexicographically sorted human dictionaries. GATK uses contig names rather than contig
* indices, and so should not be sensitive to contig ordering issues.
*
* For comparing a CRAM dictionary against a reference dictionary, call
* {@link #validateCRAMDictionaryAgainstReference(SAMSequenceDictionary, SAMSequenceDictionary)} instead.
*
* @param name1 name associated with dict1
* @param dict1 the sequence dictionary dict1
* @param name2 name associated with dict2
* @param dict2 the sequence dictionary dict2
*/
public static void validateDictionaries( final String name1,
final SAMSequenceDictionary dict1,
final String name2,
final SAMSequenceDictionary dict2) {
final boolean requireSuperset = false;
final boolean checkContigOrdering = false;
validateDictionaries(name1, dict1, name2, dict2, requireSuperset, checkContigOrdering);
}
/**
* Tests for compatibility between a reference dictionary and a CRAM dictionary, using appropriate
* validation settings. If the dictionaries are incompatible, then UserExceptions are thrown with
* detailed error messages.
*
* The standard validation settings used by this method are:
*
* -Require the reference dictionary to be a superset of the cram dictionary
*
* -Do not perform checks related to contig ordering: don't throw if the common contigs are in
* different orders with respect to each other, occur at different absolute indices, or are
* lexicographically sorted human dictionaries. GATK uses contig names rather than contig
* indices, and so should not be sensitive to contig ordering issues.
*
* @param referenceDictionary the sequence dictionary for the reference
* @param cramDictionary sequence dictionary from a CRAM file
*/
public static void validateCRAMDictionaryAgainstReference( final SAMSequenceDictionary referenceDictionary,
final SAMSequenceDictionary cramDictionary ) {
// For CRAM, we require the reference dictionary to be a superset of the reads dictionary
final boolean requireSuperset = true;
final boolean checkContigOrdering = false;
validateDictionaries("reference", referenceDictionary, "reads", cramDictionary, requireSuperset, checkContigOrdering);
}
/**
* Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then
* UserExceptions are thrown with detailed error messages.
*
* Two sequence dictionaries are compatible if they share a common subset of equivalent contigs,
* where equivalent contigs are defined as having the same name and length.
*
* @param name1 name associated with dict1
* @param dict1 the sequence dictionary dict1
* @param name2 name associated with dict2
* @param dict2 the sequence dictionary dict2
* @param requireSuperset if true, require that dict1 be a superset of dict2, rather than dict1 and dict2 sharing a common subset
* @param checkContigOrdering if true, require common contigs to be in the same relative order with respect to each other
* and occur at the same absolute indices, and forbid lexicographically-sorted human dictionaries
*/
public static void validateDictionaries( final String name1,
final SAMSequenceDictionary dict1,
final String name2,
final SAMSequenceDictionary dict2,
final boolean requireSuperset,
final boolean checkContigOrdering ) {
Utils.nonNull(dict1, "Something went wrong with sequence dictionary detection, check that "+name1+" has a valid sequence dictionary");
Utils.nonNull(dict2, "Something went wrong with sequence dictionary detection, check that "+name2+" has a valid sequence dictionary");
final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2, checkContigOrdering);
switch ( type ) {
case IDENTICAL:
return;
case COMMON_SUBSET:
if ( requireSuperset ) {
final Set contigs1 = dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toSet());
final List missingContigs = dict2.getSequences().stream()
.map(SAMSequenceRecord::getSequenceName)
.filter(contig -> !contigs1.contains(contig))
.collect(Collectors.toList());
throw new UserException.IncompatibleSequenceDictionaries(String.format("Dictionary %s is missing contigs found in dictionary %s. Missing contigs: \n %s \n", name1, name2, String.join(", ", missingContigs)), name1, dict1, name2, dict2);
}
return;
case SUPERSET:
return;
case NO_COMMON_CONTIGS:
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);
case UNEQUAL_COMMON_CONTIGS: {
List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
SAMSequenceRecord elt1 = x.get(0);
SAMSequenceRecord elt2 = x.get(1);
UserException ex = new UserException.IncompatibleSequenceDictionaries(String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d",
name1, elt1.getSequenceName(), elt1.getSequenceLength(),
name2, elt2.getSequenceName(), elt2.getSequenceLength()),
name1, dict1, name2, dict2);
throw ex;
}
case NON_CANONICAL_HUMAN_ORDER: {
// We only get NON_CANONICAL_HUMAN_ORDER if the caller explicitly requested that we check contig ordering,
// so we should always throw when we see it.
UserException ex;
if ( nonCanonicalHumanContigOrder(dict1) ) {
ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1);
}
else {
ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2);
}
throw ex;
}
case OUT_OF_ORDER: {
// We only get OUT_OF_ORDER if the caller explicitly requested that we check contig ordering,
// so we should always throw when we see it.
UserException ex = new UserException.IncompatibleSequenceDictionaries(
"The relative ordering of the common contigs in " + name1 + " and " + name2 +
" is not the same; to fix this please see: "
+ "(https://www.broadinstitute.org/gatk/guide/article?id=1328), "
+ " which describes reordering contigs in BAM and VCF files.",
name1, dict1, name2, dict2);
throw ex;
}
case DIFFERENT_INDICES: {
// We only get DIFFERENT_INDICES if the caller explicitly requested that we check contig ordering,
// so we should always throw when we see it.
final String msg = "One or more contigs common to both dictionaries have " +
"different indices (ie., absolute positions) in each dictionary. Code " +
"that is sensitive to contig ordering can fail when this is the case. " +
"You should fix the sequence dictionaries so that all shared contigs " +
"occur at the same absolute positions in both dictionaries.";
final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2);
throw ex;
}
default:
throw new GATKException("Unexpected SequenceDictionaryComparison type: " + type);
}
}
/**
* Workhorse routine that takes two dictionaries and returns their compatibility.
*
* @param dict1 first sequence dictionary
* @param dict2 second sequence dictionary
* @param checkContigOrdering if true, perform checks related to contig ordering: forbid lexicographically-sorted
* dictionaries, and require common contigs to be in the same relative order and at the
* same absolute indices
* @return A SequenceDictionaryCompatibility enum value describing the compatibility of the two dictionaries
*/
public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2, final boolean checkContigOrdering ) {
if ( checkContigOrdering && (nonCanonicalHumanContigOrder(dict1) || nonCanonicalHumanContigOrder(dict2)) ) {
return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER;
}
final Set commonContigs = getCommonContigsByName(dict1, dict2);
if (commonContigs.isEmpty()) {
return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS;
}
else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) {
return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS;
}
final boolean commonContigsAreInSameRelativeOrder = commonContigsAreInSameRelativeOrder(commonContigs, dict1, dict2);
if ( checkContigOrdering && ! commonContigsAreInSameRelativeOrder ) {
return SequenceDictionaryCompatibility.OUT_OF_ORDER;
}
else if ( commonContigsAreInSameRelativeOrder && commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() ) {
return SequenceDictionaryCompatibility.IDENTICAL;
}
else if ( checkContigOrdering && ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) ) {
return SequenceDictionaryCompatibility.DIFFERENT_INDICES;
}
else if ( supersets(dict1, dict2) ) {
return SequenceDictionaryCompatibility.SUPERSET;
}
else {
return SequenceDictionaryCompatibility.COMMON_SUBSET;
}
}
/**
* Utility function that tests whether dict1's set of contigs is a superset of dict2's
*
* @param dict1 first sequence dictionary
* @param dict2 second sequence dictionary
* @return true if dict1's set of contigs supersets dict2's
*/
private static boolean supersets( SAMSequenceDictionary dict1, SAMSequenceDictionary dict2 ) {
// Cannot rely on SAMSequenceRecord.equals() as it's too strict (takes extended attributes into account).
for ( final SAMSequenceRecord dict2Record : dict2.getSequences() ) {
final SAMSequenceRecord dict1Record = dict1.getSequence(dict2Record.getSequenceName());
if ( dict1Record == null || ! sequenceRecordsAreEquivalent(dict2Record, dict1Record) ) {
return false;
}
}
return true;
}
/**
* Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalence means
* that the seq records have the same length, if both are non-zero.
*
* @param commonContigs
* @param dict1
* @param dict2
* @return true if all of the common contigs are equivalent
*/
private static boolean commonContigsHaveSameLengths(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
return findDisequalCommonContigs(commonContigs, dict1, dict2) == null;
}
/**
* Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns
* null if all common contigs are equivalent
*
* @param commonContigs
* @param dict1
* @param dict2
* @return
*/
private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
for ( String name : commonContigs ) {
SAMSequenceRecord elt1 = dict1.getSequence(name);
SAMSequenceRecord elt2 = dict2.getSequence(name);
if ( ! sequenceRecordsAreEquivalent(elt1, elt2) )
return Arrays.asList(elt1,elt2);
}
return null;
}
/**
* Helper routine that returns whether two sequence records are equivalent, defined as having the same name and
* lengths.
*
* NOTE: we allow the lengths to differ if one or both are UNKNOWN_SEQUENCE_LENGTH
*
* @param first first sequence record to compare
* @param second second sequence record to compare
* @return true if first and second have the same names and lengths, otherwise false
*/
public static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord first, final SAMSequenceRecord second) {
if ( first == second ) {
return true;
}
if ( first == null || second == null ) {
return false;
}
final int length1 = first.getSequenceLength();
final int length2 = second.getSequenceLength();
if (length1 != length2 && length1 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH && length2 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH){
return false;
}
if (! first.getSequenceName().equals(second.getSequenceName())){
return false;
}
return true;
}
/**
* A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18, hg19, b36, or b37) and if it's
* lexicographically sorted. Works by matching lengths of the static chr1, chr10, and chr2, and then if these
* are all matched, requiring that the order be chr1, chr2, chr10.
*
* @param dict
* @return
*/
private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) {
SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null;
for ( SAMSequenceRecord elt : dict.getSequences() ) {
if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19, CHR1_B36, CHR1_B37) ) chr1 = elt;
if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19, CHR2_B36, CHR2_B37) ) chr2 = elt;
if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19, CHR10_B36, CHR10_B37) ) chr10 = elt;
}
if ( chr1 != null && chr2 != null && chr10 != null) {
return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() );
}
return false;
}
/**
* Trivial helper that returns true if elt has the same name and length as rec1 or rec2
* @param elt record to test
* @param recs the list of records to check for name and length equivalence
* @return true if elt has the same name and length as any of the recs
*/
private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord... recs) {
for (SAMSequenceRecord rec : recs) {
if (elt.getSequenceLength() == rec.getSequenceLength() && elt.getSequenceName().equals(rec.getSequenceName())) {
return true;
}
}
return false;
}
/**
* Returns true if the common contigs in dict1 and dict2 are in the same relative order, without regard to
* absolute index position. This is accomplished by getting the common contigs in both dictionaries, sorting
* these according to their indices, and then walking through the sorted list to ensure that each ordered contig
* is equivalent
*
* @param commonContigs names of the contigs common to both dictionaries
* @param dict1 first SAMSequenceDictionary
* @param dict2 second SAMSequenceDictionary
* @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false
*/
private static boolean commonContigsAreInSameRelativeOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
final List list1 = getSequencesOfName(commonContigs, dict1);
final List list2 = getSequencesOfName(commonContigs, dict2);
list1.sort(SEQUENCE_INDEX_ORDER);
list2.sort(SEQUENCE_INDEX_ORDER);
for ( int i = 0; i < list1.size(); i++ ) {
SAMSequenceRecord elt1 = list1.get(i);
SAMSequenceRecord elt2 = list2.get(i);
if ( ! elt1.getSequenceName().equals(elt2.getSequenceName()) )
return false;
}
return true;
}
/**
* Gets the subset of SAMSequenceRecords in commonContigs in dict
*
* @param commonContigs
* @param dict
* @return
*/
private static List getSequencesOfName(Set commonContigs, SAMSequenceDictionary dict) {
List l = new ArrayList<>(commonContigs.size());
for ( String name : commonContigs ) {
l.add(dict.getSequence(name) );
}
return l;
}
/**
* Checks whether the common contigs in the given sequence dictionaries occur at the same indices
* in both dictionaries
*
* @param commonContigs Set of names of the contigs that occur in both dictionaries
* @param dict1 first sequence dictionary
* @param dict2 second sequence dictionary
* @return true if the contigs common to dict1 and dict2 occur at the same indices in both dictionaries,
* otherwise false
*/
private static boolean commonContigsAreAtSameIndices( final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) {
for ( String commonContig : commonContigs ) {
SAMSequenceRecord dict1Record = dict1.getSequence(commonContig);
SAMSequenceRecord dict2Record = dict2.getSequence(commonContig);
// Each common contig must have the same index in both dictionaries
if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) {
return false;
}
}
return true;
}
/**
* Returns the set of contig names found in both dicts.
* @param dict1
* @param dict2
* @return
*/
public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
Set intersectingSequenceNames = getContigNames(dict1);
intersectingSequenceNames.retainAll(getContigNames(dict2));
return intersectingSequenceNames;
}
public static Set getContigNames(SAMSequenceDictionary dict) {
Set contigNames = new LinkedHashSet(Utils.optimumHashSize(dict.size()));
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
contigNames.add(dictionaryEntry.getSequenceName());
return contigNames;
}
public static List getContigNamesList(final SAMSequenceDictionary refSeqDict) {
Utils.nonNull(refSeqDict, "provided reference sequence ditionary is null");
return refSeqDict.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList());
}
/**
* Returns a compact String representation of the sequence dictionary it's passed
*
* The format of the returned String is:
* [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... ]
*
* @param dict a non-null SAMSequenceDictionary
* @return A String containing all of the contig names and lengths from the sequence dictionary it's passed
*/
public static String getDictionaryAsString( final SAMSequenceDictionary dict ) {
Utils.nonNull(dict, "Sequence dictionary must be non-null");
StringBuilder s = new StringBuilder("[ ");
for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) {
s.append(dictionaryEntry.getSequenceName());
s.append("(length:");
s.append(dictionaryEntry.getSequenceLength());
s.append(") ");
}
s.append("]");
return s.toString();
}
}