org.broadinstitute.hellbender.tools.reference.ReferenceSequenceTable Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.reference;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;

import java.util.*;

/**
 * Table utilized by CompareReferences tool to compare and analyze sequences found in specified references.
 * Underlying table is keyed by MD5, with the ability to query by sequence name across references as well.
 */
public class ReferenceSequenceTable implements Iterable {

    public static final Logger logger = LogManager.getLogger(ReferenceSequenceTable.class);
    public static final String MISSING_ENTRY_DISPLAY_STRING = "---";
    public static final String MD5_COLUMN_NAME = "MD5";
    public static final String LENGTH_COLUMN_NAME = "Length";
    private static final int MD5_COLUMN_INDEX = 0;
    private static final int LENGTH_COLUMN_INDEX = 1;

    private Map tableByMD5;
    private Map> tableBySequenceName;
    private Map referenceDictionaries;
    private List columnNames;
    private CompareReferences.MD5CalculationMode md5Mode;
    private final Map columnIndices;
    private List references;
    private boolean tableBuilt;


    private ReferenceSequenceTable(CompareReferences.MD5CalculationMode mode, Map dictionaries) {
        referenceDictionaries = dictionaries;
        columnNames = generateColumnNames();
        md5Mode = mode;
        references = new ArrayList<>(referenceDictionaries.keySet());
        tableBuilt = false;

        columnIndices = new HashMap<>();
        for (int i = 0; i < columnNames.size(); i++) {
            if(i == MD5_COLUMN_INDEX){
                columnIndices.put(MD5_COLUMN_NAME, i);
            }
            else if(i == LENGTH_COLUMN_INDEX){
                columnIndices.put(LENGTH_COLUMN_NAME, i);
            }
            else{
                columnIndices.put(getReferenceColumnName(references.get(i-2)), i);
            }
        }
    }

    /**
     * Returns a ReferenceSequenceTable object.
     *
     * Note: table is not constructed until a call to build() is made. All methods which attempt to use
     * the table before it has been constructed will crash.
     *
     * @param referenceSources map containing references and their data sources
     * @param mode MD5 calculation mode
     */
    public ReferenceSequenceTable(Map referenceSources, CompareReferences.MD5CalculationMode mode){
        this(mode, extractDictionariesFromReferences(referenceSources));
    }

    /**
     * Constructs a ReferenceSequenceTable from a collection of sequence dictionaries.
     *
     * Since the sequence dictionaries provided to this constructor may come from sources
     * other than references (such as CRAMs, VCFs, etc.), we force MD5CalculationMode.USE_DICT
     * mode here, since we can't necessarily go back to the original file to recalculate MD5s. The
     * caller is responsible for making sure that all dictionaries passed in have an MD5 for all
     * sequences, otherwise an error will occur during table build.
     *
     * Note: table is not constructed until a call to build() is made. All methods which attempt to use
     * the table before it has been constructed will crash.
     *
     * @param referenceDictionaries
     */
    public ReferenceSequenceTable(Map referenceDictionaries){
        this(CompareReferences.MD5CalculationMode.USE_DICT, referenceDictionaries);
    }

    private static Map extractDictionariesFromReferences(Map referenceSources){
        Map dictionaries = new LinkedHashMap<>();
        for (Map.Entry entry : referenceSources.entrySet()) {
            SAMSequenceDictionary dictionary = entry.getValue().getSequenceDictionary();
            dictionaries.put(entry.getKey(), dictionary);
        }
        return dictionaries;
    }

    private List generateColumnNames() {
        List columns = new ArrayList<>();
        columns.add(MD5_COLUMN_NAME);
        columns.add(LENGTH_COLUMN_NAME);
        for (GATKPath path : referenceDictionaries.keySet()) {
            columns.add(getReferenceColumnName(path));
        }
        return columns;
    }

    public List getColumnNames() {
        return columnNames;
    }

    /**
     * Given a GATKPath, return the name of the file as a String.
     *
     * TODO: may need to add functionality to uniquify reference names
     *
     * @param reference The path to a reference.
     * @return the name of the reference as a String.
     */
    public static String getReferenceColumnName(GATKPath reference){
        return reference.toPath().getFileName().toString();
    }

    public Map getColumnIndices() {
        return columnIndices;
    }

    private void checkTableBuildStatus(){
        if(!tableBuilt){
            throw new IllegalStateException("Table has not been built yet.");
        }
    }

    /**
     * Construct 2 tables of references: one keyed by MD5, one keyed by sequence name.
     */
    public void build() {
        tableByMD5 = new LinkedHashMap<>();
        tableBySequenceName = new LinkedHashMap<>();

        for (Map.Entry entry : referenceDictionaries.entrySet()) {
            SAMSequenceDictionary dictionary = entry.getValue();
            for (SAMSequenceRecord record : dictionary.getSequences()) {
                String name = record.getSequenceName();
                int length = record.getSequenceLength();
                String md5 = getMD5ForRecord(record, entry.getKey());
                GATKPath reference = entry.getKey();
                TableEntry newEntry = new TableEntry(getReferenceColumnName(reference), name);
                TableRow newRow = null;

                // map each MD5 to List of TableEntry objects containing length, md5, and name
                if (!tableByMD5.containsKey(md5)) {
                    newRow = new TableRow(md5, length, new ArrayList<>(referenceDictionaries.keySet()));
                    tableByMD5.put(md5, newRow);
                }
                tableByMD5.get(md5).add(newEntry);

                if(!tableBySequenceName.containsKey(name)){
                    Set rowSet = new HashSet<>();
                    tableBySequenceName.put(name, rowSet);
                }
                tableBySequenceName.get(name).add(tableByMD5.get(md5));
            }
        }
        tableBuilt = true;
    }

    public Set getAllSequenceNames(){
        checkTableBuildStatus();

        return tableBySequenceName.keySet();
    }

    /**
     * Given an MD5, returns its corresponding row
     *
     * @param md5 The MD5 as a String
     * @return the corresponding TableRow from the tableByMD5
     */
    public TableRow queryByMD5(String md5){
        checkTableBuildStatus();

        return tableByMD5.get(md5);
    }

    /**
     * Given a sequence name, returns the set of its corresponding rows or an empty set if no such rows found.
     *
     * @param sequenceName The sequence name as a String
     * @return the set of TableRows that contain the sequence name
     */
    public Set queryBySequenceName(String sequenceName){
        checkTableBuildStatus();

        Set rows = tableBySequenceName.get(sequenceName);
        return rows == null ? Collections.emptySet() : rows;
    }

    private String getMD5ForRecord(SAMSequenceRecord record, GATKPath referencePath) {
        final String md5FromDict = record.getMd5();
        final SimpleInterval referenceInterval = new SimpleInterval(record.getSequenceName(), 1, record.getSequenceLength());
        String md5 = null;

        switch (md5Mode) {
            case USE_DICT:
                if (md5FromDict == null || md5FromDict.isEmpty()) {
                    throw new UserException.BadInput("Running in USE_DICT mode, but MD5 missing for sequence " + record.getSequenceName() + ". Run --md5-calculation-mode with a different mode to recalculate MD5.");
                }
                md5 = md5FromDict;
                break;
            case RECALCULATE_IF_MISSING:
                if (md5FromDict == null || md5FromDict.isEmpty()) {
                    logger.warn(String.format("%s in sequence dictionary for %s missing MD5. MD5 recalculated.", record.getSequenceName(), getReferenceColumnName(referencePath)));
                    md5 = ReferenceUtils.calculateMD5(referencePath, referenceInterval);
                }
                else {
                    md5 = md5FromDict;
                }
                break;
            case ALWAYS_RECALCULATE:
                md5 = ReferenceUtils.calculateMD5(referencePath, referenceInterval);
                if(md5FromDict != null && !md5FromDict.equals(md5)){
                    logger.warn(String.format("MD5 Mismatch for sequence %s in %s reference. Found '%s' in sequence dictionary, but calculated '%s'. Sequence dictionary may be invalid.", record.getSequenceName(), getReferenceColumnName(referencePath), md5FromDict, md5));
                }
                break;
        }
        return md5;
    }

    /**
     * Generate ReferencePairs for pairwise comparison of all references present in the table
     *
     * @return the list of ReferencePairs for every pair of references
     */
    public final List generateReferencePairs(){
        List referencePairs = new ArrayList<>();
        for(int i = 0; i < references.size(); i++){
            for(int j = i + 1; j < references.size(); j++){
                referencePairs.add(new ReferencePair(this, references.get(i), references.get(j)));
            }
        }
        return referencePairs;
    }

    /**
     * Generate ReferencePairs for comparison of all references present in the table against a provided dictionary.
     *
     * @return the list of ReferencePairs
     */
    public final List generateReferencePairsAgainstDictionary(GATKPath dictionary){
        List referencePairs = new ArrayList<>();
        for(int i = 0; i < references.size(); i++){
            GATKPath currRef = references.get(i);
            if(!currRef.equals(dictionary)){
                referencePairs.add(new ReferencePair(this, dictionary, currRef));
            }
        }
        return referencePairs;
    }

    /** Create list of ReferencePair for pairwise comparison of references in the table and analyze each pair.
     *
     * @return list of ReferencePairs in the table with updated status sets
     */
    public List compareAllReferences(){
        List refPairs = generateReferencePairs();
        return analyzeTable(refPairs);
    }

    /** Create list ReferencePair for comparison of references in the table against a key dictionary and analyze each pair.
     *
     * @param dictionary key dictionary
     * @return list of ReferencePairs keyed against the provided dictionary with updated status sets
     */
    public List compareAgainstKeyReference(GATKPath dictionary){
        List refPairs = generateReferencePairsAgainstDictionary(dictionary);
        return analyzeTable(refPairs);
    }

    /**
     * Analyze the table by doing a pairwise comparison for all table references. Analyzes each ReferencePair
     * in the list passed in. First, traverses the table by MD5 and checks for any discrepancies across the column values
     * that violate an exact match or a difference in sequence name. Then traverses by sequence name and checks
     * each sequence name across the references to determine if the sequence name is present in both but corresponding to
     * different sequences or missing in one, and if so, if it constitutes a superset/subset case.
     *
     * Each ReferencePair is analyzed as above and assigned an analysis as a set of the following statuses:
     *      EXACT_MATCH,
     *      DIFFER_IN_SEQUENCE_NAMES,
     *      DIFFER_IN_SEQUENCE,
     *      DIFFER_IN_SEQUENCES_PRESENT,
     *      SUPERSET,
     *      SUBSET
     * @param refPairs list of ReferencePairs in the table
     * @return list of ReferencePairs with updated status sets
     */
    private List analyzeTable(List refPairs){
        checkTableBuildStatus();

        for(TableRow row : tableByMD5.values()) {
            for(ReferencePair pair : refPairs) {
                int ref1Index = pair.getRef1ColumnIndex();
                int ref2Index = pair.getRef2ColumnIndex();
                TableEntry[] entries = row.getEntries();

                TableEntry ref1Value = entries[ref1Index];
                TableEntry ref2Value = entries[ref2Index];
                if (!ref1Value.getColumnValue().equals(ref2Value.getColumnValue())) {
                    pair.removeStatus(ReferencePair.Status.EXACT_MATCH);
                }
                if(!ref1Value.getColumnValue().equals(ref2Value.getColumnValue()) &&
                        !(ref1Value.isEmpty() || ref2Value.isEmpty())){
                    pair.addStatus(ReferencePair.Status.DIFFER_IN_SEQUENCE_NAMES);
                }
            }
        }

        for(ReferencePair pair : refPairs){
            boolean superset = false;
            boolean subset = false;
            for(String sequenceName : tableBySequenceName.keySet()){
                int ref1Index = pair.getRef1ColumnIndex();
                int ref2Index = pair.getRef2ColumnIndex();
                Set rows = queryBySequenceName(sequenceName);
                int sequenceNameFoundInOneRef = 0;

                for(TableRow row : rows) {
                    TableEntry[] entries = row.getEntries();
                    TableEntry ref1Value = entries[ref1Index];
                    TableEntry ref2Value = entries[ref2Index];

                    if((ref1Value.getColumnValue().equals(sequenceName) ^ ref2Value.getColumnValue().equals(sequenceName))
                            && (ref1Value.isEmpty() ^ ref2Value.isEmpty())){
                        sequenceNameFoundInOneRef++;
                    }
                    if(ref1Value.isEmpty() ^ ref2Value.isEmpty()){
                        if(ref1Value.isEmpty()){
                            subset = true;
                        }
                        else if(ref2Value.isEmpty()){
                            superset = true;
                        }
                    }
                }

                // if 2 difference sequences with same name found, mismatching sequences for that name
                if(sequenceNameFoundInOneRef == 2){
                    pair.addStatus(ReferencePair.Status.DIFFER_IN_SEQUENCE);
                // if a sequence name is found only once, it is present in one reference but not the other
                } else if(sequenceNameFoundInOneRef == 1){
                    pair.addStatus(ReferencePair.Status.DIFFER_IN_SEQUENCES_PRESENT);
                } else if (sequenceNameFoundInOneRef > 2){
                    throw new UserException.BadInput(String.format("Duplicate of sequence '%s' found in %s or %s.", sequenceName, pair.getRef1AsString(), pair.getRef2AsString()));
                }
            }

            // if a pair has exclusively superset or subset marked as true and no naming discrepancies found,
            // mark it as super/sub set and remove the 'DIFFER_IN_SEQUENCES_PRESENT' tag
            if(superset ^ subset && !pair.getAnalysis().contains(ReferencePair.Status.DIFFER_IN_SEQUENCE_NAMES)){
                pair.removeStatus(ReferencePair.Status.DIFFER_IN_SEQUENCES_PRESENT);

                if(superset && !subset){
                    pair.addStatus(ReferencePair.Status.SUPERSET);
                } else if(subset && !superset) {
                    pair.addStatus(ReferencePair.Status.SUBSET);
                }
            }
        }
        return refPairs;
    }

    @Override
    public Iterator iterator() {
        checkTableBuildStatus();

        return tableByMD5.values().iterator();
    }

    /**
     * Minimal class representing a table entry within a row of the ReferenceSequenceTable.
     * Stores the entry's value and the associated column.
     */
    public class TableEntry {

        private final String columnName;
        private final String columnValue;

        public TableEntry(String columnName, String columnValue){
            this.columnName = columnName;
            this.columnValue = columnValue;
        }

        public String getColumnName() { return columnName; }
        public String getColumnValue() { return columnValue; }

        public boolean isEmpty(){
            return columnValue.equals(MISSING_ENTRY_DISPLAY_STRING);
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            TableEntry that = (TableEntry) o;
            return columnName.equals(that.columnName) && columnValue.equals(that.columnValue);
        }

        @Override
        public int hashCode() {
            return Objects.hash(columnName, columnValue);
        }
    }

    /**
     * Class representing a row of the ReferenceSequenceTable. Rows are keyed by MD5 and the remaining columns
     * contain the sequence length, followed by the sequence name across each reference.
     */
    public class TableRow {
        private final String md5;
        private final TableEntry[] entries;
        private final int length;

        public TableRow(String md5, int length, List references) {
            this.md5 = md5;
            this.length = length;
            entries = new TableEntry[references.size()+2];

            add(new TableEntry(MD5_COLUMN_NAME, md5));
            add(new TableEntry(LENGTH_COLUMN_NAME, Integer.toString(length)));

            for(int i = LENGTH_COLUMN_INDEX + 1; i < entries.length; i++){
                entries[i] = new TableEntry(getColumnNames().get(i), MISSING_ENTRY_DISPLAY_STRING);
            }
        }

        public void add(TableEntry entry) {
            int idx = columnIndices.get(entry.getColumnName());
            entries[idx] = entry;
        }

        public String getMd5() {
            return md5;
        }

        public TableEntry[] getEntries() {
            return entries;
        }

        public int getLength() {
            return length;
        }

        public List getColumnNames() {
            return columnNames;
        }

        // TableRows keyed by MD5 -- TableRows considered equal when they have matching MD5s
        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            TableRow tableRow = (TableRow) o;
            return md5.equals(tableRow.md5);
        }

        @Override
        public int hashCode() {
            return Objects.hash(md5);
        }
    }
}