org.broadinstitute.hellbender.tools.reference.CompareReferences Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.reference;

import htsjdk.samtools.reference.FastaReferenceWriter;
import htsjdk.samtools.reference.FastaReferenceWriterBuilder;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.FilterFuncotations;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.alignment.MummerExecutor;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.text.XReadLines;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
import org.broadinstitute.hellbender.utils.tsv.TableWriter;
import picard.cmdline.programgroups.ReferenceProgramGroup;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

/**
 * Display reference comparison as a tab-delimited table and summarize reference differences.
 *
 * This tool generates a MD5-keyed table comparing specified references and does an analysis to summarize the differences
 * between the references provided. Comparisons are made against a "special" reference, specified with the -R
 * argument. Subsequent references to be compared may be specified using the -references-to-compare argument.
 * The table can be directed to a file or standard output using provided command-line arguments.
 * A supplementary table keyed by sequence name can be displayed using the -display-sequences-by-name argument;
 * to display only sequence names for which the references are not consistent, run with the -display-only-differing-sequences
 * argument as well.
 *
 * Input
 * 
 * The references and MD5 calculation mode.
 * 
 *
 * Output
 * 
 * A TSV file or output stream displaying the MD5-keyed table comparison, and a table analysis displayed to standard output.
 * 
 * MD5	Length	hg19mini.fasta	hg19mini_1renamed.fasta
 * 8c0c38e352d8f3309eabe4845456f274	16000	1	chr1
 * 5f8388fe3fb34aa38375ae6cf5e45b89	16000	2	2
 * 94de808a3a2203dbb02434a47bd8184f	16000	3	3
 * 7d397ee919e379328d8f52c57a54c778	16000	4	4
 *
 * REFERENCE PAIR: hg19mini.fasta, hg19mini_1renamed.fasta
 * Status:
 * 	DIFFER_IN_SEQUENCE_NAMES
 * 
 * 
 *
 * Usage example
 *  * gatk CompareReferences \
 *   -R reference1.fasta \
 *   -refcomp reference2.fasta \
 *   -O output.fasta \
 *   -md5-calculation-mode USE_DICT
 * 
 *
 */

@CommandLineProgramProperties(
        summary = "Compare multiple references and output a tab-delimited table detailing what the differences are and a summarized analysis of each pair of references.",
        oneLineSummary = "Display reference comparison as a tab-delimited table and summarize reference differences.",
        programGroup = ReferenceProgramGroup.class
)

@DocumentedFeature
@ExperimentalFeature
public class CompareReferences extends GATKTool {



    @Argument(fullName = "references-to-compare", shortName = "refcomp", doc = "Reference sequence file(s) to compare.")
    private List references;

    /**
     * Output file will be written here.
     *
     * Note: If no output file provided, table will print to standard output.
     */
    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "If specified, output reference sequence table in TSV format to this file. Otherwise print it to stdout.", optional = true)
    private GATKPath output;

    @Argument(fullName = "md5-calculation-mode", shortName = "md5-calculation-mode", doc = "MD5CalculationMode indicating method of MD5 calculation.", optional = true)
    private MD5CalculationMode md5CalculationMode = MD5CalculationMode.RECALCULATE_IF_MISSING;

    @Argument(fullName = "display-sequences-by-name", doc = "If provided, the table by sequence name will be printed.", optional = true)
    private boolean displaySequencesByName = false;

    @Argument(fullName = "display-only-differing-sequences", doc = "If provided, only display sequence names that differ in their actual sequence.", optional = true)
    private boolean onlyDisplayDifferingSequences = false;

    @Argument(fullName = "base-comparison", doc = "Mode for base-level comparisons. Default is off, but can do full alignment of mismatching sequences to find SNPs and INDELs (FULL_ALIGNMENT), or detect SNPs only in mismatching sequences of the same length (FIND_SNPS_ONLY).", optional = true)
    private BaseComparisonMode baseComparisonMode = BaseComparisonMode.NO_BASE_COMPARISON;

    @Argument(fullName = "base-comparison-output", doc = "Output directory for base comparison outputs. Required for running base-comparison in FULL_ALIGNMENT or FIND_SNPS_ONLY mode.", optional = true)
    private GATKPath baseComparisonOutputDirectory;


    public enum MD5CalculationMode {
        // use only MD5s found in dictionary; if MD5 missing, crashes
        USE_DICT,
        // use any MD5s found in dictionary and recalculate any missing MD5s
        RECALCULATE_IF_MISSING,
        // recalculate all MD5s, regardless of presence in dictionary
        ALWAYS_RECALCULATE;
    }

    public enum BaseComparisonMode{
        // no base comparison
        NO_BASE_COMPARISON,
        // run the mummer pipeline to generate a VCF file containing SNPs and INDELs for any mismatching sequences of same name
        FULL_ALIGNMENT,
        // do a base-by-base comparison of any same-name, same-length mismatching sequences to output a table containing each base mismatch
        FIND_SNPS_ONLY
    }

    private Map referenceSources;

    @Override
    public boolean requiresReference() {
        return true;
    }

    @Override
    public void onTraversalStart() {
        // add data source for -R reference
        referenceSources = new LinkedHashMap<>();
        referenceSources.put(referenceArguments.getReferenceSpecifier(), directlyAccessEngineReferenceDataSource());

        // add data sources for remaining references
        for(GATKPath path : references){
            referenceSources.put(path, ReferenceDataSource.of(path.toPath()));
        }

        // must have exactly 2 references to run either base comparison mode
        if(baseComparisonMode != BaseComparisonMode.NO_BASE_COMPARISON){
            if(baseComparisonOutputDirectory == null) {
                throw new UserException.CouldNotCreateOutputFile(baseComparisonOutputDirectory, "Output directory not provided but required in -base-comparison "  +  baseComparisonMode + " mode.");
            }
            if(!Files.exists(baseComparisonOutputDirectory.toPath())){
                throw new UserException.CouldNotCreateOutputFile(baseComparisonOutputDirectory, "Output directory non-existent.");
            }
            if(referenceSources.size() != 2) {
                throw new UserException.BadInput("Base comparison modes can only be run on 2 references.");
            }
        }
    }

    @Override
    public void traverse(){
        ReferenceSequenceTable table = new ReferenceSequenceTable(referenceSources, md5CalculationMode);
        logger.info("Building reference sequence table.");
        table.build();
        logger.info("Finished building table.");

        writeTable(table);

        if(displaySequencesByName){
            writeTableBySequenceName(table);
        }

        logger.info("Analyzing table.");
        List referencePairs = table.compareAllReferences();
        logger.info("Finished analyzing table.");
        System.out.println("*********************************************************");
        for(ReferencePair pair : referencePairs){
            System.out.println(pair);
        }

        ReferencePair refPair = referencePairs.get(0);
        switch (baseComparisonMode) {
            case FULL_ALIGNMENT:
                runFullAlignment(refPair, table);
                break;
            case FIND_SNPS_ONLY:
                runFindSNPS(refPair, table);
                break;
        }
    }


    /**
     * Given a table, write table to output.
     *
     * Note: if no output file specified, displays to standard output.
     *
     * @param table
     */
    private void writeTable(ReferenceSequenceTable table){
        TableColumnCollection columns = new TableColumnCollection(table.getColumnNames());
        try(CompareReferences.CompareReferencesOutputTableWriter writer = output == null
            ? new CompareReferences.CompareReferencesOutputTableWriter(new OutputStreamWriter(System.out), columns)
            : new CompareReferences.CompareReferencesOutputTableWriter(output.toPath(), columns)
        ){
            writer.writeHeaderIfApplies();
            for(ReferenceSequenceTable.TableRow row : table){
                writer.writeRecord(row);
            }
        }
        catch(IOException exception){
             throw (output == null)
             ? new UserException.CouldNotCreateOutputFile("System.out", "Failed to write output table.", exception)
             : new UserException.CouldNotCreateOutputFile(output, "Failed to write output table.", exception);
        }
    }

    /**
     * Given a table, write table by sequence name to standard output
     *
     * @param table
     */
    public void writeTableBySequenceName(ReferenceSequenceTable table){
        System.out.print("*********************************************************\n");
        System.out.print("Name \tMD5 \tReference\n");
        for(String sequenceName : table.getAllSequenceNames()){
            Set rows = table.queryBySequenceName(sequenceName);
            if(onlyDisplayDifferingSequences) {
                if(rows.size() > 1) {
                    displayBySequenceName(rows, sequenceName);
                }
            } else {
                displayBySequenceName(rows, sequenceName);
            }
        }
    }

    private void displayBySequenceName(Set rows, String sequenceName){
        System.out.print(sequenceName);
        for(ReferenceSequenceTable.TableRow row : rows) {
            ReferenceSequenceTable.TableEntry[] entries = row.getEntries();
            System.out.print("\n\t" + row.getMd5() + "\t");

            for(int i = 2; i < entries.length; i++) {
                if(entries[i].getColumnValue().equals(sequenceName)) {
                    System.out.print(entries[i].getColumnName() + "\t");
                }
            }
        }
        System.out.println();
    }

    /**
     * Method to execute a full alignment of 2 references. Detects same-named mismatching sequences and extracts
     * the sequence in each reference into temp fasta files. Creates a MummerExecutor to run the MUMmer alignment
     * pipeline on mismatching sequences, which produces a .snps file with SNPs and INDELs for each sequence.
     * Merges individual sequence .snps files, and then converts to a VCF containing all alignment
     * information for all mismatching sequences in the references.
     *
     * @param refPair pair of references to be aligned
     * @param table
     */
    private void runFullAlignment(ReferencePair refPair, ReferenceSequenceTable table){
        List snpsFiles = new ArrayList<>();
        if(refPair.getAnalysis().contains(ReferencePair.Status.DIFFER_IN_SEQUENCE)) {
            // find the mismatch sequence
            for (String sequenceName : table.getAllSequenceNames()) {
                Set rows = table.queryBySequenceName(sequenceName);
                // rows.size()==2 indicates that 2 MD5s found with the same name
                if (rows.size() == 2) {
                    // generate fasta files for mismatching sequences
                    GATKPath ref1Path = refPair.getRef1();
                    GATKPath ref2Path = refPair.getRef2();

                    String sequenceInRef1Name = ref1Path.toPath().getFileName() + "." + sequenceName;
                    String sequenceInRef2Name = ref2Path.toPath().getFileName() + "." + sequenceName;

                    File ref1TempFastaOutput = IOUtils.createTempFile(sequenceInRef1Name, ".fasta");
                    File ref2TempFastaOutput = IOUtils.createTempFile(sequenceInRef2Name, ".fasta");

                    GATKPath ref1Fasta = generateFastaForSequence(ref1Path, sequenceName, new GATKPath(ref1TempFastaOutput.toString()));
                    GATKPath ref2Fasta = generateFastaForSequence(ref2Path, sequenceName, new GATKPath(ref2TempFastaOutput.toString()));

                    // pass fastas into mummer, get back a vcf and add to list
                    MummerExecutor executor = new MummerExecutor();
                    logger.info("Running mummer alignment on sequence " + sequenceName);
                    File tempSnpsDirectory = IOUtils.createTempDir("tempsnps");
                    File mummerOutput = executor.executeMummer(ref1Fasta.toPath().toFile(), ref2Fasta.toPath().toFile(), tempSnpsDirectory);
                    logger.info("Finished running mummer alignment on sequence " + sequenceName);
                    snpsFiles.add(mummerOutput);
                }
            }
            // merge individual snps files
            File mergedSnpsFile = mergeSnpsFiles(refPair, snpsFiles);

            // convert merged snps file to VCF
            File vcf = convertSnpsFileToVCF(refPair, mergedSnpsFile);
        }
        else{
            logger.info("No mismatching sequences found.");
        }
    }

    private File mergeSnpsFiles(ReferencePair refPair, List snpsFiles){
        File mergedSnpsFile = IOUtils.createTempFile(String.format("%s_%s", refPair.getRef1AsString(), refPair.getRef2AsString()), ".snps");
        try (PrintWriter writer = new PrintWriter(mergedSnpsFile)) {
            for (File file : snpsFiles) {
                try (XReadLines reader = new XReadLines(file)) {
                    for (String line : reader) {
                        writer.write(line + "\n");
                    }
                } catch (IOException e) {
                    throw new UserException("Error merging show-snps outputs.", e);
                }
            }
            return mergedSnpsFile;
        }
        catch(IOException e){
            throw new UserException("Error writing show-snps output file.", e);
        }
    }

    private File convertSnpsFileToVCF(ReferencePair refPair, File mergedSnpsFile){
        File vcf = new File(baseComparisonOutputDirectory.toPath().toString(), String.format("%s_%s.vcf", refPair.getRef1AsString(), refPair.getRef2AsString()));
        try(XReadLines reader = new XReadLines(mergedSnpsFile);
            VariantContextWriter writer = createVCFWriter(vcf);
            ReferenceDataSource source = ReferenceDataSource.of(refPair.getRef1().toPath())) {

            VCFHeader header = new VCFHeader();
            header.setSequenceDictionary(getReferenceDictionary());
            writer.writeHeader(header);
            MummerIndel currentIndel = null;
            int previousPos = -1;
            for (String line : reader) {
                String[] fields = line.split("\\t", -1);
                String contig = fields[12];
                int pos = Integer.valueOf(fields[0]);
                String ref = fields[1];
                String alt = fields[2];

                // insertion
                if (ref.equals(".") && !alt.equals(".")) {
                    if (pos == previousPos && currentIndel != null && currentIndel.isInsertion) {
                        currentIndel.alt += alt;
                    } else {
                        if (currentIndel != null) {
                            writer.add(currentIndel.getAsVCFRecord());
                        }
                        String refPadding = new String(source.queryAndPrefetch(new SimpleInterval(contig, pos-1, pos-1)).getBases());
                        currentIndel = new MummerIndel(contig, refPadding, refPadding + alt, pos-1, true, false);
                    }
                }
                // deletion
                else if(!ref.equals(".") && alt.equals(".")){
                    if(pos == previousPos+1 && currentIndel != null && currentIndel.isDeletion){
                        currentIndel.ref += ref;
                    }
                    else {
                        if(currentIndel != null) {
                            writer.add(currentIndel.getAsVCFRecord());
                        }
                        String refPadding = new String(source.queryAndPrefetch(new SimpleInterval(contig, pos-1, pos-1)).getBases());
                        currentIndel = new MummerIndel(contig, refPadding + ref, refPadding, pos-1, false, true);
                    }
                }
                // snp
                else {
                    if(currentIndel != null){
                        writer.add(currentIndel.getAsVCFRecord());
                        currentIndel = null;
                    }
                    VariantContextBuilder vcfBuilder = new VariantContextBuilder();
                    VariantContext record = vcfBuilder.chr(contig).start(pos).stop(pos).alleles(ref, alt).make();
                    writer.add(record);
                }
                previousPos = pos;
            }
            if(currentIndel != null){
                writer.add(currentIndel.getAsVCFRecord());
            }
            return vcf;
        }
        catch(IOException e){
            throw new UserException("Error converting snps file to VCF.", e);
        }
        catch(NumberFormatException e){
            throw new UserException("Expected integer position, but found non-integer value.", e);
        }
    }

    /**
     * Method to do base-by-base comparison of a pair of references. Detects mismatching sequence
     *
     * @param refPair pair of references to be aligned
     * @param table
     */
    private void runFindSNPS(ReferencePair refPair, ReferenceSequenceTable table) {
        if(refPair.getAnalysis().contains(ReferencePair.Status.DIFFER_IN_SEQUENCE)) {
            TableColumnCollection columns = new TableColumnCollection(Arrays.asList("Sequence Name", "Position", refPair.getRef1AsString(), refPair.getRef2AsString()));
            File snpsOutput = new File(baseComparisonOutputDirectory.toPath().toString(), String.format("%s_%s_snps.tsv", refPair.getRef1AsString(), refPair.getRef2AsString()));

            try(FindSNPsOnlyTableWriter writer = new FindSNPsOnlyTableWriter(snpsOutput.toPath(), columns); final ReferenceDataSource source1 = ReferenceDataSource.of(refPair.getRef1().toPath(), true);
                final ReferenceDataSource source2 = ReferenceDataSource.of(refPair.getRef2().toPath(), true)){
                writer.writeHeaderIfApplies();

                // find the mismatch sequence
                for (String sequenceName : table.getAllSequenceNames()) {
                    Set rows = table.queryBySequenceName(sequenceName);
                    // rows.size()==2 indicates that 2 MD5s found with the same name
                    if (rows.size() == 2) {
                        // if the lengths of the 2 sequences aren't equal, error - can't compare different sequence lengths, probably indel need alignment
                        ReferenceSequenceTable.TableRow[] rowArray = rows.toArray(new ReferenceSequenceTable.TableRow[0]);
                        if (rowArray[0].getLength() != rowArray[1].getLength()) {
                            logger.warn("Lengths for sequence " + sequenceName  + " are not equal and can't be compared. Consider running in FULL_ALIGNMENT mode.");
                            continue;
                        }

                        int sequenceLength = rowArray[0].getLength();
                        Iterator ref1BaseIterator = source1.query(new SimpleInterval(sequenceName, 1, sequenceLength));
                        Iterator ref2BaseIterator = source2.query(new SimpleInterval(sequenceName, 1, sequenceLength));
                        int position = 0;

                        // find SNPS and generate SNPRecords
                        while (ref1BaseIterator.hasNext() && ref2BaseIterator.hasNext()) {
                            position++;
                            Byte ref1Allele = ref1BaseIterator.next();
                            Byte ref2Allele = ref2BaseIterator.next();
                            if (!ref1Allele.equals(ref2Allele)) {
                                SNPRecord record = new SNPRecord(sequenceName, position, new String(new byte[]{ref1Allele}), new String(new byte[]{ref2Allele}), refPair.getRef1AsString(), refPair.getRef2AsString());
                                writer.writeRecord(record);
                            }
                        }
                        if(ref1BaseIterator.hasNext() || ref2BaseIterator.hasNext()){
                            throw new GATKException.ShouldNeverReachHereException("");
                        }
                    }
                }
            }
            catch(IOException exception){
                throw new UserException.CouldNotCreateOutputFile(baseComparisonOutputDirectory + "/" + snpsOutput, "Failed to write output table.", exception);
            }
        }
        else{
            logger.info("No mismatching sequences found.");
        }
    }

    /**
     * Method to generate a fasta file for an individual sequence in a reference
     *
     * @param
     * @param sequenceName target sequence name as a String
     * @param output GATKPath for file output
     * @return output location as a GATKPath
     */
    public static GATKPath generateFastaForSequence(GATKPath refPath, String sequenceName, GATKPath output){
        try(ReferenceDataSource source = ReferenceDataSource.of(refPath.toPath(), true)){
            int sequenceLength = source.getSequenceDictionary().getSequence(sequenceName).getSequenceLength();
            FastaReferenceWriter writer = new FastaReferenceWriterBuilder()
                    .setFastaFile(output.toPath())
                    .setBasesPerLine(80)
                    .build();
            ReferenceSequence seq = source.queryAndPrefetch(new SimpleInterval(sequenceName, 1, sequenceLength));
            writer.addSequence(seq);
            writer.close();
            return output;
        } catch (IOException e) {
            throw new UserException.CouldNotCreateOutputFile("Couldn't create " + output + ", encountered exception: " + e.getMessage(), e);
        }
    }

    @Override
    public void closeTool() {
        for(Map.Entry entry : referenceSources.entrySet()){
            if(!entry.getKey().equals(referenceArguments.getReferenceSpecifier())){
                entry.getValue().close();
            }
        }
    }

    /**
     * TableWriter to format and write the table output.
     */
    public static class CompareReferencesOutputTableWriter extends TableWriter {

        public CompareReferencesOutputTableWriter(final Path table, TableColumnCollection columns) throws IOException {
            super(table, columns);
        }

        public CompareReferencesOutputTableWriter(final Writer writer, TableColumnCollection columns) throws IOException {
            super(writer, columns);
        }

        @Override
        protected void composeLine(final ReferenceSequenceTable.TableRow record, final DataLine dataLine) {
            List columnNames = record.getColumnNames();
            ReferenceSequenceTable.TableEntry[] entries = record.getEntries();
            for(int i = 0; i < columnNames.size(); i++){
                dataLine.set(entries[i].getColumnName(), entries[i].getColumnValue());
            }
        }
    }

    /**
     * Minimal class representing a single SNP in a pair of references.
     * Stores the sequence, position of the SNP, and the allele in each reference.
     */
    private static class SNPRecord{
        String sequence;
        int position;
        String ref1Allele;
        String ref2Allele;
        String ref1;
        String ref2;

        public SNPRecord(String sequence, int position, String ref1Allele, String ref2Allele, String ref1, String ref2){
            this.sequence = sequence;
            this.position = position;
            this.ref1Allele = ref1Allele;
            this.ref2Allele = ref2Allele;
            this.ref1 = ref1;
            this.ref2 = ref2;
        }
    }

    /**
     * TableWriter to format and write SNP table output.
     */
    public static class FindSNPsOnlyTableWriter extends TableWriter {

        public FindSNPsOnlyTableWriter(final Path table, TableColumnCollection columns) throws IOException {
            super(table, columns);
        }

        @Override
        protected void composeLine(final SNPRecord record, final DataLine dataLine) {
            dataLine.set("Sequence Name", record.sequence)
                    .set("Position", record.position)
                    .set(record.ref1, record.ref1Allele)
                    .set(record.ref2, record.ref2Allele);
        }
    }

    /**
     * Minimal class to identify INDELs detected by MUMmer and merge into one VariantContext
     */
    private static class MummerIndel{
        String ref;
        String alt;
        String chr;
        int pos;
        boolean isInsertion;
        boolean isDeletion;

        MummerIndel(String chr, String ref, String alt, int pos, boolean isInsertion, boolean isDeletion){
            this.ref = ref;
            this.alt = alt;
            this.chr = chr;
            this.pos = pos;
            this.isInsertion = isInsertion;
            this.isDeletion = isDeletion;
        }

        public VariantContext getAsVCFRecord(){
            VariantContextBuilder vcfBuilder = new VariantContextBuilder();
            int stopPos = isInsertion ? pos : pos + ref.length()-1;
            VariantContext record = vcfBuilder.chr(chr).start(pos).stop(stopPos).alleles(ref, alt).make();
            return record;
        }


    }
}