org.broadinstitute.hellbender.tools.reference.CompareReferences Maven / Gradle / Ivy
package org.broadinstitute.hellbender.tools.reference;
import htsjdk.samtools.reference.FastaReferenceWriter;
import htsjdk.samtools.reference.FastaReferenceWriterBuilder;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.FilterFuncotations;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.alignment.MummerExecutor;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.text.XReadLines;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
import org.broadinstitute.hellbender.utils.tsv.TableWriter;
import picard.cmdline.programgroups.ReferenceProgramGroup;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
* Display reference comparison as a tab-delimited table and summarize reference differences.
*
* This tool generates a MD5-keyed table comparing specified references and does an analysis to summarize the differences
* between the references provided. Comparisons are made against a "special" reference, specified with the -R
* argument. Subsequent references to be compared may be specified using the -references-to-compare argument.
* The table can be directed to a file or standard output using provided command-line arguments.
* A supplementary table keyed by sequence name can be displayed using the -display-sequences-by-name argument;
* to display only sequence names for which the references are not consistent, run with the -display-only-differing-sequences
* argument as well.
*
* Input
*
* The references and MD5 calculation mode.
*
*
* Output
*
* A TSV file or output stream displaying the MD5-keyed table comparison, and a table analysis displayed to standard output.
*
* MD5 Length hg19mini.fasta hg19mini_1renamed.fasta
* 8c0c38e352d8f3309eabe4845456f274 16000 1 chr1
* 5f8388fe3fb34aa38375ae6cf5e45b89 16000 2 2
* 94de808a3a2203dbb02434a47bd8184f 16000 3 3
* 7d397ee919e379328d8f52c57a54c778 16000 4 4
*
* REFERENCE PAIR: hg19mini.fasta, hg19mini_1renamed.fasta
* Status:
* DIFFER_IN_SEQUENCE_NAMES
*
*
*
* Usage example
*
* gatk CompareReferences \
* -R reference1.fasta \
* -refcomp reference2.fasta \
* -O output.fasta \
* -md5-calculation-mode USE_DICT
*
*
*/
@CommandLineProgramProperties(
summary = "Compare multiple references and output a tab-delimited table detailing what the differences are and a summarized analysis of each pair of references.",
oneLineSummary = "Display reference comparison as a tab-delimited table and summarize reference differences.",
programGroup = ReferenceProgramGroup.class
)
@DocumentedFeature
@ExperimentalFeature
public class CompareReferences extends GATKTool {
@Argument(fullName = "references-to-compare", shortName = "refcomp", doc = "Reference sequence file(s) to compare.")
private List references;
/**
* Output file will be written here.
*
* Note: If no output file provided, table will print to standard output.
*/
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "If specified, output reference sequence table in TSV format to this file. Otherwise print it to stdout.", optional = true)
private GATKPath output;
@Argument(fullName = "md5-calculation-mode", shortName = "md5-calculation-mode", doc = "MD5CalculationMode indicating method of MD5 calculation.", optional = true)
private MD5CalculationMode md5CalculationMode = MD5CalculationMode.RECALCULATE_IF_MISSING;
@Argument(fullName = "display-sequences-by-name", doc = "If provided, the table by sequence name will be printed.", optional = true)
private boolean displaySequencesByName = false;
@Argument(fullName = "display-only-differing-sequences", doc = "If provided, only display sequence names that differ in their actual sequence.", optional = true)
private boolean onlyDisplayDifferingSequences = false;
@Argument(fullName = "base-comparison", doc = "Mode for base-level comparisons. Default is off, but can do full alignment of mismatching sequences to find SNPs and INDELs (FULL_ALIGNMENT), or detect SNPs only in mismatching sequences of the same length (FIND_SNPS_ONLY).", optional = true)
private BaseComparisonMode baseComparisonMode = BaseComparisonMode.NO_BASE_COMPARISON;
@Argument(fullName = "base-comparison-output", doc = "Output directory for base comparison outputs. Required for running base-comparison in FULL_ALIGNMENT or FIND_SNPS_ONLY mode.", optional = true)
private GATKPath baseComparisonOutputDirectory;
public enum MD5CalculationMode {
// use only MD5s found in dictionary; if MD5 missing, crashes
USE_DICT,
// use any MD5s found in dictionary and recalculate any missing MD5s
RECALCULATE_IF_MISSING,
// recalculate all MD5s, regardless of presence in dictionary
ALWAYS_RECALCULATE;
}
public enum BaseComparisonMode{
// no base comparison
NO_BASE_COMPARISON,
// run the mummer pipeline to generate a VCF file containing SNPs and INDELs for any mismatching sequences of same name
FULL_ALIGNMENT,
// do a base-by-base comparison of any same-name, same-length mismatching sequences to output a table containing each base mismatch
FIND_SNPS_ONLY
}
private Map referenceSources;
@Override
public boolean requiresReference() {
return true;
}
@Override
public void onTraversalStart() {
// add data source for -R reference
referenceSources = new LinkedHashMap<>();
referenceSources.put(referenceArguments.getReferenceSpecifier(), directlyAccessEngineReferenceDataSource());
// add data sources for remaining references
for(GATKPath path : references){
referenceSources.put(path, ReferenceDataSource.of(path.toPath()));
}
// must have exactly 2 references to run either base comparison mode
if(baseComparisonMode != BaseComparisonMode.NO_BASE_COMPARISON){
if(baseComparisonOutputDirectory == null) {
throw new UserException.CouldNotCreateOutputFile(baseComparisonOutputDirectory, "Output directory not provided but required in -base-comparison " + baseComparisonMode + " mode.");
}
if(!Files.exists(baseComparisonOutputDirectory.toPath())){
throw new UserException.CouldNotCreateOutputFile(baseComparisonOutputDirectory, "Output directory non-existent.");
}
if(referenceSources.size() != 2) {
throw new UserException.BadInput("Base comparison modes can only be run on 2 references.");
}
}
}
@Override
public void traverse(){
ReferenceSequenceTable table = new ReferenceSequenceTable(referenceSources, md5CalculationMode);
logger.info("Building reference sequence table.");
table.build();
logger.info("Finished building table.");
writeTable(table);
if(displaySequencesByName){
writeTableBySequenceName(table);
}
logger.info("Analyzing table.");
List referencePairs = table.compareAllReferences();
logger.info("Finished analyzing table.");
System.out.println("*********************************************************");
for(ReferencePair pair : referencePairs){
System.out.println(pair);
}
ReferencePair refPair = referencePairs.get(0);
switch (baseComparisonMode) {
case FULL_ALIGNMENT:
runFullAlignment(refPair, table);
break;
case FIND_SNPS_ONLY:
runFindSNPS(refPair, table);
break;
}
}
/**
* Given a table, write table to output.
*
* Note: if no output file specified, displays to standard output.
*
* @param table
*/
private void writeTable(ReferenceSequenceTable table){
TableColumnCollection columns = new TableColumnCollection(table.getColumnNames());
try(CompareReferences.CompareReferencesOutputTableWriter writer = output == null
? new CompareReferences.CompareReferencesOutputTableWriter(new OutputStreamWriter(System.out), columns)
: new CompareReferences.CompareReferencesOutputTableWriter(output.toPath(), columns)
){
writer.writeHeaderIfApplies();
for(ReferenceSequenceTable.TableRow row : table){
writer.writeRecord(row);
}
}
catch(IOException exception){
throw (output == null)
? new UserException.CouldNotCreateOutputFile("System.out", "Failed to write output table.", exception)
: new UserException.CouldNotCreateOutputFile(output, "Failed to write output table.", exception);
}
}
/**
* Given a table, write table by sequence name to standard output
*
* @param table
*/
public void writeTableBySequenceName(ReferenceSequenceTable table){
System.out.print("*********************************************************\n");
System.out.print("Name \tMD5 \tReference\n");
for(String sequenceName : table.getAllSequenceNames()){
Set rows = table.queryBySequenceName(sequenceName);
if(onlyDisplayDifferingSequences) {
if(rows.size() > 1) {
displayBySequenceName(rows, sequenceName);
}
} else {
displayBySequenceName(rows, sequenceName);
}
}
}
private void displayBySequenceName(Set rows, String sequenceName){
System.out.print(sequenceName);
for(ReferenceSequenceTable.TableRow row : rows) {
ReferenceSequenceTable.TableEntry[] entries = row.getEntries();
System.out.print("\n\t" + row.getMd5() + "\t");
for(int i = 2; i < entries.length; i++) {
if(entries[i].getColumnValue().equals(sequenceName)) {
System.out.print(entries[i].getColumnName() + "\t");
}
}
}
System.out.println();
}
/**
* Method to execute a full alignment of 2 references. Detects same-named mismatching sequences and extracts
* the sequence in each reference into temp fasta files. Creates a MummerExecutor to run the MUMmer alignment
* pipeline on mismatching sequences, which produces a .snps file with SNPs and INDELs for each sequence.
* Merges individual sequence .snps files, and then converts to a VCF containing all alignment
* information for all mismatching sequences in the references.
*
* @param refPair pair of references to be aligned
* @param table
*/
private void runFullAlignment(ReferencePair refPair, ReferenceSequenceTable table){
List snpsFiles = new ArrayList<>();
if(refPair.getAnalysis().contains(ReferencePair.Status.DIFFER_IN_SEQUENCE)) {
// find the mismatch sequence
for (String sequenceName : table.getAllSequenceNames()) {
Set rows = table.queryBySequenceName(sequenceName);
// rows.size()==2 indicates that 2 MD5s found with the same name
if (rows.size() == 2) {
// generate fasta files for mismatching sequences
GATKPath ref1Path = refPair.getRef1();
GATKPath ref2Path = refPair.getRef2();
String sequenceInRef1Name = ref1Path.toPath().getFileName() + "." + sequenceName;
String sequenceInRef2Name = ref2Path.toPath().getFileName() + "." + sequenceName;
File ref1TempFastaOutput = IOUtils.createTempFile(sequenceInRef1Name, ".fasta");
File ref2TempFastaOutput = IOUtils.createTempFile(sequenceInRef2Name, ".fasta");
GATKPath ref1Fasta = generateFastaForSequence(ref1Path, sequenceName, new GATKPath(ref1TempFastaOutput.toString()));
GATKPath ref2Fasta = generateFastaForSequence(ref2Path, sequenceName, new GATKPath(ref2TempFastaOutput.toString()));
// pass fastas into mummer, get back a vcf and add to list
MummerExecutor executor = new MummerExecutor();
logger.info("Running mummer alignment on sequence " + sequenceName);
File tempSnpsDirectory = IOUtils.createTempDir("tempsnps");
File mummerOutput = executor.executeMummer(ref1Fasta.toPath().toFile(), ref2Fasta.toPath().toFile(), tempSnpsDirectory);
logger.info("Finished running mummer alignment on sequence " + sequenceName);
snpsFiles.add(mummerOutput);
}
}
// merge individual snps files
File mergedSnpsFile = mergeSnpsFiles(refPair, snpsFiles);
// convert merged snps file to VCF
File vcf = convertSnpsFileToVCF(refPair, mergedSnpsFile);
}
else{
logger.info("No mismatching sequences found.");
}
}
private File mergeSnpsFiles(ReferencePair refPair, List snpsFiles){
File mergedSnpsFile = IOUtils.createTempFile(String.format("%s_%s", refPair.getRef1AsString(), refPair.getRef2AsString()), ".snps");
try (PrintWriter writer = new PrintWriter(mergedSnpsFile)) {
for (File file : snpsFiles) {
try (XReadLines reader = new XReadLines(file)) {
for (String line : reader) {
writer.write(line + "\n");
}
} catch (IOException e) {
throw new UserException("Error merging show-snps outputs.", e);
}
}
return mergedSnpsFile;
}
catch(IOException e){
throw new UserException("Error writing show-snps output file.", e);
}
}
private File convertSnpsFileToVCF(ReferencePair refPair, File mergedSnpsFile){
File vcf = new File(baseComparisonOutputDirectory.toPath().toString(), String.format("%s_%s.vcf", refPair.getRef1AsString(), refPair.getRef2AsString()));
try(XReadLines reader = new XReadLines(mergedSnpsFile);
VariantContextWriter writer = createVCFWriter(vcf);
ReferenceDataSource source = ReferenceDataSource.of(refPair.getRef1().toPath())) {
VCFHeader header = new VCFHeader();
header.setSequenceDictionary(getReferenceDictionary());
writer.writeHeader(header);
MummerIndel currentIndel = null;
int previousPos = -1;
for (String line : reader) {
String[] fields = line.split("\\t", -1);
String contig = fields[12];
int pos = Integer.valueOf(fields[0]);
String ref = fields[1];
String alt = fields[2];
// insertion
if (ref.equals(".") && !alt.equals(".")) {
if (pos == previousPos && currentIndel != null && currentIndel.isInsertion) {
currentIndel.alt += alt;
} else {
if (currentIndel != null) {
writer.add(currentIndel.getAsVCFRecord());
}
String refPadding = new String(source.queryAndPrefetch(new SimpleInterval(contig, pos-1, pos-1)).getBases());
currentIndel = new MummerIndel(contig, refPadding, refPadding + alt, pos-1, true, false);
}
}
// deletion
else if(!ref.equals(".") && alt.equals(".")){
if(pos == previousPos+1 && currentIndel != null && currentIndel.isDeletion){
currentIndel.ref += ref;
}
else {
if(currentIndel != null) {
writer.add(currentIndel.getAsVCFRecord());
}
String refPadding = new String(source.queryAndPrefetch(new SimpleInterval(contig, pos-1, pos-1)).getBases());
currentIndel = new MummerIndel(contig, refPadding + ref, refPadding, pos-1, false, true);
}
}
// snp
else {
if(currentIndel != null){
writer.add(currentIndel.getAsVCFRecord());
currentIndel = null;
}
VariantContextBuilder vcfBuilder = new VariantContextBuilder();
VariantContext record = vcfBuilder.chr(contig).start(pos).stop(pos).alleles(ref, alt).make();
writer.add(record);
}
previousPos = pos;
}
if(currentIndel != null){
writer.add(currentIndel.getAsVCFRecord());
}
return vcf;
}
catch(IOException e){
throw new UserException("Error converting snps file to VCF.", e);
}
catch(NumberFormatException e){
throw new UserException("Expected integer position, but found non-integer value.", e);
}
}
/**
* Method to do base-by-base comparison of a pair of references. Detects mismatching sequence
*
* @param refPair pair of references to be aligned
* @param table
*/
private void runFindSNPS(ReferencePair refPair, ReferenceSequenceTable table) {
if(refPair.getAnalysis().contains(ReferencePair.Status.DIFFER_IN_SEQUENCE)) {
TableColumnCollection columns = new TableColumnCollection(Arrays.asList("Sequence Name", "Position", refPair.getRef1AsString(), refPair.getRef2AsString()));
File snpsOutput = new File(baseComparisonOutputDirectory.toPath().toString(), String.format("%s_%s_snps.tsv", refPair.getRef1AsString(), refPair.getRef2AsString()));
try(FindSNPsOnlyTableWriter writer = new FindSNPsOnlyTableWriter(snpsOutput.toPath(), columns); final ReferenceDataSource source1 = ReferenceDataSource.of(refPair.getRef1().toPath(), true);
final ReferenceDataSource source2 = ReferenceDataSource.of(refPair.getRef2().toPath(), true)){
writer.writeHeaderIfApplies();
// find the mismatch sequence
for (String sequenceName : table.getAllSequenceNames()) {
Set rows = table.queryBySequenceName(sequenceName);
// rows.size()==2 indicates that 2 MD5s found with the same name
if (rows.size() == 2) {
// if the lengths of the 2 sequences aren't equal, error - can't compare different sequence lengths, probably indel need alignment
ReferenceSequenceTable.TableRow[] rowArray = rows.toArray(new ReferenceSequenceTable.TableRow[0]);
if (rowArray[0].getLength() != rowArray[1].getLength()) {
logger.warn("Lengths for sequence " + sequenceName + " are not equal and can't be compared. Consider running in FULL_ALIGNMENT mode.");
continue;
}
int sequenceLength = rowArray[0].getLength();
Iterator ref1BaseIterator = source1.query(new SimpleInterval(sequenceName, 1, sequenceLength));
Iterator ref2BaseIterator = source2.query(new SimpleInterval(sequenceName, 1, sequenceLength));
int position = 0;
// find SNPS and generate SNPRecords
while (ref1BaseIterator.hasNext() && ref2BaseIterator.hasNext()) {
position++;
Byte ref1Allele = ref1BaseIterator.next();
Byte ref2Allele = ref2BaseIterator.next();
if (!ref1Allele.equals(ref2Allele)) {
SNPRecord record = new SNPRecord(sequenceName, position, new String(new byte[]{ref1Allele}), new String(new byte[]{ref2Allele}), refPair.getRef1AsString(), refPair.getRef2AsString());
writer.writeRecord(record);
}
}
if(ref1BaseIterator.hasNext() || ref2BaseIterator.hasNext()){
throw new GATKException.ShouldNeverReachHereException("");
}
}
}
}
catch(IOException exception){
throw new UserException.CouldNotCreateOutputFile(baseComparisonOutputDirectory + "/" + snpsOutput, "Failed to write output table.", exception);
}
}
else{
logger.info("No mismatching sequences found.");
}
}
/**
* Method to generate a fasta file for an individual sequence in a reference
*
* @param
* @param sequenceName target sequence name as a String
* @param output GATKPath for file output
* @return output location as a GATKPath
*/
public static GATKPath generateFastaForSequence(GATKPath refPath, String sequenceName, GATKPath output){
try(ReferenceDataSource source = ReferenceDataSource.of(refPath.toPath(), true)){
int sequenceLength = source.getSequenceDictionary().getSequence(sequenceName).getSequenceLength();
FastaReferenceWriter writer = new FastaReferenceWriterBuilder()
.setFastaFile(output.toPath())
.setBasesPerLine(80)
.build();
ReferenceSequence seq = source.queryAndPrefetch(new SimpleInterval(sequenceName, 1, sequenceLength));
writer.addSequence(seq);
writer.close();
return output;
} catch (IOException e) {
throw new UserException.CouldNotCreateOutputFile("Couldn't create " + output + ", encountered exception: " + e.getMessage(), e);
}
}
@Override
public void closeTool() {
for(Map.Entry entry : referenceSources.entrySet()){
if(!entry.getKey().equals(referenceArguments.getReferenceSpecifier())){
entry.getValue().close();
}
}
}
/**
* TableWriter to format and write the table output.
*/
public static class CompareReferencesOutputTableWriter extends TableWriter {
public CompareReferencesOutputTableWriter(final Path table, TableColumnCollection columns) throws IOException {
super(table, columns);
}
public CompareReferencesOutputTableWriter(final Writer writer, TableColumnCollection columns) throws IOException {
super(writer, columns);
}
@Override
protected void composeLine(final ReferenceSequenceTable.TableRow record, final DataLine dataLine) {
List columnNames = record.getColumnNames();
ReferenceSequenceTable.TableEntry[] entries = record.getEntries();
for(int i = 0; i < columnNames.size(); i++){
dataLine.set(entries[i].getColumnName(), entries[i].getColumnValue());
}
}
}
/**
* Minimal class representing a single SNP in a pair of references.
* Stores the sequence, position of the SNP, and the allele in each reference.
*/
private static class SNPRecord{
String sequence;
int position;
String ref1Allele;
String ref2Allele;
String ref1;
String ref2;
public SNPRecord(String sequence, int position, String ref1Allele, String ref2Allele, String ref1, String ref2){
this.sequence = sequence;
this.position = position;
this.ref1Allele = ref1Allele;
this.ref2Allele = ref2Allele;
this.ref1 = ref1;
this.ref2 = ref2;
}
}
/**
* TableWriter to format and write SNP table output.
*/
public static class FindSNPsOnlyTableWriter extends TableWriter {
public FindSNPsOnlyTableWriter(final Path table, TableColumnCollection columns) throws IOException {
super(table, columns);
}
@Override
protected void composeLine(final SNPRecord record, final DataLine dataLine) {
dataLine.set("Sequence Name", record.sequence)
.set("Position", record.position)
.set(record.ref1, record.ref1Allele)
.set(record.ref2, record.ref2Allele);
}
}
/**
* Minimal class to identify INDELs detected by MUMmer and merge into one VariantContext
*/
private static class MummerIndel{
String ref;
String alt;
String chr;
int pos;
boolean isInsertion;
boolean isDeletion;
MummerIndel(String chr, String ref, String alt, int pos, boolean isInsertion, boolean isDeletion){
this.ref = ref;
this.alt = alt;
this.chr = chr;
this.pos = pos;
this.isInsertion = isInsertion;
this.isDeletion = isDeletion;
}
public VariantContext getAsVCFRecord(){
VariantContextBuilder vcfBuilder = new VariantContextBuilder();
int stopPos = isInsertion ? pos : pos + ref.length()-1;
VariantContext record = vcfBuilder.chr(chr).start(pos).stop(stopPos).alleles(ref, alt).make();
return record;
}
}
}