All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.bqsr.AnalyzeCovariates Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.bqsr;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.recalibration.RecalUtils;
import org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Optional;

/**
 * Evaluate and compare base quality score recalibration tables
 *
 * 

This tool generates plots to assess the quality of a recalibration run as part of the Base Quality Score * Recalibration (BQSR) procedure.

* *

Summary of the BQSR procedure

*

The goal of this procedure is to correct for systematic bias that affects the assignment of base quality scores * by the sequencer. The first pass consists of calculating error empirically and finding patterns in how error varies * with basecall features over all bases. The relevant observations are written to a recalibration table. The second * pass consists of applying numerical corrections to each individual basecall based on the patterns identified in the * first step (recorded in the recalibration table) and writing out the recalibrated data to a new BAM or CRAM file.

* *

Inputs

* *

The tool can take up to three different sets of recalibration tables. * The resulting plots will be overlaid on top of each other to make * comparisons easy.

* * * * * * * * * * * * * *
SetArgumentLabelColorDescription
Original-beforeBEFOREMaroon1First pass recalibration * tables obtained from applying {@link org.broadinstitute.hellbender.transformers.BQSRReadTransformer} * on the original alignment.
Recalibrated-afterAFTERBlueSecond pass recalibration tables * results from the application of {@link org.broadinstitute.hellbender.transformers.BQSRReadTransformer} * on the alignment recalibrated using the first pass tables
Input-bqsrBQSRBlackAny recalibration table without a specific role
*
* *

You need to specify at least one set. Multiple sets need to have the same values for the following parameters:

*

covariate (order is not important), no_standard_covs, run_without_dbsnp, solid_recal_mode, * solid_nocall_strategy, mismatches_context_size, mismatches_default_quality, deletions_default_quality, * insertions_default_quality, maximum_cycle_value, low_quality_tail, default_platform, force_platform, * quantizing_levels and binary_tag_name

* *

Outputs

* *

Currently this tool generates two outputs:

* *
*
-plots my-report.pdf
*
A pdf document that encloses plots to assess the quality of the recalibration
*
-csv my-report.csv
*
A csv file that contains a table with all the data required to generate those plots
*
* *

You need to specify at least one of them.

* *

Usage examples

* *

Plot a single recalibration table

*
 *   gatk AnalyzeCovariates \
 *     -bqsr recal1.table \
 *     -plots AnalyzeCovariates.pdf
 * 
* *

Plot "before" (first pass) and "after" (second pass) recalibration tables to compare them

*
 *   gatk AnalyzeCovariates \
 *     -before recal1.table \
 *     -after recal2.table \
 *     -plots AnalyzeCovariates.pdf
 * 
* *

Plot up to three recalibration tables for comparison

*
 *   gatk AnalyzeCovariates \
 *     -bqsr recal1.table \
 *     -before recal2.table \
 *     -after recal3.table \
 *     -plots AnalyzeCovariates.pdf
 * 
* *

Notes

*
    *
  • Sometimes you may want to compare recalibration tables where the "after" table was actually generated first. To * suppress warnings about the dates of creation of the files, use the `--ignore-last-modification-times` argument.
  • *
  • You can ignore the before/after semantics completely if you like, but all tables must have been generated using * the same parameters.
  • *
* */ @DocumentedFeature @CommandLineProgramProperties( summary = "Evaluate and compare base quality score recalibration (BQSR) tables", oneLineSummary = "Evaluate and compare base quality score recalibration (BQSR) tables", programGroup = DiagnosticsAndQCProgramGroup.class ) public final class AnalyzeCovariates extends CommandLineProgram { private final Logger logger = LogManager.getLogger(AnalyzeCovariates.class); // Constants on option short names that are used in some error/warning messages: static final String CSV_ARG_SHORT_NAME = "csv"; static final String PDF_ARG_SHORT_NAME = "plots"; static final String BEFORE_ARG_SHORT_NAME = "before"; static final String AFTER_ARG_SHORT_NAME = "after"; static final String IGNORE_LMT_LONG_NAME = "ignore-last-modification-times"; /** * File containing the recalibration tables from the first pass. */ @Argument( shortName = BEFORE_ARG_SHORT_NAME, fullName = "before-report-file", doc = "file containing the BQSR first-pass report file", optional = true ) protected File beforeFile = null; /** * File containing the recalibration tables from the second pass. */ @Argument( shortName = AFTER_ARG_SHORT_NAME, fullName = "after-report-file", doc = "file containing the BQSR second-pass report file", optional = true ) protected File afterFile = null; /** * If true, it won't show a warning if the last-modification time of the before and after input files suggest that they have been reversed. */ @Argument( fullName = IGNORE_LMT_LONG_NAME, doc = "do not emit warning messages related to suspicious last modification time order of inputs", optional = true ) protected boolean ignoreLastModificationTime = false; /** * Output report file name. */ @Argument( shortName = PDF_ARG_SHORT_NAME, fullName = "plots-report-file", doc = "location of the output report", optional = true ) protected File pdfFile = null; /** * Output csv file name. */ @Argument( shortName=CSV_ARG_SHORT_NAME, fullName="intermediate-csv-file", doc = "location of the csv intermediate file", optional = true ) protected File csvFile = null; /** * Enables recalibration of base qualities, intended primarily for use with BaseRecalibrator and ApplyBQSR * (see Best Practices workflow documentation). The covariates tables are produced by the BaseRecalibrator tool. * Please be aware that you should only run recalibration with the covariates file created on the same input bam(s). */ @Argument( fullName = StandardArgumentDefinitions.BQSR_TABLE_LONG_NAME, shortName = StandardArgumentDefinitions.BQSR_TABLE_SHORT_NAME, optional=true, doc="Input covariates table file for on-the-fly base quality score recalibration" ) public File BQSR_RECAL_FILE = null; /** * Checks inputs and argument values. *

* Notice that this routine will not validate the content of files. It may have some minor side effects as * the output of warning messages back to the user. * * @throw IllegalStateException there is some required argument value that has not been loaded yet. * @throw UserException if there is some error caused by or under the end user's control. */ private void checkArgumentsValues() { checkInputReportFile("BQSR",BQSR_RECAL_FILE); checkInputReportFile("before",beforeFile); checkInputReportFile("after",afterFile); if (BQSR_RECAL_FILE == null && beforeFile == null && afterFile == null) { throw new UserException("you must provide at least one recalibration report file " + "(arguments -" + StandardArgumentDefinitions.BQSR_TABLE_SHORT_NAME + ", -" + BEFORE_ARG_SHORT_NAME + " or -" + AFTER_ARG_SHORT_NAME); } checkOutputFile(PDF_ARG_SHORT_NAME,pdfFile); checkOutputFile(CSV_ARG_SHORT_NAME, csvFile); checkInputReportFileLMT(beforeFile,afterFile); checkOutputRequested(); } /** * Checks whether the last-modification-time of the inputs is consistent with their relative roles. * * This routine does not thrown an exception but may output a warning message if inconsistencies are spotted. * * @param beforeFile the before report file. * @param afterFile the after report file. */ private void checkInputReportFileLMT(final File beforeFile, final File afterFile) { if (ignoreLastModificationTime || beforeFile == null || afterFile == null) { return; // nothing to do here } else if (beforeFile.lastModified() > afterFile.lastModified()) { Utils.warnUser("Last modification timestamp for 'Before' and 'After'" + "recalibration reports are in the wrong order. Perhaps, have they been swapped?"); } } /** * Checks that at least one output was requested. * * @throw UserException if no output was requested. */ private void checkOutputRequested() { if (pdfFile == null && csvFile == null) { throw new UserException("you need to request at least one output:" + " the intermediate csv file (-" + CSV_ARG_SHORT_NAME + " FILE)" + " or the final plot file (-" + PDF_ARG_SHORT_NAME + " FILE)."); } } /** * Checks the value provided to input file arguments. * * @throw UserException if there is any problem cause by or under the end user's control * * @param name command line argument short name. * @param value the argument value. */ private void checkInputReportFile(final String name,final File value) { if (value == null) { return; } else if (!value.exists()) { throw new CommandLineException.BadArgumentValue(name, "input report '" + value + "' does not exist or is unreachable"); } else if (!value.isFile()) { throw new CommandLineException.BadArgumentValue(name, "input report '" + value + "' is not a regular file"); } else if (!value.canRead()) { throw new CommandLineException.BadArgumentValue(name, "input report '" + value + "' cannot be read"); } } /** * Checks the value provided for output arguments. * * @throw UserException if there is any problem cause by or under the end user's control * * @param name command line argument short name. * @param value the argument value. */ private void checkOutputFile(final String name, final File value) { if (value == null) { return; } if (value.exists() && !value.isFile()) { throw new CommandLineException.BadArgumentValue(name, "the output file location '" + value + "' exists as not a file"); } final File parent = value.getParentFile(); if (parent == null) { return; } if (!parent.exists()) { throw new CommandLineException.BadArgumentValue(name, "the output file parent directory '" + parent + "' does not exists or is unreachable"); } else if (!parent.isDirectory()) { throw new CommandLineException.BadArgumentValue(name, "the output file parent directory '" + parent + "' is not a directory"); } else if (!parent.canWrite()) { throw new CommandLineException.BadArgumentValue(name, "the output file parent directory '" + parent + "' cannot be written"); } } /** * Generates the plots using the external R script. * *

* If plotsFile is null, it does not perform any plotting. * * @param csvFile the intermediary csv file. * @param plotsFile the output plot location. */ private void generatePlots(final File csvFile, final Map reportFiles, final File plotsFile) { if (plotsFile == null) { return; } logger.info("Generating plots file '" + plotsFile + "'"); final File exampleReportFile = reportFiles.values().iterator().next(); RecalUtils.generatePlots(csvFile,exampleReportFile,plotsFile); } @Override public Object doWork() { checkArgumentsValues(); final Map reportFiles = buildReportFileMap(); final Map reports = buildReportMap(reportFiles); checkReportConsistency(reports); final File csvFile = resolveCsvFile(); generateCsvFile(csvFile,reports); final File plotFile = resolvePlotFile(); generatePlots(csvFile, reportFiles, plotFile); return Optional.empty(); } /** * Returns the plot output file * @return might be null if the user has not indicated and output file. */ private File resolvePlotFile() { return pdfFile; } /** * Generates the intermediary Csv file. * * @param csvFile where to write the file. * @param reports the reports to be included. */ private void generateCsvFile(final File csvFile, final Map reports) { try { logger.info("Generating csv file '" + csvFile + "'"); RecalUtils.generateCsv(csvFile, reports); } catch (FileNotFoundException e) { throw new UserException( String.format("There is a problem creating the intermediary Csv file '%s': %s", csvFile, e.getMessage()),e); } } /** * Checks whether multiple input recalibration report files argument values are consistent (equal). * * @param reports map with report to verify. * * @throw UserException if there is any inconsistency. */ private void checkReportConsistency(final Map reports) { @SuppressWarnings({"unchecked", "rawtypes"}) final Map.Entry[] reportEntries = reports.entrySet().toArray((Map.Entry[]) new Map.Entry[reports.size()]); final Map.Entry exampleEntry = reportEntries[0]; for (int i = 1; i < reportEntries.length; i++) { final Map diffs = exampleEntry.getValue().getRAC().compareReportArguments( reportEntries[i].getValue().getRAC(),exampleEntry.getKey(),reportEntries[i].getKey()); if (!diffs.isEmpty()) { throw new UserException.IncompatibleRecalibrationTableParameters("There are differences in relevant arguments of" + " two or more input recalibration reports. Please make sure" + " they have been created using the same recalibration parameters." + " " + String.join("// ", reportDifferencesStringArray(diffs))); } } } /** * Creates a map with all input recalibration files indexed by their "role". *

* The key is the role and the value the corresponding report file. *

* Roles: "Before" (recalibration), "After" (recalibration), "BQSR" (the tool standard argument recalibration file) * * @return never null */ private Map buildReportFileMap() { final Map reports = new LinkedHashMap<>(3); if (BQSR_RECAL_FILE != null) { reports.put("BQSR",BQSR_RECAL_FILE); } if (beforeFile != null) { reports.put("Before",beforeFile); } if (afterFile != null) { reports.put("After",afterFile); } return reports; } /** * Transforms a recalibration file map into a report object map. * * @param reportFileMap the file map to transforms. * @return never null, a new map with the same size as * reportFileMap and the same key set. */ private Map buildReportMap(final Map reportFileMap) { final Map reports = new LinkedHashMap<>(reportFileMap.size()); for (final Map.Entry e : reportFileMap.entrySet()) { reports.put(e.getKey(),new RecalibrationReport(e.getValue())); } return reports; } /** * Generates a flatter String array representation of recalibration argument differences. * @param diffs the differences to represent. * * @return never null, an array of the same length as the size of the input diffs. */ private String[] reportDifferencesStringArray(final Map diffs) { final String[] result = new String[diffs.size()]; int i = 0; for (final Map.Entry e : diffs.entrySet()) { result[i++] = capitalize(e.getKey()) + ": " + e.getValue(); } return result; } /** * Returns the input string capitalizing the first letter. * * @param str the string to capitalize * @return never null. */ private String capitalize(final String str) { if (str.isEmpty()) { return str; } else { return Character.toUpperCase(str.charAt(0)) + str.substring(1); } } /** * Returns the csv file to use. *

* This is the the one specified by the user if any or a temporary file * that will be deleted as soon as the VM exists by default. * * @return never null. */ private File resolveCsvFile() { if (csvFile != null) { return csvFile; } else { return IOUtils.createTempFile("AnalyzeCovariates", ".csv"); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy