 
                        
        
                        
        org.broadinstitute.hellbender.tools.walkers.bqsr.AnalyzeCovariates Maven / Gradle / Ivy
package org.broadinstitute.hellbender.tools.walkers.bqsr;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.recalibration.RecalUtils;
import org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Optional;
/**
 * Evaluate and compare base quality score recalibration tables
 *
 * This tool generates plots to assess the quality of a recalibration run as part of the Base Quality Score
 * Recalibration (BQSR) procedure. 
 *
 * Summary of the BQSR procedure
 * The goal of this procedure is to correct for systematic bias that affects the assignment of base quality scores
 * by the sequencer. The first pass consists of calculating error empirically and finding patterns in how error varies
 * with basecall features over all bases. The relevant observations are written to a recalibration table. The second
 * pass consists of applying numerical corrections to each individual basecall based on the patterns identified in the
 * first step (recorded in the recalibration table) and writing out the recalibrated data to a new BAM or CRAM file.
 *
 * Inputs
 *
 * The tool can take up to three different sets of recalibration tables.
 * The resulting plots will be overlaid on top of each other to make
 * comparisons easy.
 *
 * 
 *     
 *       Set Argument Label Color Description Original -before BEFORE Maroon1 
 *         First pass recalibration
 *             tables obtained from applying {@link org.broadinstitute.hellbender.transformers.BQSRReadTransformer}
 *             on the original alignment. Recalibrated -after AFTER Blue 
 *         Second pass recalibration tables
 *             results from the application of {@link org.broadinstitute.hellbender.transformers.BQSRReadTransformer}
 *             on the alignment recalibrated using the first pass tables Input -bqsr BQSR Black 
 *           Any recalibration table without a specific role 
 * 
 *
 * You need to specify at least one set. Multiple sets need to have the same values for the following parameters:
 * covariate (order is not important), no_standard_covs, run_without_dbsnp, solid_recal_mode,
 * solid_nocall_strategy, mismatches_context_size, mismatches_default_quality, deletions_default_quality,
 * insertions_default_quality, maximum_cycle_value, low_quality_tail, default_platform, force_platform,
 * quantizing_levels and binary_tag_name
 *
 * Outputs
 *
 * Currently this tool generates two outputs:
 *
 * 
 *   - -plots my-report.pdf*
- A pdf document that encloses plots to assess the quality of the recalibration*
- -csv my-report.csv*
- A csv file that contains a table with all the data required to generate those plots*
*
 *You need to specify at least one of them.
 *
 * Usage examples
 *
 * Plot a single recalibration table
 * 
 *   gatk AnalyzeCovariates \
 *     -bqsr recal1.table \
 *     -plots AnalyzeCovariates.pdf
 * 
 *
 * Plot "before" (first pass) and "after" (second pass) recalibration tables to compare them
 * 
 *   gatk AnalyzeCovariates \
 *     -before recal1.table \
 *     -after recal2.table \
 *     -plots AnalyzeCovariates.pdf
 * 
 *
 * Plot up to three recalibration tables for comparison
 * 
 *   gatk AnalyzeCovariates \
 *     -bqsr recal1.table \
 *     -before recal2.table \
 *     -after recal3.table \
 *     -plots AnalyzeCovariates.pdf
 * 
 *
 * Notes
 * 
 *     - Sometimes you may want to compare recalibration tables where the "after" table was actually generated first. To
 * suppress warnings about the dates of creation of the files, use the `--ignore-last-modification-times` argument.*
- You can ignore the before/after semantics completely if you like, but all tables must have been generated using
 * the same parameters.*
*
 */
@DocumentedFeature
@CommandLineProgramProperties(
        summary = "Evaluate and compare base quality score recalibration (BQSR) tables",
        oneLineSummary = "Evaluate and compare base quality score recalibration (BQSR) tables",
        programGroup = DiagnosticsAndQCProgramGroup.class
)
public final class AnalyzeCovariates extends CommandLineProgram {
    private final Logger logger = LogManager.getLogger(AnalyzeCovariates.class);
    // Constants on option short names that are used in some error/warning messages:
    static final String CSV_ARG_SHORT_NAME = "csv";
    static final String PDF_ARG_SHORT_NAME = "plots";
    static final String BEFORE_ARG_SHORT_NAME = "before";
    static final String AFTER_ARG_SHORT_NAME = "after";
    static final String IGNORE_LMT_LONG_NAME = "ignore-last-modification-times";
    /**
     * File containing the recalibration tables from the first pass.
     */
    @Argument(
            shortName = BEFORE_ARG_SHORT_NAME,
            fullName = "before-report-file",
            doc = "file containing the BQSR first-pass report file",
            optional = true
    )
    protected File beforeFile = null;
    /**
     * File containing the recalibration tables from the second pass.
     */
    @Argument(
            shortName = AFTER_ARG_SHORT_NAME,
            fullName = "after-report-file",
            doc = "file containing the BQSR second-pass report file",
            optional = true
    )
    protected File afterFile = null;
    /**
     * If true, it won't show a warning if the last-modification time of the before and after input files suggest that they have been reversed.
     */
    @Argument(
            fullName = IGNORE_LMT_LONG_NAME,
            doc = "do not emit warning messages related to suspicious last modification time order of inputs",
            optional = true
    )
    protected boolean ignoreLastModificationTime = false;
    /**
     * Output report file name.
     */
    @Argument(
            shortName = PDF_ARG_SHORT_NAME,
            fullName = "plots-report-file",
            doc = "location of the output report",
            optional = true
    )
    protected File pdfFile = null;
    /**
     * Output csv file name.
     */
    @Argument(
            shortName=CSV_ARG_SHORT_NAME,
            fullName="intermediate-csv-file",
            doc = "location of the csv intermediate file",
            optional = true
    )
    protected File csvFile = null;
    /**
     * Enables recalibration of base qualities, intended primarily for use with BaseRecalibrator and ApplyBQSR
     * (see Best Practices workflow documentation). The covariates tables are produced by the BaseRecalibrator tool.
     * Please be aware that you should only run recalibration with the covariates file created on the same input bam(s).
     */
    @Argument(
            fullName = StandardArgumentDefinitions.BQSR_TABLE_LONG_NAME,
            shortName = StandardArgumentDefinitions.BQSR_TABLE_SHORT_NAME,
            optional=true,
            doc="Input covariates table file for on-the-fly base quality score recalibration"
    )
    public File BQSR_RECAL_FILE = null;
    /**
     * Checks inputs and argument values.
     * 
     * Notice that this routine will not validate the content of files. It may have some minor side effects as
     * the output of warning messages back to the user.
     *
     * @throw IllegalStateException there is some required argument value that has not been loaded yet.
     * @throw UserException if there is some error caused by or under the end user's control.
     */
    private void checkArgumentsValues() {
        checkInputReportFile("BQSR",BQSR_RECAL_FILE);
        checkInputReportFile("before",beforeFile);
        checkInputReportFile("after",afterFile);
        if (BQSR_RECAL_FILE == null && beforeFile == null && afterFile == null) {
            throw new UserException("you must provide at least one recalibration report file "
                    + "(arguments -" + StandardArgumentDefinitions.BQSR_TABLE_SHORT_NAME +
                    ", -" + BEFORE_ARG_SHORT_NAME + " or -" + AFTER_ARG_SHORT_NAME);
        }
        checkOutputFile(PDF_ARG_SHORT_NAME,pdfFile);
        checkOutputFile(CSV_ARG_SHORT_NAME, csvFile);
        checkInputReportFileLMT(beforeFile,afterFile);
        checkOutputRequested();
    }
    /**
     * Checks whether the last-modification-time of the inputs is consistent with their relative roles.
     *
     * This routine does not thrown an exception but may output a warning message if inconsistencies are spotted.
     *
     * @param beforeFile the before report file.
     * @param afterFile  the after report file.
     */
    private void checkInputReportFileLMT(final File beforeFile, final File afterFile) {
        if (ignoreLastModificationTime  || beforeFile == null || afterFile == null) {
            return; // nothing to do here
        } else if (beforeFile.lastModified() > afterFile.lastModified()) {
            Utils.warnUser("Last modification timestamp for 'Before' and 'After'"
                    + "recalibration reports are in the wrong order. Perhaps, have they been swapped?");
        }
    }
    /**
     * Checks that at least one output was requested.
     *
     * @throw UserException if no output was requested.
     */
    private void checkOutputRequested() {
        if (pdfFile == null && csvFile == null) {
            throw new UserException("you need to request at least one output:"
                    + " the intermediate csv file (-" + CSV_ARG_SHORT_NAME + " FILE)"
                    + " or the final plot file (-" + PDF_ARG_SHORT_NAME + " FILE).");
        }
    }
    /**
     * Checks the value provided to input file arguments.
     *
     * @throw UserException if there is any problem cause by or under the end user's control
     *
     * @param name command line argument short name.
     * @param value the argument value.
     */
    private void checkInputReportFile(final String name,final File value) {
        if (value == null) {
            return;
        } else if (!value.exists()) {
            throw new CommandLineException.BadArgumentValue(name, "input report '" +
                    value + "' does not exist or is unreachable");
        } else if (!value.isFile()) {
            throw new CommandLineException.BadArgumentValue(name, "input report '" +
                    value + "' is not a regular file");
        } else if (!value.canRead()) {
            throw new CommandLineException.BadArgumentValue(name, "input report '" +
                    value + "' cannot be read");
        }
    }
    /**
     * Checks the value provided for output arguments.
     *
     * @throw UserException if there is any problem cause by or under the end user's control
     *
     * @param name command line argument short name.
     * @param value the argument value.
     */
    private void checkOutputFile(final String name, final File value) {
        if (value == null) {
            return;
        }
        if (value.exists() && !value.isFile()) {
            throw new CommandLineException.BadArgumentValue(name, "the output file location '"
                    + value + "' exists as not a file");
        }
        final File parent = value.getParentFile();
        if (parent == null) {
            return;
        }
        if (!parent.exists()) {
            throw new CommandLineException.BadArgumentValue(name, "the output file parent directory '"
                    + parent + "' does not exists or is unreachable");
        } else if (!parent.isDirectory()) {
            throw new CommandLineException.BadArgumentValue(name, "the output file parent directory '"
                    + parent + "' is not a directory");
        } else if (!parent.canWrite()) {
            throw new CommandLineException.BadArgumentValue(name, "the output file parent directory '"
                    + parent + "' cannot be written");
        }
    }
    /**
     * Generates the plots using the external R script.
     *
     * 
     * IfplotsFile is null, it does not perform any plotting.
     *
     * @param csvFile the intermediary csv file.
     * @param plotsFile the output plot location.
     */
    private void generatePlots(final File csvFile, final Map reportFiles, final File plotsFile) {
        if (plotsFile == null) {
            return;
        }
        logger.info("Generating plots file '" + plotsFile + "'");
        final File exampleReportFile = reportFiles.values().iterator().next();
        RecalUtils.generatePlots(csvFile,exampleReportFile,plotsFile);
    }
    @Override
    public Object doWork() {
        checkArgumentsValues();
        final Map reportFiles = buildReportFileMap();
        final Map reports = buildReportMap(reportFiles);
        checkReportConsistency(reports);
        final File csvFile = resolveCsvFile();
        generateCsvFile(csvFile,reports);
        final File plotFile = resolvePlotFile();
        generatePlots(csvFile, reportFiles, plotFile);
        return Optional.empty();
    }
    /**
     * Returns the plot output file
     * @return might be null if the user has not indicated and output file.
     */
    private File resolvePlotFile() {
        return pdfFile;
    }
    /**
     * Generates the intermediary Csv file.
     *
     * @param csvFile where to write the file.
     * @param reports the reports to be included.
     */
    private void generateCsvFile(final File csvFile, final Map reports) {
        try {
            logger.info("Generating csv file '" + csvFile + "'");
            RecalUtils.generateCsv(csvFile, reports);
        } catch (FileNotFoundException e) {
            throw new UserException(
                    String.format("There is a problem creating the intermediary Csv file '%s': %s",
                            csvFile, e.getMessage()),e);
        }
    }
    /**
     * Checks whether multiple input recalibration report files argument values are consistent (equal).
     *
     * @param reports map with report to verify.
     *
     * @throw UserException if there is any inconsistency.
     */
    private void checkReportConsistency(final Map reports) {
        @SuppressWarnings({"unchecked", "rawtypes"})
        final Map.Entry[] reportEntries =
                reports.entrySet().toArray((Map.Entry[]) new Map.Entry[reports.size()]);
        final Map.Entry exampleEntry = reportEntries[0];
        for (int i = 1; i < reportEntries.length; i++) {
            final Map diffs = exampleEntry.getValue().getRAC().compareReportArguments(
                    reportEntries[i].getValue().getRAC(),exampleEntry.getKey(),reportEntries[i].getKey());
            if (!diffs.isEmpty()) {
                throw new UserException.IncompatibleRecalibrationTableParameters("There are differences in relevant arguments of"
                        + " two or more input recalibration reports. Please make sure"
                        + " they have been created using the same recalibration parameters."
                        + " " + String.join("// ", reportDifferencesStringArray(diffs)));
            }
        }
    }
    /**
     * Creates a map with all input recalibration files indexed by their "role".
     * 
     * The key is the role and the value the corresponding report file.
     * 
     * Roles: "Before" (recalibration), "After" (recalibration), "BQSR" (the tool standard argument recalibration file)
     *
     * @return never null
     */
    private Map buildReportFileMap() {
        final Map reports = new LinkedHashMap<>(3);
        if (BQSR_RECAL_FILE != null) {
            reports.put("BQSR",BQSR_RECAL_FILE);
        }
        if (beforeFile != null) {
            reports.put("Before",beforeFile);
        }
        if (afterFile != null) {
            reports.put("After",afterFile);
        }
        return reports;
    }
    /**
     * Transforms a recalibration file map into a report object map.
     *
     * @param reportFileMap the file map to transforms.
     * @return never null, a new map with the same size as
     *  reportFileMap and the same key set.
     */
    private Map buildReportMap(final Map reportFileMap) {
        final Map reports = new LinkedHashMap<>(reportFileMap.size());
        for (final Map.Entry e : reportFileMap.entrySet()) {
            reports.put(e.getKey(),new RecalibrationReport(e.getValue()));
        }
        return reports;
    }
    /**
     * Generates a flatter String array representation of recalibration argument differences.
     * @param diffs the differences to represent.
     *
     * @return never null, an array of the same length as the size of the input diffs.
     */
    private String[] reportDifferencesStringArray(final Map diffs) {
        final String[] result = new String[diffs.size()];
        int i = 0;
        for (final Map.Entry e : diffs.entrySet()) {
            result[i++] = capitalize(e.getKey()) + ": " + e.getValue();
        }
        return result;
    }
    /**
     * Returns the input string capitalizing the first letter.
     *
     * @param str the string to capitalize
     * @return never null.
     */
    private String capitalize(final String str) {
        if (str.isEmpty()) {
            return str;
        } else {
            return Character.toUpperCase(str.charAt(0)) + str.substring(1);
        }
    }
    /**
     * Returns the csv file to use.
     * 
     * This is the the one specified by the user if any or a temporary file
     * that will be deleted as soon as the VM exists by default.
     *
     * @return never null.
     */
    private File resolveCsvFile() {
        if (csvFile != null)  {
            return csvFile;
        } else {
            return IOUtils.createTempFile("AnalyzeCovariates", ".csv");
        }
    }
}