org.broadinstitute.hellbender.utils.recalibration.RecalUtils Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
There is a newer version: 4.6.0.0
package org.broadinstitute.hellbender.utils.recalibration;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.R.RScriptExecutor;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.collections.NestedIntegerArray;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.io.Resource;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.recalibration.covariates.Covariate;
import org.broadinstitute.hellbender.utils.recalibration.covariates.CovariateKeyCache;
import org.broadinstitute.hellbender.utils.recalibration.covariates.ReadCovariates;
import org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList;
import org.broadinstitute.hellbender.utils.report.GATKReport;
import org.broadinstitute.hellbender.utils.report.GATKReportTable;

import java.io.*;
import java.util.*;

/**
 * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
 * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias.
 * This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads.
 */
public final class RecalUtils {
    public static final String ARGUMENT_REPORT_TABLE_TITLE = "Arguments";
    public static final String QUANTIZED_REPORT_TABLE_TITLE = "Quantized";
    public static final String READGROUP_REPORT_TABLE_TITLE = "RecalTable0";
    public static final String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1";
    public static final String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2";

    public static final String ARGUMENT_COLUMN_NAME = "Argument";
    public static final String ARGUMENT_VALUE_COLUMN_NAME = "Value";
    public static final String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore";
    public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count";
    public static final String READGROUP_COLUMN_NAME = "ReadGroup";
    public static final String EVENT_TYPE_COLUMN_NAME = "EventType";
    public static final String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality";
    public static final String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported";
    public static final String QUALITY_SCORE_COLUMN_NAME = "QualityScore";
    public static final String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue";
    public static final String COVARIATE_NAME_COLUMN_NAME = "CovariateName";
    public static final String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations";
    public static final String NUMBER_ERRORS_COLUMN_NAME = "Errors";

    private static boolean warnUserNullPlatform = false;

    private static final String SCRIPT_FILE = "BQSR.R";
    public static final int EMPIRICAL_QUAL_DECIMAL_PLACES = 4;
    public static final int EMPIRICAL_Q_REPORTED_DECIMAL_PLACES = 4;
    public static final int NUMBER_ERRORS_DECIMAL_PLACES = 2;

    private static final Pair covariateValue     = new MutablePair<>(RecalUtils.COVARIATE_VALUE_COLUMN_NAME, "%s");
    private static final Pair covariateName      = new MutablePair<>(RecalUtils.COVARIATE_NAME_COLUMN_NAME, "%s");
    private static final Pair eventType          = new MutablePair<>(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s");
    private static final Pair empiricalQuality   = new MutablePair<>(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%." + EMPIRICAL_QUAL_DECIMAL_PLACES + 'f');
    private static final Pair estimatedQReported = new MutablePair<>(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%." + EMPIRICAL_Q_REPORTED_DECIMAL_PLACES + 'f');
    private static final Pair nObservations      = new MutablePair<>(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
    private static final Pair nErrors            = new MutablePair<>(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%." + NUMBER_ERRORS_DECIMAL_PLACES+ 'f');

    /**
     * Component used to print out csv representation of the reports that can be use to perform analysis in
     * external tools. E.g. generate plots using R scripts.
     * 
     * A header is always printed into the output stream (or file) when the printer is created. Then you only need
     * to call {@link #print(RecalibrationReport, String) print} for each report you want to include in the csv file.
     * Once finished, you close the printer calling {@link #close() close}
     *
     */
    private static class CsvPrinter {

        private final PrintStream ps;
        private final StandardCovariateList covariates;

        /**
         * Constructs a printer redirected to an output file.
         * @param out the output file.
         * @param covs covariates to print out.
         * @throws FileNotFoundException if the file could not be created anew.
         */
        protected CsvPrinter(final File out, final StandardCovariateList covs)
                throws FileNotFoundException {
            this(new FileOutputStream(out), covs);
        }

        /**
         * Constructs a printer redirected to an output stream
         * @param os the output.
         * @param covs  covariates to print out.
         */
        protected CsvPrinter(final OutputStream os, final StandardCovariateList covs) {
            covariates = covs;
            ps = new PrintStream(os);
            printHeader();
        }

        /**
         * Prints the header out.
         * 

         * Should only be invoked at creation.
         */
        protected void printHeader() {
            RecalUtils.printHeader(ps);
        }

        /**
         * Prints out a report into the csv file.
         *
         *
         * @param report the report to print out.
         * @param mode  the report associated mode. (typically ORIGINAL, RECALIBRATED
         */
        public void print(final RecalibrationReport report, final String mode) {
            RecalUtils.writeCsv(ps, report.getRecalibrationTables(), mode, covariates, false);
        }

        /**
         * Close the csv printer.
         *
         * No further output will be allowed or take place after calling this method.
         */
        public void close() {
            ps.close();
        }

    }

    /**
     * Returns a csv output printer.
     *
     * @param out the output file. It will be overridden
     * @param covs list of covariates to print out.
     *
     * @throws FileNotFoundException if out could not be created anew.
     *
     * @return never null
     */
    protected static CsvPrinter csvPrinter(final File out, final StandardCovariateList covs) throws FileNotFoundException {
        Utils.nonNull(covs, "the input covariate array cannot be null");
        return new CsvPrinter(out,covs);
    }

    /**
     * Prints out a collection of reports into a file in Csv format in a way
     * that can be used by R scripts (such as the plot generator script).
     * 

     * The set of covariates is take as the minimum common set from all reports.
     *
     * @param out the output file. It will be overridden.
     * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...)
     *                of each report and the corresponding value the report itself.
     * @throws FileNotFoundException if out could not be created anew.
     */
    public static void generateCsv(final File out, final Map reports) throws FileNotFoundException {
        if (reports.isEmpty()) {
            throw new GATKException("no reports");
        }
        final RecalibrationReport firstReport = reports.values().iterator().next();
        final StandardCovariateList covariates = firstReport.getCovariates();
        writeCsv(out, reports, covariates);
    }

    /**
     * Print out a collection of reports into a file in Csv format in a way
     * that can be used by R scripts (such as the plot generator script).
     *
     * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...)
     *                of each report and the corresponding value the report itself.
     * @param covs the covariates to print out.
     * @throws FileNotFoundException if out could not be created anew.
     */
    private static void writeCsv(final File out, final Map reports, final StandardCovariateList covs)
        throws FileNotFoundException {
        final CsvPrinter p = csvPrinter(out, covs);
        for (final Map.Entry e : reports.entrySet()) {
            p.print(e.getValue(),e.getKey());
        }
        p.close();
    }

    public static List generateReportTables(final RecalibrationTables recalibrationTables, final StandardCovariateList covariates) {
        final List result = new LinkedList<>();
        int rowIndex = 0;

        GATKReportTable allCovsReportTable = null;

        for (NestedIntegerArray table : recalibrationTables){
            final ArrayList> columnNames = new ArrayList<>(); // initialize the array to hold the column names
            columnNames.add(new MutablePair<>(covariates.getReadGroupCovariate().parseNameForReport(), "%s")); // save the required covariate name so we can reference it in the future
            if (!recalibrationTables.isReadGroupTable(table)) {
                columnNames.add(new MutablePair<>(covariates.getQualityScoreCovariate().parseNameForReport(), "%d")); // save the required covariate name so we can reference it in the future
                if (recalibrationTables.isAdditionalCovariateTable(table)) {
                    columnNames.add(covariateValue);
                    columnNames.add(covariateName);
                }
            }

            columnNames.add(eventType); // the order of these column names is important here
            columnNames.add(empiricalQuality);
            if (recalibrationTables.isReadGroupTable(table)) {
                columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported
            }
            columnNames.add(nObservations);
            columnNames.add(nErrors);

            final String reportTableName = getReportTableName(recalibrationTables, table);

            final GATKReportTable.Sorting sort = GATKReportTable.Sorting.SORT_BY_COLUMN;

            final GATKReportTable reportTable;
            final boolean addToList;

            //XXX this "if" implicitly uses the knowledge about the ordering of tables.
            if (!recalibrationTables.isAdditionalCovariateTable(table)) {
                reportTable = makeNewTableWithColumns(columnNames, reportTableName, sort);
                rowIndex = 0; // reset the row index since we're starting with a new table
                addToList = true;
            } else if (allCovsReportTable == null && recalibrationTables.isAdditionalCovariateTable(table)) {
                reportTable = makeNewTableWithColumns(columnNames, reportTableName, sort);
                rowIndex = 0; // reset the row index since we're starting with a new table
                allCovsReportTable =  reportTable;
                addToList = true;
            } else {
                reportTable = allCovsReportTable;
                addToList = false;
            }

            for (final NestedIntegerArray.Leaf row : table.getAllLeaves()) {
                final RecalDatum datum = row.value;
                final int[] keys = row.keys;

                int columnIndex = 0;
                int keyIndex = 0;
                reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), covariates.getReadGroupCovariate().formatKey(keys[keyIndex++]));
                if (!recalibrationTables.isReadGroupTable(table)) {
                    reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), covariates.getQualityScoreCovariate().formatKey(keys[keyIndex++]));
                    if (recalibrationTables.isAdditionalCovariateTable(table)){
                        final Covariate covariate = recalibrationTables.getCovariateForTable(table);
                        reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), covariate.formatKey(keys[keyIndex++]));
                        reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), covariate.parseNameForReport());
                    }
                }

                final EventType event = EventType.eventFrom(keys[keyIndex]);
                reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), event.toString());

                reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), datum.getEmpiricalQuality());
                if (recalibrationTables.isReadGroupTable(table)) {
                    reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table
                }
                reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), datum.getNumObservations());
                reportTable.set(rowIndex, columnNames.get(columnIndex).getLeft(), datum.getNumMismatches());

                rowIndex++;
            }
            if (addToList){
                result.add(reportTable);      //XXX using a set would be slow because the equals method on GATKReportTable is expensive.
            }
        }

        return result;
    }

    private static String getReportTableName(RecalibrationTables recalibrationTables, NestedIntegerArray table) {
        final String reportTableName;
        if (recalibrationTables.isReadGroupTable(table)){
            return READGROUP_REPORT_TABLE_TITLE;
        } else if (recalibrationTables.isQualityScoreTable(table)){
            return QUALITY_SCORE_REPORT_TABLE_TITLE;
        } else {
            return ALL_COVARIATES_REPORT_TABLE_TITLE;
        }
    }

    private static GATKReportTable makeNewTableWithColumns(ArrayList> columnNames, String reportTableName, GATKReportTable.Sorting sort) {
        final GATKReportTable rt = new GATKReportTable(reportTableName, "", columnNames.size(), sort);
        for (final Pair columnName : columnNames) {
            rt.addColumn(columnName.getLeft(), columnName.getRight());
        }
        return rt;
    }

    /**
     * Outputs the GATK report to RAC.RECAL_TABLE.
     *
     * @param RAC The list of shared command line arguments
     * @param quantizationInfo Quantization info
     * @param recalibrationTables Recalibration tables
     * @param covariates The list of requested covariates
     */
    public static void outputRecalibrationReport(final PrintStream recalTableStream, final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final StandardCovariateList covariates) {
        final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariates.covariateNames()), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, covariates));
        report.print(recalTableStream);
    }

    /**
     * Creates a consolidated RecalibrationReport report from the tables.
     *
     * @param argumentTable Argument table
     * @param quantizationTable Quantization Table
     * @param recalTables Other recal tables
     * @return RecalibrationReport report
     */
    public static RecalibrationReport createRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables) {
        final GATKReport report = RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationTable, recalTables);
        return new RecalibrationReport(report);
    }

    /**
     * Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream).
     *
     * @param argumentTable Argument table
     * @param quantizationInfo Quantization info
     * @param recalibrationTables Recalibration tables
     * @param covariates The list of covariates
     * @return GATK report
     */
    public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final StandardCovariateList covariates) {
        return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, covariates));
    }

    /**
     * Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream).
     *
     * @param argumentTable Argument table
     * @param quantizationTable Quantization Table
     * @param recalTables Other recal tables
     * @return GATK report
     */
    public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables) {
        final GATKReport report = new GATKReport();
        report.addTable(argumentTable);
        report.addTable(quantizationTable);
        report.addTables(recalTables);
        return report;
    }

    /**                                                s
     * Write recalibration plots into a file
     *
     * @param csvFile location of the intermediary file
     * @param maybeGzipedExampleReportFile where the report arguments are collected from.
     * @param output result plot file name.
     */
    public static void generatePlots(final File csvFile, final File maybeGzipedExampleReportFile, final File output) {
        final File exampleReportFile = IOUtils.gunzipToTempIfNeeded(maybeGzipedExampleReportFile);
        final RScriptExecutor executor = new RScriptExecutor();
        executor.addScript(loadBQSRScriptResource());
        executor.addArgs(csvFile.getAbsolutePath());
        executor.addArgs(exampleReportFile.getAbsolutePath());
        executor.addArgs(output.getAbsolutePath());
        LogManager.getLogger(RecalUtils.class).debug("R command line: " + executor.getApproximateCommandLine());
        executor.exec();
    }

    private static void writeCsv(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final StandardCovariateList covariates, final boolean printHeader) {

        final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, covariates.size());

        // add the quality score table to the delta table
        final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable();
        for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table
            final int[] newCovs = new int[4];
            newCovs[0] = leaf.keys[0];
            newCovs[1] = covariates.size(); // replace the covariate name with an arbitrary (unused) index for QualityScore. This is a HACK.
            newCovs[2] = leaf.keys[1];
            newCovs[3] = leaf.keys[2];
            addToDeltaTable(deltaTable, newCovs, leaf.value); // add this covariate to the delta table
        }

        // add the optional covariates to the delta table
        for(final NestedIntegerArray covTable : recalibrationTables.getAdditionalTables()){
            for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) {
                final int[] covs = new int[4];
                covs[0] = leaf.keys[0];
                covs[1] = covariates.indexByClass(recalibrationTables.getCovariateForTable(covTable).getClass()); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS)
                covs[2] = leaf.keys[2];
                covs[3] = leaf.keys[3];
                addToDeltaTable(deltaTable, covs, leaf.value); // add this covariate to the delta table
            }
        }

        // output the csv file
        if (printHeader) {
            printHeader(deltaTableFile);
        }

        // print each data line
        for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) {
            final List