Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
package org.broadinstitute.hellbender.utils.recalibration;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.R.RScriptExecutor;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.collections.NestedIntegerArray;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.io.Resource;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.recalibration.covariates.Covariate;
import org.broadinstitute.hellbender.utils.recalibration.covariates.CovariateKeyCache;
import org.broadinstitute.hellbender.utils.recalibration.covariates.ReadCovariates;
import org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList;
import org.broadinstitute.hellbender.utils.report.GATKReport;
import org.broadinstitute.hellbender.utils.report.GATKReportTable;
import java.io.*;
import java.util.*;
/**
* This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
* It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias.
* This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads.
*/
public final class RecalUtils {
public static final String ARGUMENT_REPORT_TABLE_TITLE = "Arguments";
public static final String QUANTIZED_REPORT_TABLE_TITLE = "Quantized";
public static final String READGROUP_REPORT_TABLE_TITLE = "RecalTable0";
public static final String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1";
public static final String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2";
public static final String ARGUMENT_COLUMN_NAME = "Argument";
public static final String ARGUMENT_VALUE_COLUMN_NAME = "Value";
public static final String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore";
public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count";
public static final String READGROUP_COLUMN_NAME = "ReadGroup";
public static final String EVENT_TYPE_COLUMN_NAME = "EventType";
public static final String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality";
public static final String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported";
public static final String QUALITY_SCORE_COLUMN_NAME = "QualityScore";
public static final String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue";
public static final String COVARIATE_NAME_COLUMN_NAME = "CovariateName";
public static final String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations";
public static final String NUMBER_ERRORS_COLUMN_NAME = "Errors";
private static boolean warnUserNullPlatform = false;
private static final String SCRIPT_FILE = "BQSR.R";
public static final int EMPIRICAL_QUAL_DECIMAL_PLACES = 4;
public static final int EMPIRICAL_Q_REPORTED_DECIMAL_PLACES = 4;
public static final int NUMBER_ERRORS_DECIMAL_PLACES = 2;
private static final Pair covariateValue = new MutablePair<>(RecalUtils.COVARIATE_VALUE_COLUMN_NAME, "%s");
private static final Pair covariateName = new MutablePair<>(RecalUtils.COVARIATE_NAME_COLUMN_NAME, "%s");
private static final Pair eventType = new MutablePair<>(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s");
private static final Pair empiricalQuality = new MutablePair<>(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%." + EMPIRICAL_QUAL_DECIMAL_PLACES + 'f');
private static final Pair estimatedQReported = new MutablePair<>(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%." + EMPIRICAL_Q_REPORTED_DECIMAL_PLACES + 'f');
private static final Pair nObservations = new MutablePair<>(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
private static final Pair nErrors = new MutablePair<>(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%." + NUMBER_ERRORS_DECIMAL_PLACES+ 'f');
/**
* Component used to print out csv representation of the reports that can be use to perform analysis in
* external tools. E.g. generate plots using R scripts.
*
* A header is always printed into the output stream (or file) when the printer is created. Then you only need
* to call {@link #print(RecalibrationReport, String) print} for each report you want to include in the csv file.
* Once finished, you close the printer calling {@link #close() close}
*
*/
private static class CsvPrinter {
private final PrintStream ps;
private final StandardCovariateList covariates;
/**
* Constructs a printer redirected to an output file.
* @param out the output file.
* @param covs covariates to print out.
* @throws FileNotFoundException if the file could not be created anew.
*/
protected CsvPrinter(final File out, final StandardCovariateList covs)
throws FileNotFoundException {
this(new FileOutputStream(out), covs);
}
/**
* Constructs a printer redirected to an output stream
* @param os the output.
* @param covs covariates to print out.
*/
protected CsvPrinter(final OutputStream os, final StandardCovariateList covs) {
covariates = covs;
ps = new PrintStream(os);
printHeader();
}
/**
* Prints the header out.
*
* Should only be invoked at creation.
*/
protected void printHeader() {
RecalUtils.printHeader(ps);
}
/**
* Prints out a report into the csv file.
*
*
* @param report the report to print out.
* @param mode the report associated mode. (typically ORIGINAL, RECALIBRATED
*/
public void print(final RecalibrationReport report, final String mode) {
RecalUtils.writeCsv(ps, report.getRecalibrationTables(), mode, covariates, false);
}
/**
* Close the csv printer.
*
* No further output will be allowed or take place after calling this method.
*/
public void close() {
ps.close();
}
}
/**
* Returns a csv output printer.
*
* @param out the output file. It will be overridden
* @param covs list of covariates to print out.
*
* @throws FileNotFoundException if out could not be created anew.
*
* @return never null
*/
protected static CsvPrinter csvPrinter(final File out, final StandardCovariateList covs) throws FileNotFoundException {
Utils.nonNull(covs, "the input covariate array cannot be null");
return new CsvPrinter(out,covs);
}
/**
* Prints out a collection of reports into a file in Csv format in a way
* that can be used by R scripts (such as the plot generator script).
*
* The set of covariates is take as the minimum common set from all reports.
*
* @param out the output file. It will be overridden.
* @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...)
* of each report and the corresponding value the report itself.
* @throws FileNotFoundException if out could not be created anew.
*/
public static void generateCsv(final File out, final Map reports) throws FileNotFoundException {
if (reports.isEmpty()) {
throw new GATKException("no reports");
}
final RecalibrationReport firstReport = reports.values().iterator().next();
final StandardCovariateList covariates = firstReport.getCovariates();
writeCsv(out, reports, covariates);
}
/**
* Print out a collection of reports into a file in Csv format in a way
* that can be used by R scripts (such as the plot generator script).
*
* @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...)
* of each report and the corresponding value the report itself.
* @param covs the covariates to print out.
* @throws FileNotFoundException if out could not be created anew.
*/
private static void writeCsv(final File out, final Map reports, final StandardCovariateList covs)
throws FileNotFoundException {
final CsvPrinter p = csvPrinter(out, covs);
for (final Map.Entry e : reports.entrySet()) {
p.print(e.getValue(),e.getKey());
}
p.close();
}
public static List generateReportTables(final RecalibrationTables recalibrationTables, final StandardCovariateList covariates) {
final List result = new LinkedList<>();
int rowIndex = 0;
GATKReportTable allCovsReportTable = null;
for (NestedIntegerArray table : recalibrationTables){
final ArrayList> columnNames = new ArrayList<>(); // initialize the array to hold the column names
columnNames.add(new MutablePair<>(covariates.getReadGroupCovariate().parseNameForReport(), "%s")); // save the required covariate name so we can reference it in the future
if (!recalibrationTables.isReadGroupTable(table)) {
columnNames.add(new MutablePair<>(covariates.getQualityScoreCovariate().parseNameForReport(), "%d")); // save the required covariate name so we can reference it in the future
if (recalibrationTables.isAdditionalCovariateTable(table)) {
columnNames.add(covariateValue);
columnNames.add(covariateName);
}
}
columnNames.add(eventType); // the order of these column names is important here
columnNames.add(empiricalQuality);
if (recalibrationTables.isReadGroupTable(table)) {
columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported
}
columnNames.add(nObservations);
columnNames.add(nErrors);
final String reportTableName = getReportTableName(recalibrationTables, table);
final GATKReportTable.Sorting sort = GATKReportTable.Sorting.SORT_BY_COLUMN;
final GATKReportTable reportTable;
final boolean addToList;
//XXX this "if" implicitly uses the knowledge about the ordering of tables.
if (!recalibrationTables.isAdditionalCovariateTable(table)) {
reportTable = makeNewTableWithColumns(columnNames, reportTableName, sort);
rowIndex = 0; // reset the row index since we're starting with a new table
addToList = true;
} else if (allCovsReportTable == null && recalibrationTables.isAdditionalCovariateTable(table)) {
reportTable = makeNewTableWithColumns(columnNames, reportTableName, sort);
rowIndex = 0; // reset the row index since we're starting with a new table
allCovsReportTable = reportTable;
addToList = true;
} else {
reportTable = allCovsReportTable;
addToList = false;
}
for (final NestedIntegerArray.Leaf row : table.getAllLeaves()) {
final RecalDatum datum = row.value;
final int[] keys = row.keys;
int columnIndex = 0;
int keyIndex = 0;
reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), covariates.getReadGroupCovariate().formatKey(keys[keyIndex++]));
if (!recalibrationTables.isReadGroupTable(table)) {
reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), covariates.getQualityScoreCovariate().formatKey(keys[keyIndex++]));
if (recalibrationTables.isAdditionalCovariateTable(table)){
final Covariate covariate = recalibrationTables.getCovariateForTable(table);
reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), covariate.formatKey(keys[keyIndex++]));
reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), covariate.parseNameForReport());
}
}
final EventType event = EventType.eventFrom(keys[keyIndex]);
reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), event.toString());
reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), datum.getEmpiricalQuality());
if (recalibrationTables.isReadGroupTable(table)) {
reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table
}
reportTable.set(rowIndex, columnNames.get(columnIndex++).getLeft(), datum.getNumObservations());
reportTable.set(rowIndex, columnNames.get(columnIndex).getLeft(), datum.getNumMismatches());
rowIndex++;
}
if (addToList){
result.add(reportTable); //XXX using a set would be slow because the equals method on GATKReportTable is expensive.
}
}
return result;
}
private static String getReportTableName(RecalibrationTables recalibrationTables, NestedIntegerArray table) {
final String reportTableName;
if (recalibrationTables.isReadGroupTable(table)){
return READGROUP_REPORT_TABLE_TITLE;
} else if (recalibrationTables.isQualityScoreTable(table)){
return QUALITY_SCORE_REPORT_TABLE_TITLE;
} else {
return ALL_COVARIATES_REPORT_TABLE_TITLE;
}
}
private static GATKReportTable makeNewTableWithColumns(ArrayList> columnNames, String reportTableName, GATKReportTable.Sorting sort) {
final GATKReportTable rt = new GATKReportTable(reportTableName, "", columnNames.size(), sort);
for (final Pair columnName : columnNames) {
rt.addColumn(columnName.getLeft(), columnName.getRight());
}
return rt;
}
/**
* Outputs the GATK report to RAC.RECAL_TABLE.
*
* @param RAC The list of shared command line arguments
* @param quantizationInfo Quantization info
* @param recalibrationTables Recalibration tables
* @param covariates The list of requested covariates
*/
public static void outputRecalibrationReport(final PrintStream recalTableStream, final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final StandardCovariateList covariates) {
final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariates.covariateNames()), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, covariates));
report.print(recalTableStream);
}
/**
* Creates a consolidated RecalibrationReport report from the tables.
*
* @param argumentTable Argument table
* @param quantizationTable Quantization Table
* @param recalTables Other recal tables
* @return RecalibrationReport report
*/
public static RecalibrationReport createRecalibrationReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables) {
final GATKReport report = RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationTable, recalTables);
return new RecalibrationReport(report);
}
/**
* Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream).
*
* @param argumentTable Argument table
* @param quantizationInfo Quantization info
* @param recalibrationTables Recalibration tables
* @param covariates The list of covariates
* @return GATK report
*/
public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final StandardCovariateList covariates) {
return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, covariates));
}
/**
* Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream).
*
* @param argumentTable Argument table
* @param quantizationTable Quantization Table
* @param recalTables Other recal tables
* @return GATK report
*/
public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables) {
final GATKReport report = new GATKReport();
report.addTable(argumentTable);
report.addTable(quantizationTable);
report.addTables(recalTables);
return report;
}
/** s
* Write recalibration plots into a file
*
* @param csvFile location of the intermediary file
* @param maybeGzipedExampleReportFile where the report arguments are collected from.
* @param output result plot file name.
*/
public static void generatePlots(final File csvFile, final File maybeGzipedExampleReportFile, final File output) {
final File exampleReportFile = IOUtils.gunzipToTempIfNeeded(maybeGzipedExampleReportFile);
final RScriptExecutor executor = new RScriptExecutor();
executor.addScript(loadBQSRScriptResource());
executor.addArgs(csvFile.getAbsolutePath());
executor.addArgs(exampleReportFile.getAbsolutePath());
executor.addArgs(output.getAbsolutePath());
LogManager.getLogger(RecalUtils.class).debug("R command line: " + executor.getApproximateCommandLine());
executor.exec();
}
private static void writeCsv(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final StandardCovariateList covariates, final boolean printHeader) {
final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, covariates.size());
// add the quality score table to the delta table
final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable();
for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table
final int[] newCovs = new int[4];
newCovs[0] = leaf.keys[0];
newCovs[1] = covariates.size(); // replace the covariate name with an arbitrary (unused) index for QualityScore. This is a HACK.
newCovs[2] = leaf.keys[1];
newCovs[3] = leaf.keys[2];
addToDeltaTable(deltaTable, newCovs, leaf.value); // add this covariate to the delta table
}
// add the optional covariates to the delta table
for(final NestedIntegerArray covTable : recalibrationTables.getAdditionalTables()){
for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) {
final int[] covs = new int[4];
covs[0] = leaf.keys[0];
covs[1] = covariates.indexByClass(recalibrationTables.getCovariateForTable(covTable).getClass()); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS)
covs[2] = leaf.keys[2];
covs[3] = leaf.keys[3];
addToDeltaTable(deltaTable, covs, leaf.value); // add this covariate to the delta table
}
}
// output the csv file
if (printHeader) {
printHeader(deltaTableFile);
}
// print each data line
for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) {
final List