All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.copynumber.DenoiseReadCounts Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.copynumber;

import org.apache.commons.math3.linear.RealMatrix;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hdf5.HDF5File;
import org.broadinstitute.hdf5.HDF5Library;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
import org.broadinstitute.hellbender.tools.copynumber.denoising.GCBiasCorrector;
import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
import org.broadinstitute.hellbender.tools.copynumber.denoising.SVDDenoisedCopyRatioResult;
import org.broadinstitute.hellbender.tools.copynumber.denoising.SVDDenoisingUtils;
import org.broadinstitute.hellbender.tools.copynumber.denoising.SVDReadCountPanelOfNormals;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;

import java.io.File;

/**
 * Denoises read counts to produce denoised copy ratios.
 *
 * 

* Typically, a panel of normals produced by {@link CreateReadCountPanelOfNormals} is provided as input. * The input counts are then standardized by 1) transforming to fractional coverage, * 2) performing optional explicit GC-bias correction (if the panel contains GC-content annotated intervals), * 3) filtering intervals to those contained in the panel, 4) dividing by interval medians contained in the panel, * 5) dividing by the sample median, and 6) transforming to log2 copy ratio. The result is then denoised by * subtracting the projection onto the specified number of principal components from the panel. *

* *

* If no panel is provided, then the input counts are instead standardized by 1) transforming to fractional coverage, * 2) performing optional explicit GC-bias correction (if GC-content annotated intervals are provided), * 3) dividing by the sample median, and 4) transforming to log2 copy ratio. No denoising is performed, * so the denoised result is simply taken to be identical to the standardized result. *

* *

* If performed, explicit GC-bias correction is done by {@link GCBiasCorrector}. *

* *

* Note that {@code number-of-eigensamples} principal components from the input panel will be used for * denoising; if only fewer are available in the panel, then they will all be used. This parameter can * thus be used to control the amount of denoising, which will ultimately affect the sensitivity of the analysis. *

* *

* See comments for {@link CreateReadCountPanelOfNormals} regarding coverage on sex chromosomes. If sex * chromosomes are not excluded from coverage collection, it is strongly recommended that case samples are * denoised only with panels containing only individuals of the same sex as the case samples. *

* *

Inputs

* *
    *
  • * Counts TSV or HDF5 file from {@link CollectReadCounts}. *
  • *
  • * (Optional) Panel-of-normals from {@link CreateReadCountPanelOfNormals}. * If provided, it will be used to standardize and denoise the input counts. This may include * explicit GC-bias correction if annotated intervals were used to create the panel. *
  • *
  • * (Optional) GC-content annotated-intervals from {@link AnnotateIntervals}. * This can be provided in place of a panel of normals to perform explicit GC-bias correction. *
  • *
* *

Outputs

* *
    *
  • * Standardized-copy-ratios file. * This is a tab-separated values (TSV) file with a SAM-style header containing a read group sample name, a sequence dictionary, * a row specifying the column headers contained in {@link CopyRatioCollection.CopyRatioTableColumn}, * and the corresponding entry rows. *
  • *
  • * Denoised-copy-ratios file. * This is a tab-separated values (TSV) file with a SAM-style header containing a read group sample name, a sequence dictionary, * a row specifying the column headers contained in {@link CopyRatioCollection.CopyRatioTableColumn}, * and the corresponding entry rows. *
  • *
* *

Usage examples

* *
 *     gatk DenoiseReadCounts \
 *          -I sample.counts.hdf5 \
 *          --count-panel-of-normals panel_of_normals.pon.hdf5 \
 *          --standardized-copy-ratios sample.standardizedCR.tsv \
 *          --denoised-copy-ratios sample.denoisedCR.tsv
 * 
* *
 *     gatk DenoiseReadCounts \
 *          -I sample.counts.hdf5 \
 *          --annotated-intervals annotated_intervals.tsv \
 *          --standardized-copy-ratios sample.standardizedCR.tsv \
 *          --denoised-copy-ratios sample.denoisedCR.tsv
 * 
* *
 *     gatk DenoiseReadCounts \
 *          -I sample.counts.hdf5 \
 *          --standardized-copy-ratios sample.standardizedCR.tsv \
 *          --denoised-copy-ratios sample.denoisedCR.tsv
 * 
* * @author Samuel Lee <[email protected]> */ @CommandLineProgramProperties( summary = "Denoises read counts to produce denoised copy ratios", oneLineSummary = "Denoises read counts to produce denoised copy ratios", programGroup = CopyNumberProgramGroup.class ) @DocumentedFeature public final class DenoiseReadCounts extends CommandLineProgram { @Argument( doc = "Input TSV or HDF5 file containing integer read counts in genomic intervals for a single case sample (output of CollectReadCounts).", fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME ) private File inputReadCountFile; @Argument( doc = "Input HDF5 file containing the panel of normals (output of CreateReadCountPanelOfNormals).", fullName = CopyNumberStandardArgument.COUNT_PANEL_OF_NORMALS_FILE_LONG_NAME, optional = true ) private File inputPanelOfNormalsFile = null; @Argument( doc = "Input file containing annotations for GC content in genomic intervals (output of AnnotateIntervals). " + "Intervals must be identical to and in the same order as those in the input read-counts file. " + "If a panel of normals is provided, this input will be ignored.", fullName = CopyNumberStandardArgument.ANNOTATED_INTERVALS_FILE_LONG_NAME, optional = true ) private File inputAnnotatedIntervalsFile = null; @Argument( doc = "Output file for standardized copy ratios. GC-bias correction will be performed if annotations for GC content are provided.", fullName = CopyNumberStandardArgument.STANDARDIZED_COPY_RATIOS_FILE_LONG_NAME ) private File outputStandardizedCopyRatiosFile; @Argument( doc = "Output file for denoised copy ratios.", fullName = CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_LONG_NAME ) private File outputDenoisedCopyRatiosFile; @Argument( doc = "Number of eigensamples to use for denoising. " + "If not specified or if the number of eigensamples available in the panel of normals " + "is smaller than this, all eigensamples will be used.", fullName = CopyNumberStandardArgument.NUMBER_OF_EIGENSAMPLES_LONG_NAME, minValue = 0, optional = true ) private Integer numEigensamplesRequested = null; @Override protected Object doWork() { if (!new HDF5Library().load(null)) { //Note: passing null means using the default temp dir. throw new UserException.HardwareFeatureException("Cannot load the required HDF5 library. " + "HDF5 is currently supported on x86-64 architecture and Linux or OSX systems."); } validateArguments(); logger.info(String.format("Reading read-counts file (%s)...", inputReadCountFile)); final SimpleCountCollection readCounts = SimpleCountCollection.read(inputReadCountFile); if (inputPanelOfNormalsFile != null) { //denoise using panel of normals try (final HDF5File hdf5PanelOfNormalsFile = new HDF5File(inputPanelOfNormalsFile)) { //HDF5File implements AutoCloseable final SVDReadCountPanelOfNormals panelOfNormals = HDF5SVDReadCountPanelOfNormals.read(hdf5PanelOfNormalsFile); if (inputAnnotatedIntervalsFile != null) { logger.warn("Panel of normals was provided; ignoring input GC-content annotations..."); } //perform denoising and write result final int numEigensamples = numEigensamplesRequested == null ? panelOfNormals.getNumEigensamples() : Math.min(panelOfNormals.getNumEigensamples(), this.numEigensamplesRequested); if (numEigensamplesRequested != null && numEigensamples < numEigensamplesRequested) { logger.warn(String.format("%d eigensamples were requested but only %d are available in the panel of normals...", numEigensamplesRequested, numEigensamples)); } final SVDDenoisedCopyRatioResult denoisedCopyRatioResult = panelOfNormals.denoise(readCounts, numEigensamples); writeResult(denoisedCopyRatioResult); } } else { //standardize and perform optional GC-bias correction //get GC content (null if not provided) final AnnotatedIntervalCollection annotatedIntervals = CopyNumberArgumentValidationUtils.validateAnnotatedIntervals( inputAnnotatedIntervalsFile, readCounts, logger); final double[] intervalGCContent = annotatedIntervals == null ? null : annotatedIntervals.getRecords().stream() .mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT)) .toArray(); if (intervalGCContent == null) { logger.warn("Neither a panel of normals nor GC-content annotations were provided, so only standardization will be performed..."); } final RealMatrix standardizedCopyRatioValues = SVDDenoisingUtils.preprocessAndStandardizeSample(readCounts.getCounts(), intervalGCContent); //construct a result with denoised result identical to standardized result final SVDDenoisedCopyRatioResult standardizedResult = new SVDDenoisedCopyRatioResult( readCounts.getMetadata(), readCounts.getIntervals(), standardizedCopyRatioValues, standardizedCopyRatioValues); writeResult(standardizedResult); } logger.info(String.format("%s complete.", getClass().getSimpleName())); return null; } private void validateArguments() { CopyNumberArgumentValidationUtils.validateInputs( inputReadCountFile, inputPanelOfNormalsFile, inputAnnotatedIntervalsFile); CopyNumberArgumentValidationUtils.validateOutputFiles( outputStandardizedCopyRatiosFile, outputStandardizedCopyRatiosFile); } private void writeResult(final SVDDenoisedCopyRatioResult result) { logger.info(String.format("Writing standardized and denoised copy ratios to %s and %s...", outputStandardizedCopyRatiosFile.getAbsolutePath(), outputDenoisedCopyRatiosFile.getAbsolutePath())); result.write(outputStandardizedCopyRatiosFile, outputDenoisedCopyRatiosFile); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy