package org.broadinstitute.hellbender.tools.copynumber;

import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hdf5.HDF5Library;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.engine.spark.SparkCommandLineProgram;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
import org.broadinstitute.hellbender.tools.copynumber.denoising.GCBiasCorrector;
import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;

import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.stream.Collectors;

/**
 * Creates a panel of normals (PoN) for read-count denoising given the read counts for samples in the panel.
 * The resulting PoN can be used with {@link DenoiseReadCounts} to denoise other samples.
 *
 * <p>
 *     The input read counts are first transformed to log2 fractional coverages and preprocessed
 *     according to specified filtering and imputation parameters.  Singular value decomposition (SVD)
 *     is then performed to find the first {@code number-of-eigensamples} principal components,
 *     which are stored in the PoN.  Some or all of these principal components can then be used for
 *     denoising case samples with {@link DenoiseReadCounts}; it is assumed that the principal components used
 *     represent systematic sequencing biases (rather than statistical noise).  Examining the singular values,
 *     which are also stored in the PoN, may be useful in determining the appropriate number
 *     of principal components to use for denoising.
 * </p>
 *
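 * <p>
 *     For intuition only, the following minimal sketch (a hypothetical {@code denoise} helper, not this
 *     tool's implementation, which additionally standardizes, filters, and imputes the panel) removes the
 *     projection of a case sample onto the first {@code k} right-singular vectors of a panel matrix using
 *     the {@code org.apache.commons.math3.linear} classes:
 * </p>
 * <pre>
 *     //illustrative sketch: panelLog2Coverage is samples x intervals; caseLog2Coverage has one value per interval
 *     static double[] denoise(final RealMatrix panelLog2Coverage, final double[] caseLog2Coverage, final int k) {
 *         final SingularValueDecomposition svd = new SingularValueDecomposition(panelLog2Coverage);
 *         final RealMatrix v = svd.getV();                                              //intervals x rank
 *         final RealMatrix vk = v.getSubMatrix(0, v.getRowDimension() - 1, 0, k - 1);   //first k components
 *         final RealVector x = new ArrayRealVector(caseLog2Coverage);
 *         final RealVector projection = vk.operate(vk.transpose().operate(x));          //V_k V_k^T x
 *         return x.subtract(projection).toArray();                                      //residual = denoised coverage
 *     }
 * </pre>
 *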
 * <p>
 *     If annotated intervals are provided, explicit GC-bias correction will be performed by {@link GCBiasCorrector}
 *     before filtering and SVD.  GC-content information for the intervals will be stored in the PoN
 *     and used to perform explicit GC-bias correction identically in {@link DenoiseReadCounts}.
 *     Note that if annotated intervals are not provided, it is still likely that GC-bias correction is
 *     implicitly performed by the SVD denoising process (i.e., some of the principal components arise from GC bias).
 * </p>
 *
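 * <p>
 *     The exact procedure of {@link GCBiasCorrector} is not reproduced here, but the following simplified
 *     sketch (a hypothetical {@code correctGCBias} helper) illustrates the general idea of median-based
 *     GC-bias correction: bin intervals by GC content, then normalize each interval's coverage by its
 *     bin's median.  It uses {@code java.util.stream.IntStream} and
 *     {@code org.apache.commons.math3.stat.descriptive.rank.Median}:
 * </p>
 * <pre>
 *     //simplified illustration; the tool's actual correction differs in detail
 *     static void correctGCBias(final double[] coverage, final double[] gcContent) {
 *         final int numBins = 101;                       //GC bins of width ~1%
 *         final Median median = new Median();
 *         final double[][] binnedCoverages = new double[numBins][];
 *         for (int b = 0; b < numBins; b++) {
 *             final int bin = b;
 *             binnedCoverages[b] = IntStream.range(0, coverage.length)
 *                     .filter(i -> (int) Math.round(gcContent[i] * (numBins - 1)) == bin)
 *                     .mapToDouble(i -> coverage[i])
 *                     .toArray();
 *         }
 *         for (int i = 0; i < coverage.length; i++) {
 *             final double[] bin = binnedCoverages[(int) Math.round(gcContent[i] * (numBins - 1))];
 *             if (bin.length > 0) {
 *                 coverage[i] /= median.evaluate(bin);   //normalize by the bin median
 *             }
 *         }
 *     }
 * </pre>
 *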
 * <p>
 *     Note that such SVD denoising cannot distinguish between variance due to systematic sequencing biases and that
 *     due to true common germline CNVs present in the panel; signal from the latter may thus be inadvertently denoised
 *     away.  Furthermore, variance arising from coverage on the sex chromosomes may also significantly contribute
 *     to the principal components if the panel contains samples of mixed sex.  Therefore, if sex chromosomes
 *     are not excluded from coverage collection, it is strongly recommended that users avoid creating panels of
 *     mixed sex and take care to denoise case samples only with panels containing only individuals of the same sex
 *     as the case samples.  (See {@link GermlineCNVCaller}, which avoids these issues by simultaneously learning
 *     a probabilistic model for systematic bias and calling rare and common germline CNVs for samples in the panel.)
 * </p>
 *
 * <h3>Inputs</h3>
 *
 * <ul>
 *     <li>
 *         Counts files (TSV or HDF5 output of {@link CollectReadCounts}).
 *     </li>
 *     <li>
 *         (Optional) GC-content annotated-intervals file from {@link AnnotateIntervals}.
 *         Explicit GC-bias correction will be performed on the panel samples and identically
 *         for subsequent case samples.
 *     </li>
 * </ul>
 *
 * <h3>Outputs</h3>
 *
 * <ul>
 *     <li>
 *         Panel-of-normals file.
 *         This is an HDF5 file containing the panel data in the paths defined in {@link HDF5SVDReadCountPanelOfNormals}.
 *         HDF5 files may be viewed using hdfview or loaded in Python using PyTables or h5py.
 *     </li>
 * </ul>
 *
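 * <p>
 *     For example, the singular values stored in the PoN can be inspected programmatically.  Below is a
 *     minimal sketch using the {@code org.broadinstitute.hdf5} API; the HDF5 path shown is illustrative
 *     only, with the authoritative paths given by the constants in {@link HDF5SVDReadCountPanelOfNormals}:
 * </p>
 * <pre>
 *     final HDF5File ponFile = new HDF5File(new File("cnv.pon.hdf5"));
 *     final double[] singularValues = ponFile.readDoubleArray("/panel/singular_values");  //illustrative path
 *     //a sharp drop-off in the singular values suggests how many eigensamples suffice for denoising
 *     java.util.Arrays.stream(singularValues).forEach(System.out::println);
 *     ponFile.close();
 * </pre>
 *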
 * <h3>Usage examples</h3>
 *
 * <pre>
 *     gatk CreateReadCountPanelOfNormals \
 *          -I sample_1.counts.hdf5 \
 *          -I sample_2.counts.hdf5 \
 *          ... \
 *          -O cnv.pon.hdf5
 * </pre>
 *
 * <pre>
 *     gatk CreateReadCountPanelOfNormals \
 *          -I sample_1.counts.hdf5 \
 *          -I sample_2.counts.tsv \
 *          ... \
 *          --annotated-intervals annotated_intervals.tsv \
 *          -O cnv.pon.hdf5
 * </pre>
 *
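 * <p>
 *     A typical follow-up invocation of {@link DenoiseReadCounts} with the resulting PoN might look like
 *     the following (see that tool's documentation for the authoritative argument names):
 * </p>
 * <pre>
 *     gatk DenoiseReadCounts \
 *          -I sample.counts.hdf5 \
 *          --count-panel-of-normals cnv.pon.hdf5 \
 *          --standardized-copy-ratios sample.standardizedCR.tsv \
 *          --denoised-copy-ratios sample.denoisedCR.tsv
 * </pre>
 *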
 * @author Samuel Lee &lt;[email protected]&gt;
 */
@CommandLineProgramProperties(
        summary = "Creates a panel of normals for read-count denoising given the read counts for samples in the panel",
        oneLineSummary = "Creates a panel of normals for read-count denoising",
        programGroup = CopyNumberProgramGroup.class
)
@DocumentedFeature
public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram {
    private static final long serialVersionUID = 1L;

    //default values for filtering
    private static final double DEFAULT_MINIMUM_INTERVAL_MEDIAN_PERCENTILE = 10.0;
    private static final double DEFAULT_MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE = 5.0;
    private static final double DEFAULT_MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE = 5.0;
    private static final double DEFAULT_EXTREME_SAMPLE_MEDIAN_PERCENTILE = 2.5;
    private static final boolean DEFAULT_DO_IMPUTE_ZEROS = true;
    private static final double DEFAULT_EXTREME_OUTLIER_TRUNCATION_PERCENTILE = 0.1;
    private static final int DEFAULT_NUMBER_OF_EIGENSAMPLES = 20;
    private static final int DEFAULT_CHUNK_DIVISOR = 16;
    private static final int DEFAULT_MAXIMUM_CHUNK_SIZE = HDF5Utils.MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX / DEFAULT_CHUNK_DIVISOR;

    //parameter names
    public static final String MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME = "minimum-interval-median-percentile";
    public static final String MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME = "maximum-zeros-in-sample-percentage";
    public static final String MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME = "maximum-zeros-in-interval-percentage";
    public static final String EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME = "extreme-sample-median-percentile";
    public static final String IMPUTE_ZEROS_LONG_NAME = "do-impute-zeros";
    public static final String EXTREME_OUTLIER_TRUNCATION_PERCENTILE_LONG_NAME = "extreme-outlier-truncation-percentile";
    public static final String MAXIMUM_CHUNK_SIZE = "maximum-chunk-size";

    @Argument(
            doc = "Input TSV or HDF5 files containing integer read counts in genomic intervals for all samples in the panel of normals (output of CollectReadCounts). " +
                    "Intervals must be identical and in the same order for all samples.",
            fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
            minElements = 1
    )
    private List<File> inputReadCountFiles = new ArrayList<>();

    @Argument(
            doc = "Input file containing annotations for GC content in genomic intervals (output of AnnotateIntervals). " +
                    "If provided, explicit GC correction will be performed before performing SVD. " +
                    "Intervals must be identical to and in the same order as those in the input read-counts files.",
            fullName = CopyNumberStandardArgument.ANNOTATED_INTERVALS_FILE_LONG_NAME,
            optional = true
    )
    private File inputAnnotatedIntervalsFile = null;

    @Argument(
            doc = "Output file for the panel of normals.",
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
    )
    private File outputPanelOfNormalsFile;

    @Argument(
            doc = "Genomic intervals with a median (across samples) of fractional coverage (optionally corrected for GC bias) " +
                    "less than or equal to this percentile are filtered out. " +
                    "(This is the first filter applied.)",
            fullName = MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME,
            minValue = 0.,
            maxValue = 100.,
            optional = true
    )
    private double minimumIntervalMedianPercentile = DEFAULT_MINIMUM_INTERVAL_MEDIAN_PERCENTILE;

    @Argument(
            doc = "Samples with a fraction of zero-coverage genomic intervals greater than or equal to this percentage are filtered out. " +
                    "(This is the second filter applied.)",
            fullName = MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME,
            minValue = 0.,
            maxValue = 100.,
            optional = true
    )
    private double maximumZerosInSamplePercentage = DEFAULT_MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE;

    @Argument(
            doc = "Genomic intervals with a fraction of zero-coverage samples greater than or equal to this percentage are filtered out. " +
                    "(This is the third filter applied.)",
            fullName = MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME,
            minValue = 0.,
            maxValue = 100.,
            optional = true
    )
    private double maximumZerosInIntervalPercentage = DEFAULT_MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE;

    @Argument(
            doc = "Samples with a median (across genomic intervals) of fractional coverage normalized by genomic-interval medians " +
                    "strictly below this percentile or strictly above the complementary percentile are filtered out. " +
                    "(This is the fourth filter applied.)",
            fullName = EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME,
            minValue = 0.,
            maxValue = 50.,
            optional = true
    )
    private double extremeSampleMedianPercentile = DEFAULT_EXTREME_SAMPLE_MEDIAN_PERCENTILE;

    @Argument(
            doc = "If true, impute zero-coverage values as the median of the non-zero values in the corresponding interval. " +
                    "(This is applied after all filters.)",
            fullName = IMPUTE_ZEROS_LONG_NAME,
            optional = true
    )
    private boolean doImputeZeros = DEFAULT_DO_IMPUTE_ZEROS;

    @Argument(
            doc = "Fractional coverages normalized by genomic-interval medians that are " +
                    "strictly below this percentile or strictly above the complementary percentile are set to the corresponding percentile value. " +
                    "(This is applied after all filters and imputation.)",
            fullName = EXTREME_OUTLIER_TRUNCATION_PERCENTILE_LONG_NAME,
            minValue = 0.,
            maxValue = 50.,
            optional = true
    )
    private double extremeOutlierTruncationPercentile = DEFAULT_EXTREME_OUTLIER_TRUNCATION_PERCENTILE;

    @Argument(
            doc = "Number of eigensamples to use for truncated SVD and to store in the panel of normals. " +
                    "The number of samples retained after filtering will be used instead if it is smaller than this.",
            fullName = CopyNumberStandardArgument.NUMBER_OF_EIGENSAMPLES_LONG_NAME,
            minValue = 0,
            optional = true
    )
    private int numEigensamplesRequested = DEFAULT_NUMBER_OF_EIGENSAMPLES;

    @Advanced
    @Argument(
            doc = "Maximum HDF5 matrix chunk size. Large matrices written to HDF5 are chunked into equally sized " +
                    "subsets of rows (plus a subset containing the remainder, if necessary) to avoid a hard limit in " +
                    "Java HDF5 on the number of elements in a matrix. However, since a single row is not allowed to " +
                    "be split across multiple chunks, the number of columns must be less than the maximum number of " +
                    "values in each chunk. Decreasing this number will reduce heap usage when writing chunks.",
            fullName = MAXIMUM_CHUNK_SIZE,
            minValue = 1,
            maxValue = HDF5Utils.MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX,
            optional = true
    )
    private int maximumChunkSize = DEFAULT_MAXIMUM_CHUNK_SIZE;

    @Override
    protected void runPipeline(final JavaSparkContext ctx) {
        if (!new HDF5Library().load(null)) {  //Note: passing null means using the default temp dir.
            throw new UserException.HardwareFeatureException("Cannot load the required HDF5 library. " +
                    "HDF5 is currently supported on x86-64 architecture and Linux or OSX systems.");
        }

        validateArguments();

        //get sample filenames
        final List<String> sampleFilenames = inputReadCountFiles.stream()
                .map(File::getAbsolutePath)
                .collect(Collectors.toList());

        //get sequence dictionary and intervals from the first read-counts file to use to validate remaining files
        //(this first file is read again below, which is slightly inefficient but is probably not worth the extra code)
        final File firstReadCountFile = inputReadCountFiles.get(0);
        logger.info(String.format("Retrieving intervals from first read-counts file (%s)...", firstReadCountFile));
        final SimpleCountCollection firstReadCounts = SimpleCountCollection.read(firstReadCountFile);
        final SAMSequenceDictionary sequenceDictionary = firstReadCounts.getMetadata().getSequenceDictionary();
        final List<SimpleInterval> intervals = firstReadCounts.getIntervals();
        Utils.validateArg(firstReadCounts.size() <= maximumChunkSize,
                String.format("The number of intervals (%d) in each read-counts file cannot exceed the maximum chunk size (%d).",
                        firstReadCounts.size(), maximumChunkSize));

        //get GC content (null if not provided)
        final AnnotatedIntervalCollection annotatedIntervals = CopyNumberArgumentValidationUtils.validateAnnotatedIntervals(
                inputAnnotatedIntervalsFile, firstReadCounts, logger);
        final double[] intervalGCContent = annotatedIntervals == null
                ? null
                : annotatedIntervals.getRecords().stream()
                        .mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
                        .toArray();

        //validate input read-counts files (i.e., check intervals and that only integer counts are contained)
        //and aggregate as a RealMatrix with dimensions numSamples x numIntervals
        final RealMatrix readCountMatrix = constructReadCountMatrix(logger, inputReadCountFiles, sequenceDictionary, intervals);

        //create the PoN
        logger.info("Creating the panel of normals...");
        HDF5SVDReadCountPanelOfNormals.create(outputPanelOfNormalsFile, getCommandLine(),
                sequenceDictionary, readCountMatrix, sampleFilenames, intervals, intervalGCContent,
                minimumIntervalMedianPercentile, maximumZerosInSamplePercentage, maximumZerosInIntervalPercentage,
                extremeSampleMedianPercentile, doImputeZeros, extremeOutlierTruncationPercentile,
                numEigensamplesRequested, maximumChunkSize, ctx);

        logger.info(String.format("%s complete.", getClass().getSimpleName()));
    }

    private void validateArguments() {
        if (numEigensamplesRequested > inputReadCountFiles.size()) {
            logger.warn(String.format("Number of eigensamples (%d) is greater than the number of input samples (%d); " +
                            "the number of samples retained after filtering will be used instead.",
                    numEigensamplesRequested, inputReadCountFiles.size()));
        }

        Utils.validateArg(inputReadCountFiles.size() == new HashSet<>(inputReadCountFiles).size(),
                "List of input read-counts files cannot contain duplicates.");

        inputReadCountFiles.forEach(CopyNumberArgumentValidationUtils::validateInputs);
        CopyNumberArgumentValidationUtils.validateInputs(inputAnnotatedIntervalsFile);
        CopyNumberArgumentValidationUtils.validateOutputFiles(outputPanelOfNormalsFile);
    }

    private static RealMatrix constructReadCountMatrix(final Logger logger,
                                                       final List<File> inputReadCountFiles,
                                                       final SAMSequenceDictionary sequenceDictionary,
                                                       final List<SimpleInterval> intervals) {
        logger.info("Validating and aggregating input read-counts files...");
        final int numSamples = inputReadCountFiles.size();
        final int numIntervals = intervals.size();
        final RealMatrix readCountMatrix = new Array2DRowRealMatrix(numSamples, numIntervals);
        final ListIterator<File> inputReadCountFilesIterator = inputReadCountFiles.listIterator();
        while (inputReadCountFilesIterator.hasNext()) {
            final int sampleIndex = inputReadCountFilesIterator.nextIndex();
            final File inputReadCountFile = inputReadCountFilesIterator.next();
            logger.info(String.format("Aggregating read-counts file %s (%d / %d)", inputReadCountFile, sampleIndex + 1, numSamples));
            final SimpleCountCollection readCounts = SimpleCountCollection.read(inputReadCountFile);
            if (!CopyNumberArgumentValidationUtils.isSameDictionary(readCounts.getMetadata().getSequenceDictionary(), sequenceDictionary)) {
                logger.warn(String.format("Sequence dictionary for read-counts file %s does not match those in other read-counts files.", inputReadCountFile));
            }
            Utils.validateArg(readCounts.getIntervals().equals(intervals),
                    String.format("Intervals for read-counts file %s do not match those in other read-counts files.", inputReadCountFile));
            readCountMatrix.setRow(sampleIndex, readCounts.getCounts());
        }
        return readCountMatrix;
    }
}



