package org.broadinstitute.hellbender.tools.copynumber;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hdf5.HDF5Library;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.engine.spark.SparkCommandLineProgram;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
import org.broadinstitute.hellbender.tools.copynumber.denoising.GCBiasCorrector;
import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.stream.Collectors;
/**
* Creates a panel of normals (PoN) for read-count denoising given the read counts for samples in the panel.
* The resulting PoN can be used with {@link DenoiseReadCounts} to denoise other samples.
 *
* The input read counts are first transformed to log2 fractional coverages and preprocessed
* according to specified filtering and imputation parameters. Singular value decomposition (SVD)
* is then performed to find the first {@code number-of-eigensamples} principal components,
* which are stored in the PoN. Some or all of these principal components can then be used for
* denoising case samples with {@link DenoiseReadCounts}; it is assumed that the principal components used
* represent systematic sequencing biases (rather than statistical noise). Examining the singular values,
* which are also stored in the PoN, may be useful in determining the appropriate number
* of principal components to use for denoising.
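 *
 * For intuition, the following toy sketch computes a truncated SVD of a small preprocessed matrix with
 * Apache Commons Math. It illustrates the technique only; the tool's actual implementation lives in
 * {@link HDF5SVDReadCountPanelOfNormals} and may differ in detail.
 *
 * <pre>{@code
 * import org.apache.commons.math3.linear.Array2DRowRealMatrix;
 * import org.apache.commons.math3.linear.RealMatrix;
 * import org.apache.commons.math3.linear.SingularValueDecomposition;
 *
 * // toy matrix of standardized log2 fractional coverages: 4 intervals x 3 samples
 * RealMatrix standardized = new Array2DRowRealMatrix(new double[][]{
 *         {0.10, -0.20, 0.05}, {0.00, 0.30, -0.10}, {-0.10, 0.10, 0.20}, {0.05, -0.05, 0.00}});
 * SingularValueDecomposition svd = new SingularValueDecomposition(standardized);
 * double[] singularValues = svd.getSingularValues();   // inspect these to choose k
 * int k = 2;                                           // number of eigensamples to retain
 * RealMatrix eigensamples = svd.getU().getSubMatrix(   // first k left-singular vectors
 *         0, standardized.getRowDimension() - 1, 0, k - 1);
 * }</pre>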
 *
* If annotated intervals are provided, explicit GC-bias correction will be performed by {@link GCBiasCorrector}
* before filtering and SVD. GC-content information for the intervals will be stored in the PoN
* and used to perform explicit GC-bias correction identically in {@link DenoiseReadCounts}.
* Note that if annotated intervals are not provided, it is still likely that GC-bias correction is
* implicitly performed by the SVD denoising process (i.e., some of the principal components arise from GC bias).
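 *
 * The sketch below conveys the general idea behind explicit GC-bias correction (normalizing each
 * interval's coverage by a typical coverage at its GC content). It is a simplified stand-in for
 * illustration and is not the actual logic of {@link GCBiasCorrector}.
 *
 * <pre>{@code
 * // (uses java.util.Map, java.util.HashMap, and java.util.DoubleSummaryStatistics)
 * // toy data: per-interval GC fractions and one sample's fractional coverages
 * double[] gcContent = {0.35, 0.42, 0.55, 0.41};
 * double[] coverage  = {0.90, 1.10, 1.40, 1.05};
 * // bin intervals by GC (1% bins), then divide each coverage by the mean coverage of its bin
 * Map<Integer, DoubleSummaryStatistics> binStats = new HashMap<>();
 * for (int i = 0; i < gcContent.length; i++) {
 *     binStats.computeIfAbsent((int) Math.round(gcContent[i] * 100), b -> new DoubleSummaryStatistics())
 *             .accept(coverage[i]);
 * }
 * double[] corrected = new double[coverage.length];
 * for (int i = 0; i < coverage.length; i++) {
 *     corrected[i] = coverage[i] / binStats.get((int) Math.round(gcContent[i] * 100)).getAverage();
 * }
 * }</pre>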
 *
* Note that such SVD denoising cannot distinguish between variance due to systematic sequencing biases and that
* due to true common germline CNVs present in the panel; signal from the latter may thus be inadvertently denoised
* away. Furthermore, variance arising from coverage on the sex chromosomes may also significantly contribute
* to the principal components if the panel contains samples of mixed sex. Therefore, if sex chromosomes
* are not excluded from coverage collection, it is strongly recommended that users avoid creating panels of
* mixed sex and take care to denoise case samples only with panels containing only individuals of the same sex
* as the case samples. (See {@link GermlineCNVCaller}, which avoids these issues by simultaneously learning
* a probabilistic model for systematic bias and calling rare and common germline CNVs for samples in the panel.)
 *
 * <h3>Inputs</h3>
 *
 * <ul>
 *     <li>
 *         Counts files (TSV or HDF5 output of {@link CollectReadCounts}).
 *     </li>
 *     <li>
 *         (Optional) GC-content annotated-intervals file from {@link AnnotateIntervals}.
 *         Explicit GC-bias correction will be performed on the panel samples and identically for subsequent case samples.
 *     </li>
 * </ul>
 *
 * <h3>Outputs</h3>
 *
 * <ul>
 *     <li>
 *         Panel-of-normals file.
 *         This is an HDF5 file containing the panel data in the paths defined in {@link HDF5SVDReadCountPanelOfNormals}.
 *         HDF5 files may be viewed using hdfview
 *         or loaded in Python using PyTables or h5py.
 *     </li>
 * </ul>
 *
 * <h3>Usage examples</h3>
 *
 * <pre>
 *     gatk CreateReadCountPanelOfNormals \
 *          -I sample_1.counts.hdf5 \
 *          -I sample_2.counts.hdf5 \
 *          ... \
 *          -O cnv.pon.hdf5
 * </pre>
 *
 * <pre>
 *     gatk CreateReadCountPanelOfNormals \
 *          -I sample_1.counts.hdf5 \
 *          -I sample_2.counts.tsv \
 *          ... \
 *          --annotated-intervals annotated_intervals.tsv \
 *          -O cnv.pon.hdf5
 * </pre>
 *
* @author Samuel Lee <[email protected]>
*/
@CommandLineProgramProperties(
summary = "Creates a panel of normals for read-count denoising given the read counts for samples in the panel",
oneLineSummary = "Creates a panel of normals for read-count denoising",
programGroup = CopyNumberProgramGroup.class
)
@DocumentedFeature
public final class CreateReadCountPanelOfNormals extends SparkCommandLineProgram {
private static final long serialVersionUID = 1L;
//default values for filtering
private static final double DEFAULT_MINIMUM_INTERVAL_MEDIAN_PERCENTILE = 10.0;
private static final double DEFAULT_MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE = 5.0;
private static final double DEFAULT_MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE = 5.0;
private static final double DEFAULT_EXTREME_SAMPLE_MEDIAN_PERCENTILE = 2.5;
private static final boolean DEFAULT_DO_IMPUTE_ZEROS = true;
private static final double DEFAULT_EXTREME_OUTLIER_TRUNCATION_PERCENTILE = 0.1;
private static final int DEFAULT_NUMBER_OF_EIGENSAMPLES = 20;
private static final int DEFAULT_CHUNK_DIVISOR = 16;
private static final int DEFAULT_MAXIMUM_CHUNK_SIZE = HDF5Utils.MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX / DEFAULT_CHUNK_DIVISOR;
//parameter names
public static final String MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME = "minimum-interval-median-percentile";
public static final String MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME = "maximum-zeros-in-sample-percentage";
public static final String MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME = "maximum-zeros-in-interval-percentage";
public static final String EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME = "extreme-sample-median-percentile";
public static final String IMPUTE_ZEROS_LONG_NAME = "do-impute-zeros";
public static final String EXTREME_OUTLIER_TRUNCATION_PERCENTILE_LONG_NAME = "extreme-outlier-truncation-percentile";
public static final String MAXIMUM_CHUNK_SIZE = "maximum-chunk-size";
@Argument(
doc = "Input TSV or HDF5 files containing integer read counts in genomic intervals for all samples in the panel of normals (output of CollectReadCounts). " +
"Intervals must be identical and in the same order for all samples.",
fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
minElements = 1
)
    private List<File> inputReadCountFiles = new ArrayList<>();
@Argument(
doc = "Input file containing annotations for GC content in genomic intervals (output of AnnotateIntervals). " +
"If provided, explicit GC correction will be performed before performing SVD. " +
"Intervals must be identical to and in the same order as those in the input read-counts files.",
fullName = CopyNumberStandardArgument.ANNOTATED_INTERVALS_FILE_LONG_NAME,
optional = true
)
private File inputAnnotatedIntervalsFile = null;
@Argument(
doc = "Output file for the panel of normals.",
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
)
private File outputPanelOfNormalsFile;
@Argument(
doc = "Genomic intervals with a median (across samples) of fractional coverage (optionally corrected for GC bias) " +
"less than or equal to this percentile are filtered out. " +
"(This is the first filter applied.)",
fullName = MINIMUM_INTERVAL_MEDIAN_PERCENTILE_LONG_NAME,
minValue = 0.,
maxValue = 100.,
optional = true
)
private double minimumIntervalMedianPercentile = DEFAULT_MINIMUM_INTERVAL_MEDIAN_PERCENTILE;
@Argument(
doc = "Samples with a fraction of zero-coverage genomic intervals greater than or equal to this percentage are filtered out. " +
"(This is the second filter applied.)",
fullName = MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE_LONG_NAME,
minValue = 0.,
maxValue = 100.,
optional = true
)
private double maximumZerosInSamplePercentage = DEFAULT_MAXIMUM_ZEROS_IN_SAMPLE_PERCENTAGE;
@Argument(
doc = "Genomic intervals with a fraction of zero-coverage samples greater than or equal to this percentage are filtered out. " +
"(This is the third filter applied.)",
fullName = MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE_LONG_NAME,
minValue = 0.,
maxValue = 100.,
optional = true
)
private double maximumZerosInIntervalPercentage = DEFAULT_MAXIMUM_ZEROS_IN_INTERVAL_PERCENTAGE;
@Argument(
doc = "Samples with a median (across genomic intervals) of fractional coverage normalized by genomic-interval medians " +
"strictly below this percentile or strictly above the complementary percentile are filtered out. " +
"(This is the fourth filter applied.)",
fullName = EXTREME_SAMPLE_MEDIAN_PERCENTILE_LONG_NAME,
minValue = 0.,
maxValue = 50.,
optional = true
)
private double extremeSampleMedianPercentile = DEFAULT_EXTREME_SAMPLE_MEDIAN_PERCENTILE;
@Argument(
doc = "If true, impute zero-coverage values as the median of the non-zero values in the corresponding interval. " +
"(This is applied after all filters.)",
fullName = IMPUTE_ZEROS_LONG_NAME,
optional = true
)
private boolean doImputeZeros = DEFAULT_DO_IMPUTE_ZEROS;
@Argument(
doc = "Fractional coverages normalized by genomic-interval medians that are " +
"strictly below this percentile or strictly above the complementary percentile are set to the corresponding percentile value. " +
"(This is applied after all filters and imputation.)",
fullName = EXTREME_OUTLIER_TRUNCATION_PERCENTILE_LONG_NAME,
minValue = 0.,
maxValue = 50.,
optional = true
)
private double extremeOutlierTruncationPercentile = DEFAULT_EXTREME_OUTLIER_TRUNCATION_PERCENTILE;
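    //Illustration of the truncation this percentile implies (a sketch, not the tool's actual code):
    //for a percentile p in (0, 100], Apache Commons Math gives the two cutoffs as
    //  new Percentile().evaluate(values, p) and new Percentile().evaluate(values, 100. - p);
    //values strictly below the lower cutoff are raised to it, and values strictly above the upper cutoff are lowered to it.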
@Argument(
doc = "Number of eigensamples to use for truncated SVD and to store in the panel of normals. " +
"The number of samples retained after filtering will be used instead if it is smaller than this.",
fullName = CopyNumberStandardArgument.NUMBER_OF_EIGENSAMPLES_LONG_NAME,
minValue = 0,
optional = true
)
private int numEigensamplesRequested = DEFAULT_NUMBER_OF_EIGENSAMPLES;
@Advanced
@Argument(
doc = "Maximum HDF5 matrix chunk size. Large matrices written to HDF5 are chunked into equally sized " +
"subsets of rows (plus a subset containing the remainder, if necessary) to avoid a hard limit in " +
"Java HDF5 on the number of elements in a matrix. However, since a single row is not allowed to " +
"be split across multiple chunks, the number of columns must be less than the maximum number of " +
"values in each chunk. Decreasing this number will reduce heap usage when writing chunks.",
fullName = MAXIMUM_CHUNK_SIZE,
minValue = 1,
maxValue = HDF5Utils.MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX,
optional = true
)
private int maximumChunkSize = DEFAULT_MAXIMUM_CHUNK_SIZE;
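    //Illustrative chunking arithmetic (assumed from the doc above; see HDF5Utils for the actual logic):
    //at most maximumChunkSize / numColumns full rows fit in one chunk, so e.g. a chunk size of
    //10_000_000 values with 200_000 intervals (columns) yields 50 rows per chunk.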
@Override
protected void runPipeline(final JavaSparkContext ctx) {
if (!new HDF5Library().load(null)) { //Note: passing null means using the default temp dir.
throw new UserException.HardwareFeatureException("Cannot load the required HDF5 library. " +
"HDF5 is currently supported on x86-64 architecture and Linux or OSX systems.");
}
validateArguments();
//get sample filenames
    final List<String> sampleFilenames = inputReadCountFiles.stream().map(File::getAbsolutePath).collect(Collectors.toList());
//get sequence dictionary and intervals from the first read-counts file to use to validate remaining files
//(this first file is read again below, which is slightly inefficient but is probably not worth the extra code)
final File firstReadCountFile = inputReadCountFiles.get(0);
logger.info(String.format("Retrieving intervals from first read-counts file (%s)...", firstReadCountFile));
final SimpleCountCollection firstReadCounts = SimpleCountCollection.read(firstReadCountFile);
final SAMSequenceDictionary sequenceDictionary = firstReadCounts.getMetadata().getSequenceDictionary();
    final List<SimpleInterval> intervals = firstReadCounts.getIntervals();
Utils.validateArg(firstReadCounts.size() <= maximumChunkSize,
String.format("The number of intervals (%d) in each read-counts file cannot exceed the maximum chunk size (%d).",
firstReadCounts.size(), maximumChunkSize));
//get GC content (null if not provided)
final AnnotatedIntervalCollection annotatedIntervals = CopyNumberArgumentValidationUtils.validateAnnotatedIntervals(
inputAnnotatedIntervalsFile, firstReadCounts, logger);
final double[] intervalGCContent = annotatedIntervals == null
? null
: annotatedIntervals.getRecords().stream()
.mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
.toArray();
//validate input read-counts files (i.e., check intervals and that only integer counts are contained)
    //and aggregate as a RealMatrix with dimensions numSamples x numIntervals
final RealMatrix readCountMatrix = constructReadCountMatrix(logger, inputReadCountFiles, sequenceDictionary, intervals);
//create the PoN
logger.info("Creating the panel of normals...");
HDF5SVDReadCountPanelOfNormals.create(outputPanelOfNormalsFile, getCommandLine(),
sequenceDictionary, readCountMatrix, sampleFilenames, intervals, intervalGCContent,
minimumIntervalMedianPercentile, maximumZerosInSamplePercentage, maximumZerosInIntervalPercentage,
extremeSampleMedianPercentile, doImputeZeros, extremeOutlierTruncationPercentile, numEigensamplesRequested,
maximumChunkSize, ctx);
logger.info(String.format("%s complete.", getClass().getSimpleName()));
}
private void validateArguments() {
if (numEigensamplesRequested > inputReadCountFiles.size()) {
logger.warn(String.format("Number of eigensamples (%d) is greater than the number of input samples (%d); " +
"the number of samples retained after filtering will be used instead.",
numEigensamplesRequested, inputReadCountFiles.size()));
}
Utils.validateArg(inputReadCountFiles.size() == new HashSet<>(inputReadCountFiles).size(),
"List of input read-counts files cannot contain duplicates.");
inputReadCountFiles.forEach(CopyNumberArgumentValidationUtils::validateInputs);
CopyNumberArgumentValidationUtils.validateInputs(inputAnnotatedIntervalsFile);
CopyNumberArgumentValidationUtils.validateOutputFiles(outputPanelOfNormalsFile);
}
    private static RealMatrix constructReadCountMatrix(final Logger logger,
                                                       final List<File> inputReadCountFiles,
                                                       final SAMSequenceDictionary sequenceDictionary,
                                                       final List<SimpleInterval> intervals) {
logger.info("Validating and aggregating input read-counts files...");
final int numSamples = inputReadCountFiles.size();
final int numIntervals = intervals.size();
final RealMatrix readCountMatrix = new Array2DRowRealMatrix(numSamples, numIntervals);
        final ListIterator<File> inputReadCountFilesIterator = inputReadCountFiles.listIterator();
while (inputReadCountFilesIterator.hasNext()) {
final int sampleIndex = inputReadCountFilesIterator.nextIndex();
final File inputReadCountFile = inputReadCountFilesIterator.next();
logger.info(String.format("Aggregating read-counts file %s (%d / %d)", inputReadCountFile, sampleIndex + 1, numSamples));
final SimpleCountCollection readCounts = SimpleCountCollection.read(inputReadCountFile);
if (!CopyNumberArgumentValidationUtils.isSameDictionary(readCounts.getMetadata().getSequenceDictionary(), sequenceDictionary)) {
logger.warn(String.format("Sequence dictionary for read-counts file %s does not match those in other read-counts files.", inputReadCountFile));
}
Utils.validateArg(readCounts.getIntervals().equals(intervals),
String.format("Intervals for read-counts file %s do not match those in other read-counts files.", inputReadCountFile));
readCountMatrix.setRow(sampleIndex, readCounts.getCounts());
}
return readCountMatrix;
}
}