All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.copynumber.ModelSegments Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.copynumber;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.OverlapDetector;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
import org.broadinstitute.hellbender.tools.copynumber.arguments.SomaticGenotypingArgumentCollection;
import org.broadinstitute.hellbender.tools.copynumber.arguments.SomaticModelingArgumentCollection;
import org.broadinstitute.hellbender.tools.copynumber.arguments.SomaticSegmentationArgumentCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AbstractLocatableCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AbstractRecordCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioSegmentCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.LegacySegmentCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ModeledSegmentCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ParameterDecileCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.LegacySegment;
import org.broadinstitute.hellbender.tools.copynumber.models.AlleleFractionModeller;
import org.broadinstitute.hellbender.tools.copynumber.models.AlleleFractionPrior;
import org.broadinstitute.hellbender.tools.copynumber.models.CopyRatioModeller;
import org.broadinstitute.hellbender.tools.copynumber.models.MultidimensionalModeller;
import org.broadinstitute.hellbender.tools.copynumber.segmentation.MultisampleMultidimensionalKernelSegmenter;
import org.broadinstitute.hellbender.tools.copynumber.utils.genotyping.NaiveHeterozygousPileupGenotypingUtils;
import org.broadinstitute.hellbender.tools.copynumber.utils.segmentation.KernelSegmenter;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.param.ParamUtils;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * Models segmented copy ratios from denoised copy ratios and segmented minor-allele fractions from allelic counts.
 *
 * 

* Possible data inputs are: 1) denoised copy ratios for the case sample, 2) allelic counts for the case sample, * and 3) allelic counts for a matched-normal sample. All available inputs will be used to to perform * segmentation and model inference. *

* *

Genotyping step

* *

* If allelic counts are available, the first step in the inference process is to genotype heterozygous sites, * as the allelic counts at these sites will subsequently be modeled to infer segmented minor-allele fraction. * We perform a relatively simple and naive genotyping based on the allele counts (i.e., pileups), which is * controlled by a small number of parameters ({@code minimum-total-allele-count}, * {@code genotyping-homozygous-log-ratio-threshold}, and {@code genotyping-homozygous-log-ratio-threshold}). * If the matched normal is available, its allelic counts will be used to genotype the sites, and * we will simply assume these genotypes are the same in the case sample. (This can be critical, for example, * for determining sites with loss of heterozygosity in high purity case samples; such sites will be genotyped as * homozygous if the matched-normal sample is not available.) *

* *

Segmentation step

* *

* Next, we segment, if available, the denoised copy ratios and the alternate-allele fractions at the * genotyped heterozygous sites. This is done using kernel segmentation (see {@link KernelSegmenter}). * Various segmentation parameters control the sensitivity of the segmentation and should be selected * appropriately for each analysis. If a Picard interval-list file has been specified by the {@code segments} * argument, the corresponding segmentation will be used instead and this step will be skipped. *

* *

* If both copy ratios and allele fractions are available, we perform segmentation using a combined kernel * that is sensitive to changes that occur not only in either of the two but also in both. However, in this case, * we simply discard all allele fractions at sites that lie outside of the available copy-ratio intervals * (rather than imputing the missing copy-ratio data); these sites are filtered out during the genotyping step * discussed above. This can have implications for analyses involving the sex chromosomes; * see comments in {@link CreateReadCountPanelOfNormals}. *

* *

Modeling step

* *

* After segmentation is complete, we run Markov-chain Monte Carlo (MCMC) to determine posteriors for * segmented models for the log2 copy ratio and the minor-allele fraction; see {@link CopyRatioModeller} * and {@link AlleleFractionModeller}, respectively. After the first run of MCMC is complete, * smoothing of the segmented posteriors is performed by merging adjacent segments whose posterior * credible intervals sufficiently overlap according to specified segmentation-smoothing parameters. * Then, additional rounds of segmentation smoothing (with intermediate MCMC optionally performed in between rounds) * are performed until convergence, at which point a final round of MCMC is performed. *

* *

Inputs

* *
    *
  • * (Optional) Denoised-copy-ratios file from {@link DenoiseReadCounts}. * If allelic counts are not provided, then this is required. *
  • *
  • * (Optional) Allelic-counts file from {@link CollectAllelicCounts}. * If denoised copy ratios are not provided, then this is required. *
  • *
  • * (Optional) Matched-normal allelic-counts file from {@link CollectAllelicCounts}. * This can only be provided if allelic counts for the case sample are also provided. *
  • *
  • * (Optional, Advanced) Picard interval-list file containing a multisample segmentation output by * a previous run of {@link ModelSegments} in multisample mode. * Segmentation step will not be performed. * See description of multisample mode below. *
  • *
  • * Output prefix. * This is used as the basename for output files. *
  • *
  • * Output directory. * This will be created if it does not exist. *
  • *
* *

Outputs

* *
    *
  • * Modeled-segments .modelBegin.seg and .modelFinal.seg files. * These are tab-separated values (TSV) files with a SAM-style header containing a read group sample name, a sequence dictionary, * a row specifying the column headers contained in {@link ModeledSegmentCollection.ModeledSegmentTableColumn}, * and the corresponding entry rows. * The initial result before segmentation smoothing is output to the .modelBegin.seg file * and the final result after segmentation smoothing is output to the .modelFinal.seg file. *
  • *
  • * Allele-fraction-model global-parameter files (.modelBegin.af.param and .modelFinal.af.param). * These are tab-separated values (TSV) files with a SAM-style header containing a read group sample name, * a row specifying the column headers contained in {@link ParameterDecileCollection.ParameterTableColumn}, * and the corresponding entry rows. * The initial result before segmentation smoothing is output to the .modelBegin.af.param file * and the final result after segmentation smoothing is output to the .modelFinal.af.param file. *
  • *
  • * Copy-ratio-model global-parameter files (.modelBegin.cr.param and .modelFinal.cr.param). * These are tab-separated values (TSV) files with a SAM-style header containing a read group sample name, * a row specifying the column headers contained in {@link ParameterDecileCollection.ParameterTableColumn}, * and the corresponding entry rows. * The initial result before segmentation smoothing is output to the .modelBegin.cr.param file * and the final result after segmentation smoothing is output to the .modelFinal.cr.param file. *
  • *
  • * Copy-ratio segments file (.cr.seg). * This is a tab-separated values (TSV) file with a SAM-style header containing a read group sample name, a sequence dictionary, * a row specifying the column headers contained in {@link CopyRatioSegmentCollection.CopyRatioSegmentTableColumn}, * and the corresponding entry rows. * It contains the segments from the .modelFinal.seg file converted to a format suitable for input to {@link CallCopyRatioSegments}. *
  • *
  • * CBS-formatted .cr.igv.seg and .af.igv.seg files (compatible with IGV). * These are tab-separated values (TSV) files with CBS-format column headers * (see * http://software.broadinstitute.org/cancer/software/genepattern/file-formats-guide#CBS) * and the corresponding entry rows that can be plotted using IGV (see * * https://software.broadinstitute.org/software/igv/SEG). * The posterior medians of the log2 copy ratio and minor-allele fraction are given in the SEGMENT_MEAN * columns in the .cr.igv.seg and .af.igv.seg files, respectively. *
  • *
  • * (Optional) Allelic-counts file containing the counts at sites genotyped as heterozygous in the case sample (.hets.tsv). * This is a tab-separated values (TSV) file with a SAM-style header containing a read group sample name, a sequence dictionary, * a row specifying the column headers contained in {@link AllelicCountCollection.AllelicCountTableColumn}, * and the corresponding entry rows. * This is only output if allelic counts are provided as input. *
  • *
  • * (Optional) Allelic-counts file containing the counts at sites genotyped as heterozygous in the matched-normal sample (.hets.normal.tsv). * This is a tab-separated values (TSV) file with a SAM-style header containing a read group sample name, a sequence dictionary, * a row specifying the column headers contained in {@link AllelicCountCollection.AllelicCountTableColumn}, * and the corresponding entry rows. * This is only output if matched-normal allelic counts are provided as input. *
  • *
* *

Usage examples

* *
 *     gatk ModelSegments \
 *          --denoised-copy-ratios tumor.denoisedCR.tsv \
 *          --allelic-counts tumor.allelicCounts.tsv \
 *          --normal-allelic-counts normal.allelicCounts.tsv \
 *          --output-prefix tumor \
 *          -O output_dir
 * 
* *
 *     gatk ModelSegments \
 *          --denoised-copy-ratios normal.denoisedCR.tsv \
 *          --allelic-counts normal.allelicCounts.tsv \
 *          --output-prefix normal \
 *          -O output_dir
 * 
* *
 *     gatk ModelSegments \
 *          --allelic-counts tumor.allelicCounts.tsv \
 *          --normal-allelic-counts normal.allelicCounts.tsv \
 *          --output-prefix tumor \
 *          -O output_dir
 * 
* *
 *     gatk ModelSegments \
 *          --denoised-copy-ratios normal.denoisedCR.tsv \
 *          --output-prefix normal \
 *          -O output_dir
 * 
* *
 *     gatk ModelSegments \
 *          --allelic-counts tumor.allelicCounts.tsv \
 *          --output-prefix tumor \
 *          -O output_dir
 * 
* *

Multisample Mode (ADVANCED / EXPERIMENTAL)

* * Multisample mode is activated when inputs for more than one case sample are specified via the * {@code denoised-copy-ratios} and/or {@code allelic-counts} arguments. In this mode, {@link ModelSegments} * finds common segments across multiple case samples using denoised copy ratios and allelic counts. * This segmentation can be used as input to subsequent, individual runs of {@link ModelSegments} * on each of the case samples. * *

* Possible data inputs are: 1) denoised copy ratios for the case samples, 2) allelic counts for the case samples, * and 3) allelic counts for a matched-normal sample. All available inputs will be used to to perform * segmentation. *

* *

* As in single-sample mode, the first step is to genotype heterozygous sites. If allelic counts from * a matched normal are available, heterozygous sites in all samples are defined using the normal, as usual; * otherwise, each case sample is genotyped individually. The intersection of heterozygous sites across * case samples after filtering on total count and overlap with available copy-ratio intervals is then used for * segmentation. (When the matched normal is available, this intersection is inconsequential if * {@code minimum-total-allele-count-case} is appropriately set to zero. As in single-sample mode, * determining sites with loss of heterozygosity in high purity case samples will be difficult if the * matched normal is not available.) *

* *

* Next, we jointly segment, if available, the denoised copy ratios and the alternate-allele fractions at the * genotyped heterozygous sites across all case samples (which can include the matched normal, if desired). * The same caveats discussed above also apply to segmentation in multisample mode; as in single-sample mode, * segmentation parameters should be tuned appropriately for each analysis. Moreover, parameters may also need * to be tuned as a function of the number of samples used. *

* *

* The final output of multisample mode is a Picard interval-list file specifying the joint segmentation. * This can be provided to subsequent, individual runs of {@link ModelSegments} via the {@code segments} argument * to perform modeling for each of the case samples; the segmentation step will be skipped in these runs. *

* *

* Note that the genotyping step will be repeated in these runs, so filters identical to those used in the * multisample-mode run should be used. If allelic counts from a matched normal are available, the resulting set of * heterozygous sites used for modeling in these runs should then be identical to that used for segmentation in * the multisample-mode run if {@code minimum-total-allele-count-case} is appropriately set to zero for all runs. * (However, if this is set to a non-zero value or if no matched normal is available, because the intersection * of heterozygous sites across samples performed in the multisample-mode run cannot be replicated in the * single-sample mode runs, this may ultimately yield sets of sites for modeling that differ across samples.) *

* *

* See below for usage examples illustrating a run in multisample mode followed by multiple runs in single-sample mode. *

* *

Inputs

* *
    *
  • * (Optional) List of more than one denoised-copy-ratios files from {@link DenoiseReadCounts}. * If a list of allelic counts is not provided, then this is required. * If a list of allelic counts is also provided, then sample order must match across both lists. *
  • *
  • * (Optional) List of more than one allelic-counts files from {@link CollectAllelicCounts}. * If a list of denoised copy ratios is not provided, then this is required. * If a list of denoised copy ratios is also provided, then sample order must match across both lists. *
  • *
  • * (Optional) Matched-normal allelic-counts file from {@link CollectAllelicCounts}. * This can only be provided if a list of allelic counts for the case samples are also provided. *
  • *
* *

Outputs

* *
    *
  • * Multisample-segments .interval_list Picard interval-list file. * This segmentation can be used as input to subsequent, individual runs of {@link ModelSegments} on each of * the case samples. *
  • *
* *

Usage examples

* *

Multisample-mode run (outputs {@code multisample-segmentation.interval_list} containing the multisample segmentation)

* *
 *     gatk ModelSegments \
 *          --denoised-copy-ratios normal.denoisedCR.tsv \
 *          --denoised-copy-ratios tumor-1.denoisedCR.tsv \
 *          ...
 *          --denoised-copy-ratios tumor-N.denoisedCR.tsv \
 *          --allelic-counts normal.allelicCounts.tsv \
 *          --allelic-counts tumor-1.allelicCounts.tsv \
 *          ...
 *          --allelic-counts tumor-N.allelicCounts.tsv \
 *          --normal-allelic-counts normal.allelicCounts.tsv \
 *          --output-prefix multisample-segmentation
 *          -O output_dir
 * 
* *

Single-sample mode runs (segmentation is taken from {@code multisample-segmentation.interval_list}, * so the segmentation step is skipped in each run)

* *
 *     gatk ModelSegments \
 *          --segments multisample-segmentation.interval_list
 *          --denoised-copy-ratios normal.denoisedCR.tsv \
 *          --allelic-counts normal.allelicCounts.tsv \
 *          --normal-allelic-counts normal.allelicCounts.tsv \
 *          --output-prefix normal
 *          -O output_dir
 * 
* *
 *     gatk ModelSegments \
 *          --segments multisample-segmentation.interval_list
 *          --denoised-copy-ratios tumor-1.denoisedCR.tsv \
 *          --allelic-counts tumor-1.allelicCounts.tsv \
 *          --normal-allelic-counts normal.allelicCounts.tsv \
 *          --output-prefix tumor-1
 *          -O output_dir
 * 
* *
 *     ...
 * 
* *
 *     gatk ModelSegments \
 *          --segments multisample-segmentation.interval_list
 *          --denoised-copy-ratios tumor-N.denoisedCR.tsv \
 *          --allelic-counts tumor-N.allelicCounts.tsv \
 *          --normal-allelic-counts normal.allelicCounts.tsv \
 *          --output-prefix tumor-N
 *          -O output_dir
 * 
* * @author Samuel Lee <[email protected]> */ @CommandLineProgramProperties( summary = "Models segmented copy ratios from denoised copy ratios and segmented minor-allele fractions from allelic counts; " + "if multiple samples are specified, finds a joint segmentation that can be used in subsequent runs to perform modeling of each sample", oneLineSummary = "Models segmented copy ratios from denoised copy ratios and segmented minor-allele fractions from allelic counts", programGroup = CopyNumberProgramGroup.class ) @DocumentedFeature public final class ModelSegments extends CommandLineProgram { public enum RunMode { MULTIPLE_SAMPLE, SINGLE_SAMPLE } private enum DataMode { COPY_RATIO_ONLY, ALLELE_FRACTION_ONLY, COPY_RATIO_AND_ALLELE_FRACTION } //filename tags for output public static final String HET_ALLELIC_COUNTS_FILE_SUFFIX = ".hets.tsv"; public static final String NORMAL_HET_ALLELIC_COUNTS_FILE_SUFFIX = ".hets.normal.tsv"; public static final String SEGMENTS_FILE_SUFFIX = ".seg"; public static final String PICARD_INTERVAL_LIST_FILE_SUFFIX = ".interval_list"; public static final String BEGIN_FIT_FILE_TAG = ".modelBegin"; public static final String FINAL_FIT_FILE_TAG = ".modelFinal"; public static final String COPY_RATIO_MODEL_PARAMETER_FILE_SUFFIX = ".cr.param"; public static final String ALLELE_FRACTION_MODEL_PARAMETER_FILE_SUFFIX = ".af.param"; public static final String COPY_RATIO_SEGMENTS_FOR_CALLER_FILE_SUFFIX = ".cr" + SEGMENTS_FILE_SUFFIX; public static final String COPY_RATIO_LEGACY_SEGMENTS_FILE_SUFFIX = ".cr.igv" + SEGMENTS_FILE_SUFFIX; public static final String ALLELE_FRACTION_LEGACY_SEGMENTS_FILE_SUFFIX = ".af.igv" + SEGMENTS_FILE_SUFFIX; @Argument( doc = "Input files containing denoised copy ratios (output of DenoiseReadCounts). " + "If multiple samples are specified, multisample kernel segmentation will be performed but modeling will be skipped; " + "sample order must match that of input allelic-counts files.", fullName = CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_LONG_NAME, optional = true, minElements = 1 ) private List inputDenoisedCopyRatiosFiles = null; @Argument( doc = "Input files containing allelic counts (output of CollectAllelicCounts). " + "If multiple samples are specified, multisample kernel segmentation will be performed but modeling will be skipped; " + "sample order must match that of input denoised-copy-ratios files.", fullName = CopyNumberStandardArgument.ALLELIC_COUNTS_FILE_LONG_NAME, optional = true, minElements = 1 ) private List inputAllelicCountsFiles = null; @Argument( doc = "Input file containing allelic counts for a matched normal (output of CollectAllelicCounts). " + "If specified, these allelic counts will be used to perform genotyping but will not be used for multisample kernel segmentation; " + "if the latter is desired, additionally specify this file as one of the arguments to --allelic-counts.", fullName = CopyNumberStandardArgument.NORMAL_ALLELIC_COUNTS_FILE_LONG_NAME, optional = true ) private File inputNormalAllelicCountsFile = null; @Advanced @Argument( doc = "Input Picard interval-list file specifying segments. " + "If provided, kernel segmentation will be skipped.", fullName = CopyNumberStandardArgument.SEGMENTS_FILE_LONG_NAME, optional = true ) private File inputSegmentsFile = null; @Argument( doc = "Prefix for output filenames.", fullName = CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME ) private String outputPrefix; @Argument( doc = "Output directory. This will be created if it does not exist.", fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME ) private File outputDir; @ArgumentCollection private SomaticGenotypingArgumentCollection genotypingArguments = new SomaticGenotypingArgumentCollection(); @ArgumentCollection private SomaticSegmentationArgumentCollection segmentationArguments = new SomaticSegmentationArgumentCollection(); @ArgumentCollection private SomaticModelingArgumentCollection modelingArguments = new SomaticModelingArgumentCollection(); private RunMode runMode; private DataMode dataMode; private void logHeapUsage(final String phase) { final int mb = 1024 * 1024; final Runtime runtime = Runtime.getRuntime(); logger.info("Used memory (MB) after " + phase + ": " + (runtime.totalMemory() - runtime.freeMemory()) / mb); } @Override protected Object doWork() { logHeapUsage("initializing engine"); setModesAndValidateArguments(); //read input files, validate data, and perform genotyping final ModelSegmentsData modelSegmentsData = new ModelSegmentsData(); if (runMode == RunMode.MULTIPLE_SAMPLE) { //multisample mode, only perform segmentation final SimpleIntervalCollection segments = new MultisampleMultidimensionalKernelSegmenter( modelSegmentsData.denoisedCopyRatiosPerSample, modelSegmentsData.genotypingResult.getHetAllelicCountsPerSample()) .findSegmentation( segmentationArguments.maxNumSegmentsPerChromosome, segmentationArguments.kernelVarianceCopyRatio, segmentationArguments.kernelVarianceAlleleFraction, segmentationArguments.kernelScalingAlleleFraction, segmentationArguments.kernelApproximationDimension, ImmutableSet.copyOf(segmentationArguments.windowSizes).asList(), segmentationArguments.numChangepointsPenaltyFactor, segmentationArguments.numChangepointsPenaltyFactor); logHeapUsage("segmentation"); final File segmentsIntervalListFile = new File(outputDir, outputPrefix + PICARD_INTERVAL_LIST_FILE_SUFFIX); logger.info(String.format("Writing segments as Picard interval list to %s...", segmentsIntervalListFile.getAbsolutePath())); final IntervalList segmentsIntervalList = new IntervalList(segments.getMetadata().getSequenceDictionary()); segments.getIntervals().forEach(i -> segmentsIntervalList.add(new Interval(i))); segmentsIntervalList.write(segmentsIntervalListFile); } else { //single-sample mode //both denoisedCopyRatios and hetAllelicCounts are guaranteed to be non-null at this point; //missing data has already been imputed as an empty collection with the appropriate metadata final CopyRatioCollection denoisedCopyRatios = modelSegmentsData.denoisedCopyRatiosPerSample.get(0); final AllelicCountCollection hetAllelicCounts = modelSegmentsData.genotypingResult.getHetAllelicCountsPerSample().get(0); final SampleLocatableMetadata metadata = denoisedCopyRatios.getMetadata(); //write allelic-counts files containing hets for the case and the matched-normal when available if (dataMode != DataMode.COPY_RATIO_ONLY) { final File hetAllelicCountsFile = new File(outputDir, outputPrefix + HET_ALLELIC_COUNTS_FILE_SUFFIX); if (inputNormalAllelicCountsFile == null) { //case-only mode logger.info(String.format("Writing heterozygous allelic counts to %s...", hetAllelicCountsFile.getAbsolutePath())); } else { //matched-normal mode final File hetNormalAllelicCountsFile = new File(outputDir, outputPrefix + NORMAL_HET_ALLELIC_COUNTS_FILE_SUFFIX); logger.info(String.format("Writing heterozygous allelic counts for matched normal to %s...", hetNormalAllelicCountsFile.getAbsolutePath())); final AllelicCountCollection hetNormalAllelicCounts = modelSegmentsData.genotypingResult.getHetNormalAllelicCounts(); hetNormalAllelicCounts.write(hetNormalAllelicCountsFile); logger.info(String.format("Writing allelic counts for case sample at heterozygous sites in matched normal to %s...", hetAllelicCountsFile.getAbsolutePath())); } hetAllelicCounts.write(hetAllelicCountsFile); } final SimpleIntervalCollection segments; if (inputSegmentsFile == null) { segments = new MultisampleMultidimensionalKernelSegmenter( Collections.singletonList(denoisedCopyRatios), Collections.singletonList(hetAllelicCounts)) .findSegmentation( segmentationArguments.maxNumSegmentsPerChromosome, segmentationArguments.kernelVarianceCopyRatio, segmentationArguments.kernelVarianceAlleleFraction, segmentationArguments.kernelScalingAlleleFraction, segmentationArguments.kernelApproximationDimension, ImmutableSet.copyOf(segmentationArguments.windowSizes).asList(), segmentationArguments.numChangepointsPenaltyFactor, segmentationArguments.numChangepointsPenaltyFactor); logHeapUsage("segmentation"); } else { final IntervalList segmentsIntervalList = IntervalList.fromFile(inputSegmentsFile); ParamUtils.isPositive(segmentsIntervalList.size(), "Segments file must contain at least one segment."); logger.info(String.format("Using input segmentation from %s containing %d segments...", inputSegmentsFile, segmentsIntervalList.size())); segments = new SimpleIntervalCollection( metadata, segmentsIntervalList.getIntervals().stream() .map(i -> new SimpleInterval(i.getContig(), i.getStart(), i.getEnd())) .collect(Collectors.toList())); } logger.info("Modeling available denoised copy ratios and heterozygous allelic counts..."); //initial MCMC model fitting performed by MultidimensionalModeller constructor final AlleleFractionPrior alleleFractionPrior = new AlleleFractionPrior(modelingArguments.minorAlleleFractionPriorAlpha); final MultidimensionalModeller modeller = new MultidimensionalModeller( segments, denoisedCopyRatios, hetAllelicCounts, alleleFractionPrior, modelingArguments.numSamplesCopyRatio, modelingArguments.numBurnInCopyRatio, modelingArguments.numSamplesAlleleFraction, modelingArguments.numBurnInAlleleFraction); //write initial segments and parameters to file writeModeledSegmentsAndParameterFiles(modeller, BEGIN_FIT_FILE_TAG); //segmentation smoothing modeller.smoothSegments( modelingArguments.maxNumSmoothingIterations, modelingArguments.numSmoothingIterationsPerFit, modelingArguments.smoothingCredibleIntervalThresholdCopyRatio, modelingArguments.smoothingCredibleIntervalThresholdAlleleFraction); //write final segments and parameters to file writeModeledSegmentsAndParameterFiles(modeller, FINAL_FIT_FILE_TAG); logHeapUsage("modeling"); //write final segments for copy-ratio caller (TODO remove this and MEAN_LOG2_COPY_RATIO column when new caller is available) final OverlapDetector copyRatioMidpointOverlapDetector = denoisedCopyRatios.getMidpointOverlapDetector(); final CopyRatioSegmentCollection copyRatioSegmentsFinal = new CopyRatioSegmentCollection( modeller.getModeledSegments().getMetadata(), modeller.getModeledSegments().getIntervals().stream() .map(s -> new CopyRatioSegment(s, new ArrayList<>(copyRatioMidpointOverlapDetector.getOverlaps(s)))) .collect(Collectors.toList())); writeSegments(copyRatioSegmentsFinal, COPY_RATIO_SEGMENTS_FOR_CALLER_FILE_SUFFIX); //write IGV-compatible files final LegacySegmentCollection copyRatioLegacySegments = new LegacySegmentCollection( metadata, modeller.getModeledSegments().getRecords().stream() .map(s -> new LegacySegment( metadata.getSampleName(), s.getInterval(), s.getNumPointsCopyRatio(), s.getLog2CopyRatioSimplePosteriorSummary().getDecile50())) .collect(Collectors.toList())); final LegacySegmentCollection alleleFractionLegacySegments = new LegacySegmentCollection( metadata, modeller.getModeledSegments().getRecords().stream() .map(s -> new LegacySegment( metadata.getSampleName(), s.getInterval(), s.getNumPointsAlleleFraction(), s.getMinorAlleleFractionSimplePosteriorSummary().getDecile50())) .collect(Collectors.toList())); writeSegments(copyRatioLegacySegments, COPY_RATIO_LEGACY_SEGMENTS_FILE_SUFFIX); writeSegments(alleleFractionLegacySegments, ALLELE_FRACTION_LEGACY_SEGMENTS_FILE_SUFFIX); } logger.info(String.format("%s complete.", getClass().getSimpleName())); return null; } /** * Performs reading and validation of data, as well as genotyping (if allele-fraction data is available). * * Regardless of {@code dataMode}, the respective lists for the copy-ratio and allele-fraction data for the * case samples will be of the same length and populated; empty collections will be used to impute missing data * using the same sample order as the non-missing data. This is done so that a * {@link MultisampleMultidimensionalKernelSegmenter} can be used in all scenarios. * * In contrast, {@code normalAllelicCounts} and {@code segments} will be set to {@code null} if the respective * inputs are missing and subsequent control-flow code for toggling genotyping and segmentation must be aware of * this convention. */ private final class ModelSegmentsData { final ImmutableList denoisedCopyRatiosPerSample; final ImmutableList allelicCountsPerSample; final AllelicCountCollection normalAllelicCounts; final SimpleIntervalCollection segments; final NaiveHeterozygousPileupGenotypingUtils.NaiveHeterozygousPileupGenotypingResult genotypingResult; private ModelSegmentsData() { Utils.nonNull(dataMode); //read available data and impute metadata for missing data switch (dataMode) { case COPY_RATIO_ONLY: denoisedCopyRatiosPerSample = ImmutableList.copyOf(inputDenoisedCopyRatiosFiles.stream() .map(CopyRatioCollection::new) .iterator()); allelicCountsPerSample = ImmutableList.copyOf(denoisedCopyRatiosPerSample.stream() .map(CopyRatioCollection::getMetadata) .map(m -> new AllelicCountCollection(m, Collections.emptyList())) .iterator()); break; case ALLELE_FRACTION_ONLY: allelicCountsPerSample = ImmutableList.copyOf(inputAllelicCountsFiles.stream() .map(AllelicCountCollection::new) .iterator()); denoisedCopyRatiosPerSample = ImmutableList.copyOf(allelicCountsPerSample.stream() .map(AllelicCountCollection::getMetadata) .map(m -> new CopyRatioCollection(m, Collections.emptyList())) .iterator()); break; case COPY_RATIO_AND_ALLELE_FRACTION: denoisedCopyRatiosPerSample = ImmutableList.copyOf(inputDenoisedCopyRatiosFiles.stream() .map(f -> readOptionalFileOrNull(f, CopyRatioCollection::new)) .iterator()); allelicCountsPerSample = ImmutableList.copyOf(inputAllelicCountsFiles.stream() .map(f -> readOptionalFileOrNull(f, AllelicCountCollection::new)) .iterator()); //check that sample metadata matches across copy-ratio and allele-fraction data IntStream.range(0, inputDenoisedCopyRatiosFiles.size()).boxed() .forEach(i -> CopyNumberArgumentValidationUtils.getValidatedMetadata( denoisedCopyRatiosPerSample.get(i), allelicCountsPerSample.get(i))); break; default: throw new GATKException.ShouldNeverReachHereException("Unknown DataMode."); } normalAllelicCounts = readOptionalFileOrNull(inputNormalAllelicCountsFile, AllelicCountCollection::new); final IntervalList segmentsIntervalList = readOptionalFileOrNull(inputSegmentsFile, IntervalList::fromFile); segments = segmentsIntervalList == null ? null : new SimpleIntervalCollection( new SimpleLocatableMetadata(segmentsIntervalList.getHeader().getSequenceDictionary()), segmentsIntervalList.getIntervals().stream() .map(i -> new SimpleInterval(i.getContig(), i.getStart(), i.getEnd())) .collect(Collectors.toList())); logHeapUsage("reading files"); final SAMSequenceDictionary sequenceDictionary = CopyNumberArgumentValidationUtils.getValidatedSequenceDictionary( Stream.of(denoisedCopyRatiosPerSample, allelicCountsPerSample, Arrays.asList(normalAllelicCounts, segments)) .flatMap(Collection::stream) .toArray(AbstractLocatableCollection[]::new)); Utils.validateArg((int) denoisedCopyRatiosPerSample.stream() .map(CopyRatioCollection::getIntervals) .distinct() .count() == 1, "Copy-ratio intervals must be identical across all case samples."); Utils.validateArg((int) Stream.of(allelicCountsPerSample, Collections.singletonList(normalAllelicCounts)) .flatMap(Collection::stream) .filter(Objects::nonNull) .map(AllelicCountCollection::getIntervals) .distinct() .count() == 1, "Allelic-count sites must be identical across all samples."); logHeapUsage("validating data"); final SimpleIntervalCollection copyRatioIntervals = new SimpleIntervalCollection( new SimpleLocatableMetadata(sequenceDictionary), denoisedCopyRatiosPerSample.get(0).getIntervals()); genotypingResult = NaiveHeterozygousPileupGenotypingUtils.genotypeHets( allelicCountsPerSample, normalAllelicCounts, copyRatioIntervals, genotypingArguments.minTotalAlleleCountCase, genotypingArguments.minTotalAlleleCountNormal, genotypingArguments.genotypingHomozygousLogRatioThreshold, genotypingArguments.genotypingBaseErrorRate); logHeapUsage("genotyping"); } } private void setModesAndValidateArguments() { CopyNumberArgumentValidationUtils.validateInputs( Stream.of( inputDenoisedCopyRatiosFiles, inputAllelicCountsFiles, Collections.singletonList(inputNormalAllelicCountsFile), Collections.singletonList(inputSegmentsFile)) .flatMap(Collection::stream) .toArray(File[]::new)); Utils.nonEmpty(outputPrefix); CopyNumberArgumentValidationUtils.validateAndPrepareOutputDirectories(outputDir); Utils.validateArg(!(inputDenoisedCopyRatiosFiles.isEmpty() && inputAllelicCountsFiles.isEmpty()), "Must provide at least one denoised-copy-ratios file or allelic-counts file."); Utils.validateArg(!(inputAllelicCountsFiles.isEmpty() && inputNormalAllelicCountsFile != null), "Must provide an allelic-counts file for the case sample to run in matched-normal mode."); Utils.validateArg(!(inputNormalAllelicCountsFile != null && genotypingArguments.minTotalAlleleCountCase > 0), "The minimum total count for filtering allelic counts in case samples must be set to zero in matched-normal mode. " + "If the effect of statistical noise due to low depth in case samples on segmentation is a concern, " + "consider using only denoised copy ratios or externally preprocessing allelic-count files " + "to remove sites that are poorly covered across all samples."); runMode = (inputDenoisedCopyRatiosFiles.size() > 1 || inputAllelicCountsFiles.size() > 1) ? RunMode.MULTIPLE_SAMPLE : RunMode.SINGLE_SAMPLE; if (runMode == RunMode.MULTIPLE_SAMPLE) { if (!inputDenoisedCopyRatiosFiles.isEmpty() && !inputAllelicCountsFiles.isEmpty()) { Utils.validateArg(inputDenoisedCopyRatiosFiles.size() == inputAllelicCountsFiles.size(), "Number of denoised-copy-ratios and allelic-counts files for the case samples must be equal " + "if both input types are specified in multisample mode."); } Utils.validateArg(inputSegmentsFile == null, "Segments file cannot be specified in multisample mode."); } if (inputAllelicCountsFiles.isEmpty()) { dataMode = DataMode.COPY_RATIO_ONLY; } else if (inputDenoisedCopyRatiosFiles.isEmpty()) { dataMode = DataMode.ALLELE_FRACTION_ONLY; } else { dataMode = DataMode.COPY_RATIO_AND_ALLELE_FRACTION; } modelingArguments.validateArguments(); } private T readOptionalFileOrNull(final File file, final Function read) { if (file == null) { return null; } logger.info(String.format("Reading file (%s)...", file)); return read.apply(file); } private void writeModeledSegmentsAndParameterFiles(final MultidimensionalModeller modeller, final String fileTag) { final ModeledSegmentCollection modeledSegments = modeller.getModeledSegments(); writeSegments(modeledSegments, fileTag + SEGMENTS_FILE_SUFFIX); final File copyRatioParameterFile = new File(outputDir, outputPrefix + fileTag + COPY_RATIO_MODEL_PARAMETER_FILE_SUFFIX); final File alleleFractionParameterFile = new File(outputDir, outputPrefix + fileTag + ALLELE_FRACTION_MODEL_PARAMETER_FILE_SUFFIX); modeller.writeModelParameterFiles(copyRatioParameterFile, alleleFractionParameterFile); } private void writeSegments(final AbstractRecordCollection segments, final String fileSuffix) { final File segmentsFile = new File(outputDir, outputPrefix + fileSuffix); logger.info(String.format("Writing segments to %s...", segmentsFile.getAbsolutePath())); segments.write(segmentsFile); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy