All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.copynumber.plotting.PlotModeledSegments Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.copynumber.plotting;

import htsjdk.samtools.SAMSequenceDictionary;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.tools.copynumber.DenoiseReadCounts;
import org.broadinstitute.hellbender.tools.copynumber.ModelSegments;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ModeledSegmentCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment;
import org.broadinstitute.hellbender.utils.R.RScriptExecutor;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.Resource;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * Creates plots of denoised and segmented copy-ratio and minor-allele-fraction estimates.
 *
 * 

Inputs

* *
    *
  • * Modeled-segments file from {@link ModelSegments}. *
  • *
  • * (Optional) Denoised-copy-ratios file from {@link DenoiseReadCounts}. * If allelic counts are not provided, then this is required. *
  • *
  • * (Optional) Allelic-counts file containing the counts at sites genotyped as heterozygous (.hets.tsv output of {@link ModelSegments}). * If denoised copy ratios are not provided, then this is required. *
  • *
  • * Sequence-dictionary file. * This determines the order and representation of contigs in the plot. *
  • *
  • * Output prefix. * This is used as the basename for output files. *
  • *
  • * Output directory. * This will be created if it does not exist. *
  • *
* *

Outputs

* *
    *
  • * Modeled-segments-plot file. * This shows the input denoised copy ratios and/or alternate-allele fractions as points, as well as box plots * for the available posteriors in each segment. The colors of the points alternate with the segmentation. * Copy ratios are only plotted up to the maximum value specified by the argument {@code maximum-copy-ratio}. * Point sizes can be specified by the arguments {@code point-size-copy-ratio} and {@code point-size-allele-fraction}. *
  • *
* *

Usage examples

* *
 *     gatk PlotModeledSegments \
 *          --denoised-copy-ratios tumor.denoisedCR.tsv \
 *          --allelic-counts tumor.hets.tsv \
 *          --segments tumor.modelFinal.seg \
 *          --sequence-dictionary contigs_to_plot.dict \
 *          --output-prefix tumor \
 *          -O output_dir
 * 
* *
 *     gatk PlotModeledSegments \
 *          --denoised-copy-ratios tumor.denoisedCR.tsv \
 *          --segments tumor.modelFinal.seg \
 *          --sequence-dictionary contigs_to_plot.dict \
 *          --output-prefix tumor \
 *          -O output_dir
 * 
* *
 *     gatk PlotModeledSegments \
 *          --allelic-counts normal.hets.tsv \
 *          --segments normal.modelFinal.seg \
 *          --sequence-dictionary contigs_to_plot.dict \
 *          --output-prefix normal \
 *          -O output_dir
 * 
* * @author Samuel Lee <[email protected]> */ @CommandLineProgramProperties( summary = "Creates plots of denoised and segmented copy-ratio and minor-allele-fraction estimates", oneLineSummary = "Creates plots of denoised and segmented copy-ratio and minor-allele-fraction estimates", programGroup = CopyNumberProgramGroup.class ) @DocumentedFeature public final class PlotModeledSegments extends CommandLineProgram { private static final String PLOT_MODELED_SEGMENTS_R_SCRIPT = "PlotModeledSegments.R"; private static final String MODELED_SEGMENTS_PLOT_FILE_SUFFIX = ".modeled.png"; @Argument( doc = "Input file containing denoised copy ratios (output of DenoiseReadCounts).", fullName = CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_LONG_NAME, optional = true ) private File inputDenoisedCopyRatiosFile; @Argument( doc = "Input file containing allelic counts at heterozygous sites (.hets.tsv output of ModelSegments).", fullName = CopyNumberStandardArgument.ALLELIC_COUNTS_FILE_LONG_NAME, optional = true ) private File inputAllelicCountsFile; @Argument( doc = "Input file containing modeled segments (output of ModelSegments).", fullName = CopyNumberStandardArgument.SEGMENTS_FILE_LONG_NAME ) private File inputModeledSegmentsFile; @Argument( doc = PlottingUtils.SEQUENCE_DICTIONARY_DOC_STRING, fullName = StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, shortName = StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME ) private File inputSequenceDictionaryFile; @Argument( doc = PlottingUtils.MINIMUM_CONTIG_LENGTH_DOC_STRING, fullName = PlottingUtils.MINIMUM_CONTIG_LENGTH_LONG_NAME, minValue = 0, optional = true ) private int minContigLength = PlottingUtils.DEFAULT_MINIMUM_CONTIG_LENGTH; @Argument( doc = PlottingUtils.MAXIMUM_COPY_RATIO_DOC_STRING, fullName = PlottingUtils.MAXIMUM_COPY_RATIO_LONG_NAME, minValue = 0, optional = true ) private double maxCopyRatio = PlottingUtils.DEFAULT_MAXIMUM_COPY_RATIO; @Argument( doc = PlottingUtils.POINT_SIZE_COPY_RATIO_DOC_STRING, fullName = PlottingUtils.POINT_SIZE_COPY_RATIO_LONG_NAME, minValue = 0, optional = true ) private double pointSizeCopyRatio = PlottingUtils.DEFAULT_POINT_SIZE_COPY_RATIO; @Argument( doc = PlottingUtils.POINT_SIZE_ALLELE_FRACTION_DOC_STRING, fullName = PlottingUtils.POINT_SIZE_ALLELE_FRACTION_LONG_NAME, minValue = 0, optional = true ) private double pointSizeAlleleFraction = PlottingUtils.DEFAULT_POINT_SIZE_ALLELE_FRACTION; @Argument( doc = "Prefix for output filenames.", fullName = CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME ) private String outputPrefix; @Argument( doc = "Output directory. This will be created if it does not exist.", fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME ) private File outputDir; //read input files private CopyRatioCollection denoisedCopyRatios; private AllelicCountCollection allelicCounts; private ModeledSegmentCollection modeledSegments; @Override protected Object doWork() { validateArguments(); logger.info("Reading and validating input files..."); denoisedCopyRatios = inputDenoisedCopyRatiosFile == null ? null : new CopyRatioCollection(inputDenoisedCopyRatiosFile); allelicCounts = inputAllelicCountsFile == null ? null : new AllelicCountCollection(inputAllelicCountsFile); modeledSegments = new ModeledSegmentCollection(inputModeledSegmentsFile); final SampleLocatableMetadata metadata = CopyNumberArgumentValidationUtils.getValidatedMetadata( modeledSegments, denoisedCopyRatios, allelicCounts); final String sampleName = metadata.getSampleName(); //perform a basic check on the total number of copy-ratio intervals/allele-fraction sites per contig //(we do not check the number of intervals/sites overlapping each segment, in order to be robust against future //changes to the segmentation that might assign intervals to more than one segment) validateNumPointsPerContig(); //validate sequence dictionaries and load contig names and lengths into a LinkedHashMap final SAMSequenceDictionary sequenceDictionary = metadata.getSequenceDictionary(); final SAMSequenceDictionary sequenceDictionaryToPlot = ReferenceUtils.loadFastaDictionary(inputSequenceDictionaryFile); PlottingUtils.validateSequenceDictionarySubset(sequenceDictionary, sequenceDictionaryToPlot); final Map contigLengthMap = PlottingUtils.getContigLengthMap(sequenceDictionaryToPlot, minContigLength, logger); //check that contigs in input files are present in sequence dictionary and that data points are valid given lengths PlottingUtils.validateContigs(contigLengthMap, denoisedCopyRatios, inputDenoisedCopyRatiosFile, logger); PlottingUtils.validateContigs(contigLengthMap, allelicCounts, inputAllelicCountsFile, logger); PlottingUtils.validateContigs(contigLengthMap, modeledSegments, inputModeledSegmentsFile, logger); final List contigNames = new ArrayList<>(contigLengthMap.keySet()); final List contigLengths = new ArrayList<>(contigLengthMap.values()); final File outputFile = new File(outputDir, outputPrefix + MODELED_SEGMENTS_PLOT_FILE_SUFFIX); logger.info(String.format("Writing plot to %s...", outputFile.getAbsolutePath())); writeModeledSegmentsPlot(sampleName, contigNames, contigLengths, outputFile); logger.info(String.format("%s complete.", getClass().getSimpleName())); return null; } private void validateArguments() { Utils.validateArg(!(inputDenoisedCopyRatiosFile == null && inputAllelicCountsFile == null), "Must provide at least a denoised-copy-ratios file or an allelic-counts file."); CopyNumberArgumentValidationUtils.validateInputs( inputDenoisedCopyRatiosFile, inputAllelicCountsFile, inputModeledSegmentsFile, inputSequenceDictionaryFile); Utils.nonEmpty(outputPrefix); CopyNumberArgumentValidationUtils.validateAndPrepareOutputDirectories(outputDir); } private void validateNumPointsPerContig() { if (inputDenoisedCopyRatiosFile != null) { final Map modeledSegmentsContigToNumPointsMap = modeledSegments.getRecords().stream() .collect(Collectors.groupingBy(ModeledSegment::getContig, LinkedHashMap::new, Collectors.summingInt(ModeledSegment::getNumPointsCopyRatio))); final Map denoisedCopyRatiosContigToNumPointsMap = denoisedCopyRatios.getRecords().stream() .collect(Collectors.groupingBy(CopyRatio::getContig, LinkedHashMap::new, Collectors.summingInt(x -> 1))); Utils.validateArg(modeledSegmentsContigToNumPointsMap.keySet().stream() .allMatch(c -> (!denoisedCopyRatiosContigToNumPointsMap.containsKey(c) && modeledSegmentsContigToNumPointsMap.get(c) == 0) || (denoisedCopyRatiosContigToNumPointsMap.containsKey(c) && modeledSegmentsContigToNumPointsMap.get(c).equals(denoisedCopyRatiosContigToNumPointsMap.get(c)))), "Number of denoised-copy-ratio points in input modeled-segments file is inconsistent with that in input denoised-copy-ratios file."); } if (inputAllelicCountsFile != null) { final Map modeledSegmentsContigToNumPointsMap = modeledSegments.getRecords().stream() .collect(Collectors.groupingBy(ModeledSegment::getContig, LinkedHashMap::new, Collectors.summingInt(ModeledSegment::getNumPointsAlleleFraction))); final Map allelicCountsContigToNumPointsMap = allelicCounts.getRecords().stream() .collect(Collectors.groupingBy(AllelicCount::getContig, LinkedHashMap::new, Collectors.summingInt(x -> 1))); Utils.validateArg(modeledSegmentsContigToNumPointsMap.keySet().stream() .allMatch(c -> (!allelicCountsContigToNumPointsMap.containsKey(c) && modeledSegmentsContigToNumPointsMap.get(c) == 0) || (allelicCountsContigToNumPointsMap.containsKey(c) && modeledSegmentsContigToNumPointsMap.get(c).equals(allelicCountsContigToNumPointsMap.get(c)))), "Number of allelic-count points in input modeled-segments file is inconsistent with that in input heterozygous allelic-counts file."); } } /** * @param sampleName Sample name derived from input files * @param contigNames List containing contig names * @param contigLengths List containing contig lengths (same order as contigNames) */ private void writeModeledSegmentsPlot(final String sampleName, final List contigNames, final List contigLengths, final File outputFile) { final String contigNamesArg = String.join(PlottingUtils.CONTIG_DELIMITER, contigNames); //names separated by delimiter final String contigLengthsArg = contigLengths.stream().map(Object::toString).collect(Collectors.joining(PlottingUtils.CONTIG_DELIMITER)); //lengths separated by delimiter final RScriptExecutor executor = new RScriptExecutor(); //this runs the R statement "source("CNVPlottingLibrary.R")" before the main script runs executor.addScript(new Resource(PlottingUtils.CNV_PLOTTING_R_LIBRARY, PlotModeledSegments.class)); executor.addScript(new Resource(PLOT_MODELED_SEGMENTS_R_SCRIPT, PlotModeledSegments.class)); executor.addArgs( "--sample_name=" + sampleName, "--denoised_copy_ratios_file=" + (inputDenoisedCopyRatiosFile == null ? null : CopyNumberArgumentValidationUtils.getCanonicalPath(inputDenoisedCopyRatiosFile)), "--allelic_counts_file=" + (inputAllelicCountsFile == null ? null : CopyNumberArgumentValidationUtils.getCanonicalPath(inputAllelicCountsFile)), "--modeled_segments_file=" + CopyNumberArgumentValidationUtils.getCanonicalPath(inputModeledSegmentsFile), "--contig_names=" + contigNamesArg, "--contig_lengths=" + contigLengthsArg, "--maximum_copy_ratio=" + maxCopyRatio, "--point_size_copy_ratio=" + pointSizeCopyRatio, "--point_size_allele_fraction=" + pointSizeAlleleFraction, "--output_file=" + CopyNumberArgumentValidationUtils.getCanonicalPath(outputFile)); executor.exec(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy