All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.copynumber.PostprocessGermlineCNVCalls Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.copynumber;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AbstractLocatableCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AbstractRecordCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.BaselineCopyNumberCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyNumberPosteriorDistributionCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.IntegerCopyNumberSegmentCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.LinearCopyRatioCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.NonLocatableDoubleCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.OverlappingIntegerCopyNumberSegmentCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyNumberPosteriorDistribution;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.IntegerCopyNumberSegment;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.IntervalCopyNumberGenotypingData;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.LinearCopyRatio;
import org.broadinstitute.hellbender.tools.copynumber.gcnv.GermlineCNVIntervalVariantComposer;
import org.broadinstitute.hellbender.tools.copynumber.gcnv.GermlineCNVNamingConstants;
import org.broadinstitute.hellbender.tools.copynumber.gcnv.GermlineCNVSegmentVariantComposer;
import org.broadinstitute.hellbender.tools.copynumber.gcnv.IntegerCopyNumberState;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.io.Resource;
import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * Postprocesses the output of {@link GermlineCNVCaller} and generates VCF files as well as a concatenated denoised
 * copy ratio file.
 *
 * 

This tool generates "intervals" and "segments" VCF files that serve complementary purposes. The intervals VCF * file provides a detailed listing of the most likely copy-number call for each genomic interval included in the * call-set, along with call quality, call genotype, and the phred-scaled posterior probability vector for all * integer copy-number states. Given that CNV events often span several consecutive intervals, it may be desirable * to coalesce contiguous intervals with the same copy-number call into a constant copy-number segments. This tool * further performs segmentation and genotyping by calling a dedicated python script in `gcnvkernel`. The segmentation * algorithm further provides various quality metrics for the segment.

* *

For both VCF outputs, the CNV genotype is determined as follows: the alternative allele for a CNV call is * either <DEL> or <DUP>, depending on whether the most likely * copy-number call is below or above the reference copy-number for the contig. The user may specify the * reference copy-number state on autosomal contigs using the argument autosomal-ref-copy-number. * The list of allosomal contigs may also be specified via the argument allosomal-contig. All * undeclared contigs are assumed to be autosomal. The reference copy-number on an allosomal contig is determined * by the sex karyotype of the sample and is set to the pre-determined contig ploidy state fetched from the output * calls of {@link DetermineGermlineContigPloidy}.

* *

Finally, the tool concatenates posterior means for denoised copy ratios from all the call shards produced by * the {@link GermlineCNVCaller} into a single file.

* *

This tool can also take a VCF specifying breakpoints to be used instead of HMM-derived segmentation using posterior * probabilities from the intervals. This functionality enables the calculation of new quality scores using breakpoints * derived from another source, as with JointGermlineCNVSegmentation applied to multiple samples. When using this functionality, * an input-intervals-vcf from the original PostprocessGermlineCNVCalls call without multi-sample * breakpoints should also be provided.

* *

Python environment setup

* *

The computation done by this tool, aside from input data parsing and validation, is performed outside of the Java * Virtual Machine and using the gCNV computational python module, namely {@code gcnvkernel}. It is crucial that * the user has properly set up a python conda environment with {@code gcnvkernel} and its dependencies * installed. If the user intends to run {@link PostprocessGermlineCNVCalls} using one of the official GATK Docker images, * the python environment is already set up. Otherwise, the environment must be created and activated as described in the * main GATK README.md file.

* *

Advanced users may wish to set the PYTENSOR_FLAGS environment variable to override the GATK PyTensor * configuration. For example, by running * PYTENSOR_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk DetermineGermlineContigPloidy ..., users can specify * the PyTensor compilation directory (which is set to $HOME/.pytensor by default). See PyTensor documentation * at * https://pytensor.readthedocs.io/en/latest/library/config.html. *

* *

Required inputs:

*
    *
  • A list of paths to {@link GermlineCNVCaller} calls shards
  • *
  • A list of paths to {@link GermlineCNVCaller} model shards
  • *
  • Path to the output calls of {@link DetermineGermlineContigPloidy}
  • *
  • Index of the sample in the call-set (which is expected to be the same across all shards)
  • *
  • Output path for writing the intervals VCF
  • *
  • Output path for writing the segments VCF
  • *
  • Output path for writing the concatenated denoised copy ratios
  • *
* *

The calls or model shards can be specified in arbitrary order.

* *

Usage examples

* *
 *   gatk PostprocessGermlineCNVCalls \
 *     --calls-shard-path path/to/shard_1-calls
 *     --calls-shard-path path/to/shard_2-calls
 *     --model-shard-path path/to/shard_1-model
 *     --model-shard-path path/to/shard_2-model
 *     --sample-index 0
 *     --autosomal-ref-copy-number 2
 *     --allosomal-contig X
 *     --allosomal-contig Y
 *     --output-genotyped-intervals sample_0_genotyped_intervals.vcf
 *     --output-genotyped-segments sample_0_genotyped_segments.vcf
 *     --output-denoised-copy-ratios sample_0_denoised_copy_ratios.tsv
 * 
* *
 *   gatk PostprocessGermlineCNVCalls \
 *     --calls-shard-path path/to/shard_1-calls
 *     --calls-shard-path path/to/shard_2-calls
 *     --model-shard-path path/to/shard_1-model
 *     --model-shard-path path/to/shard_2-model
 *     --sample-index 0
 *     --autosomal-ref-copy-number 2
 *     --allosomal-contig X
 *     --allosomal-contig Y
 *     --input-intervals-vcf sample_0_genotyped_intervals.vcf
 *     --clustered-breakpoints cohort_breakpoints.vcf
 *     --output-genotyped-intervals sample_0_genotyped_intervals.clustered.vcf
 *     --output-genotyped-segments sample_0_genotyped_segments.clustered.vcf
 *     --output-denoised-copy-ratios sample_0_denoised_copy_ratios.clustered.tsv
 *     -R reference.fasta
 * 
* * @author Mehrtash Babadi <[email protected]> * @author Andrey Smirnov <[email protected]> */ @CommandLineProgramProperties( summary = "Postprocesses the output of GermlineCNVCaller and generates VCFs and denoised copy ratios", oneLineSummary = "Postprocesses the output of GermlineCNVCaller and generates VCFs and denoised copy ratios", programGroup = CopyNumberProgramGroup.class ) @DocumentedFeature public final class PostprocessGermlineCNVCalls extends GATKTool { public static final String SEGMENT_GERMLINE_CNV_CALLS_PYTHON_SCRIPT = "segment_gcnv_calls.py"; public static final String CALLS_SHARD_PATH_LONG_NAME = "calls-shard-path"; public static final String MODEL_SHARD_PATH_LONG_NAME = "model-shard-path"; public static final String CONTIG_PLOIDY_CALLS_LONG_NAME = "contig-ploidy-calls"; public static final String SAMPLE_INDEX_LONG_NAME = "sample-index"; public static final String OUTPUT_INTERVALS_VCF_LONG_NAME = "output-genotyped-intervals"; public static final String OUTPUT_SEGMENTS_VCF_LONG_NAME = "output-genotyped-segments"; public static final String OUTPUT_DENOISED_COPY_RATIOS_LONG_NAME = "output-denoised-copy-ratios"; public static final String AUTOSOMAL_REF_COPY_NUMBER_LONG_NAME = "autosomal-ref-copy-number"; public static final String ALLOSOMAL_CONTIG_LONG_NAME = "allosomal-contig"; public static final String INPUT_INTERVALS_LONG_NAME = "input-intervals-vcf"; public static final String CLUSTERED_FILE_LONG_NAME = "clustered-breakpoints"; public static final String DUPLICATION_QS_THRESHOLD_LONG_NAME = "duplication-qs-threshold"; public static final String HET_DEL_QS_THRESHOLD_LONG_NAME = "het-deletion-qs-threshold"; public static final String HOM_DEL_QS_THRESHOLD_LONG_NAME = "hom-deletion-qs-threshold"; public static final String SITE_FREQUENCY_THRESHOLD_LONG_NAME = "site-frequency-threshold"; @Argument( doc = "List of paths to GermlineCNVCaller call directories.", fullName = CALLS_SHARD_PATH_LONG_NAME, minElements = 1 ) private List inputUnsortedCallsShardPaths; @Argument( doc = "List of paths to GermlineCNVCaller model directories.", fullName = MODEL_SHARD_PATH_LONG_NAME, minElements = 1 ) private List inputUnsortedModelShardPaths; @Argument( doc = "Path to contig-ploidy calls directory (output of DetermineGermlineContigPloidy).", fullName = CONTIG_PLOIDY_CALLS_LONG_NAME ) private File inputContigPloidyCallsPath; @Argument( doc = "Sample index in the call-set (must be contained in all shards).", fullName = SAMPLE_INDEX_LONG_NAME, minValue = 0 ) private int sampleIndex; @Argument( doc = "Reference copy-number on autosomal intervals.", fullName = AUTOSOMAL_REF_COPY_NUMBER_LONG_NAME, minValue = 0 ) private int refAutosomalCopyNumber = 2; @Argument( doc = "Contigs to treat as allosomal (i.e. choose their reference copy-number allele according to " + "the sample karyotype).", fullName = ALLOSOMAL_CONTIG_LONG_NAME, optional = true ) private List allosomalContigList; @Argument( doc = "Input VCF with combined intervals for all samples", fullName = INPUT_INTERVALS_LONG_NAME, optional = true ) private File combinedIntervalsVCFFile = null; @Argument( doc = "VCF with clustered breakpoints and copy number calls for all samples, can be generated with GATK JointGermlineCNVSegmentation tool", fullName = CLUSTERED_FILE_LONG_NAME, optional = true ) private File clusteredBreakpointsVCFFile = null; @Argument( doc = "Output intervals VCF file.", fullName = OUTPUT_INTERVALS_VCF_LONG_NAME ) private File outputIntervalsVCFFile; @Argument( doc = "Output segments VCF file.", fullName = OUTPUT_SEGMENTS_VCF_LONG_NAME ) private File outputSegmentsVCFFile; @Argument( doc = "Output denoised copy ratio file.", fullName = OUTPUT_DENOISED_COPY_RATIOS_LONG_NAME ) private File outputDenoisedCopyRatioFile; @Argument( doc = "Filter out heterozygous deletions with quality lower than this.", fullName = HET_DEL_QS_THRESHOLD_LONG_NAME, optional = true, minValue = 0 ) private int hetDelQSThreshold = 100; @Argument( doc = "Filter out homozygous deletions with quality lower than this.", fullName = HOM_DEL_QS_THRESHOLD_LONG_NAME, optional = true, minValue = 0 ) private int homDelQSThreshold = 400; @Argument( doc = "Filter out duplications with quality lower than this.", fullName = DUPLICATION_QS_THRESHOLD_LONG_NAME, optional = true, minValue = 0 ) private int dupeQSThreshold = 50; @Argument( doc = "Filter out variants with site frequency higher than this.", fullName = SITE_FREQUENCY_THRESHOLD_LONG_NAME, optional = true, minValue = 0 ) private double siteFrequencyThreshold = 0.01; /** * A list of {@link SimpleIntervalCollection} for each shard */ private List sortedIntervalCollections; /** * Sequence dictionary that should be equivalent in each shard */ private SAMSequenceDictionary sequenceDictionary; /** * The sample name corresponding to the provided sample index */ private String sampleName; /** * Number of shards */ private int numShards; /** * Reference integer copy-number state for autosomal intervals */ private IntegerCopyNumberState refAutosomalIntegerCopyNumberState; /** * Set of allosomal contigs */ private Set allosomalContigSet; /** * Intervals extracted from call shards, unsorted */ private List unsortedIntervalCollectionsFromCalls; /** * Intervals extracted from model shards, unsorted */ private List unsortedIntervalCollectionsFromModels; /** * Call shard directories put in correct order */ private List sortedCallsShardPaths; /** * Model shard directories put in correct order */ private List sortedModelShardPaths; @Override public void onStartup() { super.onStartup(); /* check for successful import of gcnvkernel */ PythonScriptExecutor.checkPythonEnvironmentForPackage("gcnvkernel"); } /** * Inputs to this tool are directories, not files, so we need a custom method to find an appropriate file inside the * directories to pull a dictionary out of * @return a SAM sequence dictionary that is consistent for all shards */ @Override public SAMSequenceDictionary getBestAvailableSequenceDictionary() { if (getMasterSequenceDictionary() != null) { sequenceDictionary = getMasterSequenceDictionary(); return sequenceDictionary; } final List unsortedIntervalCollectionsFromCalls = getIntervalCollectionsFromPaths(inputUnsortedCallsShardPaths); final List unsortedIntervalCollectionsFromModels = getIntervalCollectionsFromPaths(inputUnsortedModelShardPaths); /* assert that all shards have the same SAM sequence dictionary */ sequenceDictionary = unsortedIntervalCollectionsFromCalls.get(0) .getMetadata().getSequenceDictionary(); Utils.validateArg(unsortedIntervalCollectionsFromCalls.stream() .map(SimpleIntervalCollection::getMetadata) .map(LocatableMetadata::getSequenceDictionary) .allMatch(shardSAMSequenceDictionary -> shardSAMSequenceDictionary.equals(sequenceDictionary)), "The SAM sequence dictionary is not the same for all of the call shards."); Utils.validateArg(unsortedIntervalCollectionsFromModels.stream() .map(SimpleIntervalCollection::getMetadata) .map(LocatableMetadata::getSequenceDictionary) .allMatch(shardSAMSequenceDictionary -> shardSAMSequenceDictionary.equals(sequenceDictionary)), "The SAM sequence dictionary is either not the same for all of the model shards, " + "or is different from the SAM sequence dictionary of calls shards."); return sequenceDictionary; } /** * Performs various validations on input arguments. Since many of these validations requires loading and parsing * reusable data, we store them as global variables (shard interval lists, sample name, etc). */ @Override public void onTraversalStart() { validateArguments(); numShards = inputUnsortedCallsShardPaths.size(); sequenceDictionary = getBestAvailableSequenceDictionary(); /* get the correct shard sort order and sort all collections */ final List sortedCallShardsOrder = AbstractLocatableCollection.getShardedCollectionSortOrder( getUnsortedIntervalCollectionsFromCalls()); final List sortedModelShardsOrder = AbstractLocatableCollection.getShardedCollectionSortOrder( getUnsortedIntervalCollectionsFromModels()); sortedCallsShardPaths = sortedCallShardsOrder.stream() .map(inputUnsortedCallsShardPaths::get) .collect(Collectors.toList()); sortedModelShardPaths = sortedModelShardsOrder.stream() .map(inputUnsortedModelShardPaths::get) .collect(Collectors.toList()); final List sortedIntervalCollectionsFromCalls = sortedCallShardsOrder.stream() .map(getUnsortedIntervalCollectionsFromCalls()::get) .collect(Collectors.toList()); final List sortedIntervalCollectionsFromModels = sortedModelShardsOrder.stream() .map(getUnsortedIntervalCollectionsFromModels()::get) .collect(Collectors.toList()); Utils.validateArg(sortedIntervalCollectionsFromCalls.equals(sortedIntervalCollectionsFromModels), "The interval lists found in model and call shards do not match. Make sure that the calls and model " + "paths are provided in matching order."); sortedIntervalCollections = sortedIntervalCollectionsFromCalls; checkForSingletonInterval(sortedIntervalCollections); /* assert that allosomal contigs are contained in the SAM sequence dictionary */ final Set allContigs = sequenceDictionary.getSequences().stream() .map(SAMSequenceRecord::getSequenceName) .collect(Collectors.toSet()); allosomalContigSet = new HashSet<>(allosomalContigList); if (allosomalContigSet.isEmpty()) { logger.warn(String.format("Allosomal contigs were not specified; setting ref copy-number allele " + "to (%d) for all intervals.", refAutosomalCopyNumber)); } else { Utils.validateArg(allContigs.containsAll(allosomalContigSet), String.format( "The specified allosomal contigs must be contained in the SAM sequence dictionary of the " + "call-set (specified allosomal contigs: %s, all contigs: %s)", allosomalContigSet.stream().collect(Collectors.joining(", ", "[", "]")), allContigs.stream().collect(Collectors.joining(", ", "[", "]")))); } /* get sample name from the first shard and assert that all shards have the same sample name */ sampleName = getShardSampleName(0); Utils.validate(IntStream.range(1, numShards) .mapToObj(this::getShardSampleName) .allMatch(shardSampleName -> shardSampleName.equals(sampleName)), "The sample name is not the same for all of the shards."); refAutosomalIntegerCopyNumberState = new IntegerCopyNumberState(refAutosomalCopyNumber); } private void validateArguments() { Utils.validateArg(inputUnsortedCallsShardPaths.size() == inputUnsortedModelShardPaths.size(), "The number of input call shards must match the number of input model shards."); inputUnsortedCallsShardPaths.forEach(CopyNumberArgumentValidationUtils::validateInputs); inputUnsortedModelShardPaths.forEach(CopyNumberArgumentValidationUtils::validateInputs); CopyNumberArgumentValidationUtils.validateInputs(inputContigPloidyCallsPath); CopyNumberArgumentValidationUtils.validateOutputFiles( outputIntervalsVCFFile, outputSegmentsVCFFile, outputDenoisedCopyRatioFile); } @Override public void traverse() {} // no traversal for this tool @Override public Object onTraversalSuccess() { generateIntervalsVCFFileFromAllShards(); generateSegmentsVCFFileFromAllShards(); concatenateDenoisedCopyRatioFiles(); logger.info(String.format("%s complete.", getClass().getSimpleName())); return null; } private void generateIntervalsVCFFileFromAllShards() { logger.info("Generating intervals VCF file..."); final VariantContextWriter intervalsVCFWriter = createVCFWriter(outputIntervalsVCFFile); final GermlineCNVIntervalVariantComposer germlineCNVIntervalVariantComposer = new GermlineCNVIntervalVariantComposer(intervalsVCFWriter, sampleName, refAutosomalIntegerCopyNumberState, allosomalContigSet); germlineCNVIntervalVariantComposer.composeVariantContextHeader(sequenceDictionary, getDefaultToolVCFHeaderLines()); logger.info(String.format("Writing intervals VCF file to %s...", outputIntervalsVCFFile.getAbsolutePath())); for (int shardIndex = 0; shardIndex < numShards; shardIndex++) { logger.info(String.format("Analyzing shard %d / %d...", shardIndex + 1, numShards)); germlineCNVIntervalVariantComposer.writeAll(getShardIntervalCopyNumberPosteriorData(shardIndex)); } intervalsVCFWriter.close(); } private void generateSegmentsVCFFileFromAllShards() { logger.info("Generating segments..."); /* perform segmentation */ final File pythonScriptOutputPath = IOUtils.createTempDir("gcnv-segmented-calls"); final boolean pythonScriptSucceeded = executeSegmentGermlineCNVCallsPythonScript( sampleIndex, inputContigPloidyCallsPath, sortedCallsShardPaths, sortedModelShardPaths, combinedIntervalsVCFFile, clusteredBreakpointsVCFFile, pythonScriptOutputPath); if (!pythonScriptSucceeded) { throw new UserException("Python return code was non-zero."); } /* parse segments */ logger.info("Parsing Python output..."); final File copyNumberSegmentsFile = getCopyNumberSegmentsFile(pythonScriptOutputPath, sampleIndex); final List records; //if we supply a breakpoints file, then allow overlapping segments final AbstractRecordCollection integerCopyNumberSegmentCollection; final String sampleNameFromSegmentCollection; if (clusteredBreakpointsVCFFile == null) { integerCopyNumberSegmentCollection = new IntegerCopyNumberSegmentCollection(copyNumberSegmentsFile); sampleNameFromSegmentCollection= integerCopyNumberSegmentCollection .getMetadata().getSampleName(); } else { integerCopyNumberSegmentCollection = new OverlappingIntegerCopyNumberSegmentCollection(copyNumberSegmentsFile); sampleNameFromSegmentCollection = integerCopyNumberSegmentCollection .getMetadata().getSampleName(); } Utils.validate(sampleNameFromSegmentCollection.equals(sampleName), String.format("Sample name found in the header of copy-number segments file is " + "different from the expected sample name (found: %s, expected: %s).", sampleNameFromSegmentCollection, sampleName)); records = integerCopyNumberSegmentCollection.getRecords(); /* write variants */ logger.info(String.format("Writing segments VCF file to %s...", outputSegmentsVCFFile.getAbsolutePath())); final VariantContextWriter segmentsVCFWriter = createVCFWriter(outputSegmentsVCFFile); final GermlineCNVSegmentVariantComposer germlineCNVSegmentVariantComposer = new GermlineCNVSegmentVariantComposer(segmentsVCFWriter, sampleName, refAutosomalIntegerCopyNumberState, allosomalContigSet, referenceArguments.getReferenceSpecifier() == null ? null : ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier()), dupeQSThreshold, hetDelQSThreshold, homDelQSThreshold, siteFrequencyThreshold, clusteredBreakpointsVCFFile); germlineCNVSegmentVariantComposer.composeVariantContextHeader(sequenceDictionary, getDefaultToolVCFHeaderLines()); germlineCNVSegmentVariantComposer.writeAll(records); segmentsVCFWriter.close(); } private void concatenateDenoisedCopyRatioFiles() { logger.info("Generating denoised copy ratios..."); final List concatenatedIntervals = new ArrayList<>(); final List concatenatedDenoisedCopyRatioValues = new ArrayList<>(); /* Read in and concatenate all denoised copy ratio files into one list */ for (int shardIndex = 0; shardIndex < numShards; shardIndex++) { final File shardRootDirectory = sortedCallsShardPaths.get(shardIndex); final File denoisedCopyRatioFile = getSampleDenoisedCopyRatioFile(shardRootDirectory, sampleIndex); final NonLocatableDoubleCollection shardNonLocatableLinearCopyRatioCollectionForShard = new NonLocatableDoubleCollection(denoisedCopyRatioFile); final List shardIntervals = sortedIntervalCollections.get(shardIndex).getIntervals(); final String sampleNameFromDenoisedCopyRatioFile = shardNonLocatableLinearCopyRatioCollectionForShard .getMetadata().getSampleName(); Utils.validate(sampleNameFromDenoisedCopyRatioFile.equals(sampleName), String.format("Sample name found in the header of denoised copy ratio file for shard %d " + "is different from the expected sample name (found: %s, expected: %s).", shardIndex, sampleNameFromDenoisedCopyRatioFile, sampleName)); final List shardDenoisedCopyRatioRecords = shardNonLocatableLinearCopyRatioCollectionForShard.getRecords(); Utils.validate(shardIntervals.size() == shardDenoisedCopyRatioRecords.size(), String.format("The number of entries in denoised copy ratio file for shard %d does " + "not match the number of entries in the shard interval list (copy ratio list size: %d, " + "interval list size: %d)", shardIndex, shardDenoisedCopyRatioRecords.size(), shardIntervals.size())); concatenatedIntervals.addAll(shardIntervals); concatenatedDenoisedCopyRatioValues.addAll(shardDenoisedCopyRatioRecords); } /* Attach the corresponding intervals */ final List linearCopyRatios = IntStream.range(0, concatenatedIntervals.size()) .mapToObj(intervalIndex -> new LinearCopyRatio( concatenatedIntervals.get(intervalIndex), concatenatedDenoisedCopyRatioValues.get(intervalIndex))) .collect(Collectors.toList()); final SimpleSampleLocatableMetadata metadata = new SimpleSampleLocatableMetadata(sampleName, sequenceDictionary); /* Make a locatable collection of denoised copy ratios and write it to file */ final LinearCopyRatioCollection linearCopyRatioCollection = new LinearCopyRatioCollection(metadata, linearCopyRatios); logger.info(String.format("Writing denoised copy ratios to %s...", outputDenoisedCopyRatioFile.getAbsolutePath())); linearCopyRatioCollection.write(outputDenoisedCopyRatioFile); } /** * Retrieves intervals either from a `gcnvkernel` model output path or the root directory of calls path. */ private static List getIntervalCollectionsFromPaths(final List shardsPathList) { return shardsPathList.stream() .map(shardPath -> new SimpleIntervalCollection( getIntervalFileFromShardDirectory(shardPath))) .collect(Collectors.toList()); } /** * Extracts sample name from a designated text file in the sample calls directory. */ private String getShardSampleName(final int shardIndex) { final File shardSampleNameTextFile = getSampleNameTextFile(sortedCallsShardPaths.get(shardIndex), sampleIndex); try { final BufferedReader reader = new BufferedReader(new FileReader(shardSampleNameTextFile)); return reader.readLine(); } catch (final IOException ex) { throw new UserException.BadInput(String.format("Could not read the sample name text file at %s.", shardSampleNameTextFile.getAbsolutePath())); } } /** * Returns a list of {@link IntervalCopyNumberGenotypingData} for {@link #sampleIndex} from a * single calls shard. */ private List getShardIntervalCopyNumberPosteriorData(final int shardIndex) { /* read copy-number posteriors for the shard */ final File shardRootDirectory = sortedCallsShardPaths.get(shardIndex); final File sampleCopyNumberPosteriorFile = getSampleCopyNumberPosteriorFile(shardRootDirectory, sampleIndex); final CopyNumberPosteriorDistributionCollection copyNumberPosteriorDistributionCollection = new CopyNumberPosteriorDistributionCollection(sampleCopyNumberPosteriorFile); final String sampleNameFromCopyNumberPosteriorFile = copyNumberPosteriorDistributionCollection .getMetadata().getSampleName(); Utils.validate(sampleNameFromCopyNumberPosteriorFile.equals(sampleName), String.format("Sample name found in the header of copy-number posterior file for shard %d " + "different from the expected sample name (found: %s, expected: %s).", shardIndex, sampleNameFromCopyNumberPosteriorFile, sampleName)); /* read copy-number baselines for the shard */ final File sampleBaselineCopyNumberFile = getSampleBaselineCopyNumberFile(shardRootDirectory, sampleIndex); final BaselineCopyNumberCollection baselineCopyNumberCollection = new BaselineCopyNumberCollection(sampleBaselineCopyNumberFile); final String sampleNameFromBaselineCopyNumberFile = baselineCopyNumberCollection .getMetadata().getSampleName(); Utils.validate(sampleNameFromBaselineCopyNumberFile.equals(sampleName), String.format("Sample name found in the header of baseline copy-number file for shard %d " + "different from the expected sample name (found: %s, expected: %s).", shardIndex, sampleNameFromBaselineCopyNumberFile, sampleName)); /* attach the intervals to make locatable posteriors */ final List shardIntervals = sortedIntervalCollections.get(shardIndex).getIntervals(); final List copyNumberPosteriorDistributions = copyNumberPosteriorDistributionCollection.getRecords(); Utils.validate(shardIntervals.size() == copyNumberPosteriorDistributions.size(), String.format("The number of entries in the copy-number posterior file for shard %d does " + "not match the number of entries in the shard interval list (posterior list size: %d, " + "interval list size: %d)", shardIndex, copyNumberPosteriorDistributions.size(), shardIntervals.size())); final List baselineCopyNumbers = baselineCopyNumberCollection.getRecords(); Utils.validate(shardIntervals.size() == baselineCopyNumbers.size(), String.format("The number of entries in the baseline copy-number file for shard %d does " + "not match the number of entries in the shard interval list (baseline copy-number " + "list size: %d, interval list size: %d)", shardIndex, baselineCopyNumbers.size(), shardIntervals.size())); return IntStream.range(0, copyNumberPosteriorDistributionCollection.size()) .mapToObj(intervalIndex -> new IntervalCopyNumberGenotypingData( shardIntervals.get(intervalIndex), copyNumberPosteriorDistributions.get(intervalIndex), baselineCopyNumbers.get(intervalIndex))) .collect(Collectors.toList()); } /** * Gets the copy-number posterior file from the shard directory for a given sample index. */ private static File getSampleCopyNumberPosteriorFile(final File callShardPath, final int sampleIndex) { return Paths.get( callShardPath.getAbsolutePath(), GermlineCNVNamingConstants.SAMPLE_PREFIX + sampleIndex, GermlineCNVNamingConstants.COPY_NUMBER_POSTERIOR_FILE_NAME).toFile(); } /** * Gets the baseline copy-number file from the shard directory for a given sample index. */ private static File getSampleBaselineCopyNumberFile(final File callShardPath, final int sampleIndex) { return Paths.get( callShardPath.getAbsolutePath(), GermlineCNVNamingConstants.SAMPLE_PREFIX + sampleIndex, GermlineCNVNamingConstants.BASELINE_COPY_NUMBER_FILE_NAME).toFile(); } /** * Gets the sample name text file from the shard directory for a given sample index. */ private static File getSampleNameTextFile(final File callShardRootPath, final int sampleIndex) { return Paths.get( callShardRootPath.getAbsolutePath(), GermlineCNVNamingConstants.SAMPLE_PREFIX + sampleIndex, GermlineCNVNamingConstants.SAMPLE_NAME_TXT_FILE).toFile(); } /** * Gets the denoised copy ratio file from the shard directory given sample index. */ private static File getSampleDenoisedCopyRatioFile(final File callShardPath, final int sampleIndex) { return Paths.get( callShardPath.getAbsolutePath(), GermlineCNVNamingConstants.SAMPLE_PREFIX + sampleIndex, GermlineCNVNamingConstants.DENOISED_COPY_RATIO_MEAN_FILE_NAME).toFile(); } /** * Gets the interval list file from the shard directory. */ private static File getIntervalFileFromShardDirectory(final File shardPath) { return new File(shardPath, GermlineCNVNamingConstants.INTERVAL_LIST_FILE_NAME); } /** * Runs the segmenter python scripts and returns the exit code (true for a successful python call). */ private static boolean executeSegmentGermlineCNVCallsPythonScript(final int sampleIndex, final File contigPloidyCallsPath, final List sortedCallDirectories, final List sortedModelDirectories, final File combinedIntervalsVCFFile, final File clusteredBreakpointsVCFFile, final File pythonScriptOutputPath) { /* the inputs to this method are expected to be previously validated */ try { Utils.nonNull(contigPloidyCallsPath); Utils.nonNull(sortedCallDirectories); Utils.nonNull(sortedModelDirectories); Utils.nonNull(pythonScriptOutputPath); sortedCallDirectories.forEach(Utils::nonNull); sortedModelDirectories.forEach(Utils::nonNull); } catch (final IllegalArgumentException ex) { throw new GATKException.ShouldNeverReachHereException(ex); } final PythonScriptExecutor executor = new PythonScriptExecutor(true); final List arguments = new ArrayList<>(); arguments.add("--ploidy_calls_path"); arguments.add(CopyNumberArgumentValidationUtils.getCanonicalPath(contigPloidyCallsPath)); arguments.add("--model_shards"); arguments.addAll(sortedModelDirectories.stream().map(CopyNumberArgumentValidationUtils::getCanonicalPath).collect(Collectors.toList())); arguments.add("--calls_shards"); arguments.addAll(sortedCallDirectories.stream().map(CopyNumberArgumentValidationUtils::getCanonicalPath).collect(Collectors.toList())); arguments.add("--output_path"); arguments.add(CopyNumberArgumentValidationUtils.getCanonicalPath(pythonScriptOutputPath)); arguments.add("--sample_index"); arguments.add(String.valueOf(sampleIndex)); if (combinedIntervalsVCFFile != null) { arguments.add("--intervals_vcf"); arguments.add(combinedIntervalsVCFFile.toString()); } if (clusteredBreakpointsVCFFile != null) { arguments.add("--clustered_vcf"); arguments.add(clusteredBreakpointsVCFFile.toString()); } return executor.executeScript( new Resource(SEGMENT_GERMLINE_CNV_CALLS_PYTHON_SCRIPT, PostprocessGermlineCNVCalls.class), null, arguments); } /** * Gets the copy-number segments file from the output of `segment_gcnv_calls.py` for a given sample index. */ private static File getCopyNumberSegmentsFile(final File pythonSegmenterOutputPath, final int sampleIndex) { return Paths.get( pythonSegmenterOutputPath.getAbsolutePath(), GermlineCNVNamingConstants.SAMPLE_PREFIX + sampleIndex, GermlineCNVNamingConstants.COPY_NUMBER_SEGMENTS_FILE_NAME).toFile(); } /** * Get intervals from each call shard in the provided (potentially arbitrary) order * @return unsorted intervals */ private List getUnsortedIntervalCollectionsFromCalls() { if (unsortedIntervalCollectionsFromCalls == null) { unsortedIntervalCollectionsFromCalls = getIntervalCollectionsFromPaths(inputUnsortedCallsShardPaths); } return unsortedIntervalCollectionsFromCalls; } /** * Get intervals from each model shard in the provided (potentially arbitrary) order * @return unsorted intervals */ private List getUnsortedIntervalCollectionsFromModels() { if (unsortedIntervalCollectionsFromModels == null) { unsortedIntervalCollectionsFromModels = getIntervalCollectionsFromPaths(inputUnsortedModelShardPaths); } return unsortedIntervalCollectionsFromModels; } /** * Validate that the concatenation of the sharded interval lists does not have singleton intervals, i.e. intervals * that are the only ones on their corresponding contigs. */ private void checkForSingletonInterval(final List intervalCollections){ intervalCollections.stream() .flatMap(list -> list.getIntervals().stream()) .collect(Collectors.groupingBy(SimpleInterval::getContig, Collectors.counting())) .entrySet().stream() .forEach(entry -> { if (entry.getValue() == 1) { throw new IllegalArgumentException( String.format("Records contain a singleton interval on contig (%s)." + " Please run FilterIntervals tool first.", entry.getKey())); } }); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy