All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.copynumber.plotting.PlottingUtils Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.copynumber.plotting;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.Locatable;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AbstractSampleLocatableCollection;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.param.ParamUtils;

import java.io.File;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * @author Samuel Lee <[email protected]>
 */
final class PlottingUtils {
    static final String CNV_PLOTTING_R_LIBRARY = "CNVPlottingLibrary.R";

    static final String CONTIG_DELIMITER = "CONTIG_DELIMITER";  //used to delimit contig names and lengths passed to the R script

    static final String MINIMUM_CONTIG_LENGTH_LONG_NAME = "minimum-contig-length";
    static final String MAXIMUM_COPY_RATIO_LONG_NAME = "maximum-copy-ratio";
    static final String POINT_SIZE_COPY_RATIO_LONG_NAME = "point-size-copy-ratio";
    static final String POINT_SIZE_ALLELE_FRACTION_LONG_NAME = "point-size-allele-fraction";

    static final int DEFAULT_MINIMUM_CONTIG_LENGTH = 1000000;   //can be used to filter out mitochondrial contigs, unlocalized contigs, etc.
    static final double DEFAULT_MAXIMUM_COPY_RATIO = 4.;
    static final double DEFAULT_POINT_SIZE_COPY_RATIO = 0.2;
    static final double DEFAULT_POINT_SIZE_ALLELE_FRACTION = 0.4;

    static final String SEQUENCE_DICTIONARY_DOC_STRING = "File containing a sequence dictionary, which specifies the contigs to be plotted and their relative lengths. " +
            "The sequence dictionary must be a subset of those contained in other input files. " +
            "Contigs will be plotted in the order given. " +
            "Contig names should not include the string \"" + PlottingUtils.CONTIG_DELIMITER + "\". " +
            "The tool only considers contigs in the given dictionary for plotting, and " +
            "data for contigs absent in the dictionary generate only a warning. In other words, you may " +
            "modify a reference dictionary for use with this tool to include only contigs for which plotting is desired, " +
            "and sort the contigs to the order in which the plots should display the contigs.";

    static final String MINIMUM_CONTIG_LENGTH_DOC_STRING = "Threshold length (in bp) for contigs to be plotted. " +
            "Contigs with lengths less than this threshold will not be plotted. " +
            "This can be used to filter out mitochondrial contigs, unlocalized contigs, etc.";

    static final String MAXIMUM_COPY_RATIO_DOC_STRING = "Maximum copy ratio to be plotted. " +
            "If Infinity, the maximum copy ratio will be automatically determined.";

    static final String POINT_SIZE_COPY_RATIO_DOC_STRING = "Point size to use for plotting copy-ratio points.";

    static final String POINT_SIZE_ALLELE_FRACTION_DOC_STRING = "Point size to use for plotting allele-fraction points.";

    private PlottingUtils() {}

    static Map getContigLengthMap(final SAMSequenceDictionary sequenceDictionary,
                                                   final int minContigLength,
                                                   final Logger logger) {
        Utils.nonNull(sequenceDictionary);
        ParamUtils.isPositiveOrZero(minContigLength, "Minimum contig length must be non-negative.");
        Utils.nonNull(logger);
        Utils.validateArg(sequenceDictionary.getSequences().stream().map(SAMSequenceRecord::getSequenceName).noneMatch(n -> n.contains(CONTIG_DELIMITER)),
                String.format("Contig names cannot contain \"%s\".", CONTIG_DELIMITER));
        final Map contigLengthMap = sequenceDictionary.getSequences().stream()
                .filter(s -> s.getSequenceLength() >= minContigLength)
                .collect(Collectors.toMap(SAMSequenceRecord::getSequenceName, SAMSequenceRecord::getSequenceLength,
                        (c, l) -> {
                            throw new IllegalArgumentException(String.format("Duplicate contig in sequence dictionary: %s", c));
                        },
                        LinkedHashMap::new));
        Utils.validateArg(contigLengthMap.size() > 0,
                "There must be at least one contig above the threshold length in the sequence dictionary.");
        logger.info("Contigs above length threshold: " + contigLengthMap.toString());
        return contigLengthMap;
    }

    //validate contig names and lengths
    static  void validateContigs(final Map contigLengthMap,
                                                      final AbstractSampleLocatableCollection locatableCollection,
                                                      final File file,
                                                      final Logger logger) {
        Utils.nonNull(contigLengthMap);
        Utils.nonNull(logger);
        if (locatableCollection == null) {
            Utils.validateArg(file == null, "File can only be null if collection is also null.");
            return;
        }
        final Set contigNames = contigLengthMap.keySet();
        final Set fileContigNames = locatableCollection.getRecords().stream().map(T::getContig).collect(Collectors.toSet());
        if (!contigNames.containsAll(fileContigNames)) {
            logger.warn(String.format("Contigs present in the file %s are missing from the sequence dictionary and will not be plotted.", file));
        }
        final Map fileContigMaxPositionMap = locatableCollection.getIntervals().stream().filter(i -> contigNames.contains(i.getContig()))
                .collect(Collectors.toMap(SimpleInterval::getContig, SimpleInterval::getEnd, Integer::max));
        fileContigMaxPositionMap.forEach((c, integer) -> Utils.validateArg(integer <= contigLengthMap.get(c),
                String.format("Position present in the file %s exceeds contig length in the sequence dictionary.", file)));
    }

    //validate sequence dictionary contains another as a subset, using only sequence names and lengths
    static void validateSequenceDictionarySubset(final SAMSequenceDictionary sequenceDictionary,
                                                 final SAMSequenceDictionary sequenceDictionarySubset) {
        Utils.nonNull(sequenceDictionary);
        Utils.nonNull(sequenceDictionarySubset);
        final Set sequences = sequenceDictionary.getSequences().stream()
                .map(s -> new SAMSequenceRecord(s.getMd5(), s.getSequenceLength()))
                .collect(Collectors.toCollection(LinkedHashSet::new));
        final Set sequencesSubset = sequenceDictionary.getSequences().stream()
                .map(s -> new SAMSequenceRecord(s.getMd5(), s.getSequenceLength()))
                .collect(Collectors.toCollection(LinkedHashSet::new));
        Utils.validateArg(sequences.containsAll(sequencesSubset),
                String.format("Sequence dictionary (%s) must be a subset of those contained in other input files (%s).",
                        sequencesSubset, sequences));
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy