package org.broadinstitute.hellbender.utils.downsampling;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Sets;
import htsjdk.variant.variantcontext.Allele;
import org.apache.commons.collections4.map.DefaultedMap;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.text.XReadLines;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* The purpose of this set of utilities is to downsample a set of reads to remove contamination.
* It is NOT a general-purpose downsampling mechanism.
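 * <p>Typical use (a minimal sketch; the evidence type {@code GATKRead}, the construction of
 * {@code alleleEvidenceMap}, and the names {@code samples}, {@code logger} and {@code sampleName}
 * are assumptions for illustration, not provided by this class):</p>
 * <pre>{@code
 * DefaultedMap<String, Double> contamination =
 *     AlleleBiasedDownsamplingUtils.loadContaminationFile(new File("contamination.tab"), 0.0, samples, logger);
 * Map<Allele, List<GATKRead>> alleleEvidenceMap = ...; // built by the caller, per sample
 * List<GATKRead> toRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedEvidence(
 *         alleleEvidenceMap, contamination.get(sampleName));
 * }</pre>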
*/
public final class AlleleBiasedDownsamplingUtils {
private AlleleBiasedDownsamplingUtils() {
}
/**
     * Computes an allele-biased version of the allele counts for a given pileup.
*
* @param alleleCounts the allele counts for the original pileup
     * @param numReadsToRemove total number of reads to remove, split across the alleles
     * @return non-null array of target allele counts after removal
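     * <p>Worked example with illustrative numbers: for {@code alleleCounts = {50, 10}} and
     * {@code numReadsToRemove = 6}, the candidates are {@code {44, 10}} (score 10),
     * {@code {47, 7}} (score 7) and {@code {50, 4}} (score 4); the lowest-scoring candidate
     * {@code {50, 4}} wins, i.e. all removals are charged to the likely-contaminant minor allele.</p>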
*/
@VisibleForTesting
static int[] targetAlleleCounts(final int[] alleleCounts, final int numReadsToRemove) {
final int numAlleles = alleleCounts.length;
        int minScore = scoreAlleleCounts(alleleCounts);
        int[] bestAlleleCounts = alleleCounts;
final int numReadsToRemovePerAllele = numReadsToRemove / 2;
for ( int i = 0; i < numAlleles; i++ ) {
for ( int j = i; j < numAlleles; j++ ) {
final int[] newCounts = alleleCounts.clone();
                // split these cases so we don't lose a read to the floor of the integer division by 2
if ( i == j ) {
newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove);
} else {
newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele);
newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele);
}
final int score = scoreAlleleCounts(newCounts);
                if ( score < minScore ) {
                    minScore = score;
                    bestAlleleCounts = newCounts;
}
}
}
        return bestAlleleCounts;
}
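    /**
     * Scores how far the given allele counts are from an ideal genotype; lower is better.
     * For example (illustrative numbers, not from the original source): a perfect het
     * such as {@code {20, 20}} and a perfect hom such as {@code {40, 0}} both score 0,
     * while a contaminated-looking pileup such as {@code {50, 10}} scores 10.
     */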
private static int scoreAlleleCounts(final int[] alleleCounts) {
if ( alleleCounts.length < 2 ) {
return 0;
}
final int[] alleleCountsCopy = alleleCounts.clone();
Arrays.sort(alleleCountsCopy);
        final int maxCount = alleleCountsCopy[alleleCountsCopy.length - 1];
        final int nextBestCount = alleleCountsCopy[alleleCountsCopy.length - 2];
final int remainderCount = (int)MathUtils.sum(alleleCountsCopy) - maxCount - nextBestCount;
        // the score measures distance from an ideal genotype (lower is better):
        // - het case: the top two counts should be equal, with nothing else left over
        // - hom case: everything other than the max count should be zero
return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount));
}
/**
     * Computes the evidence to remove based on allele-biased down-sampling
*
     * @param alleleEvidenceMap map from each allele to the evidence (e.g. reads) supporting it
     * @param contaminationFraction the fraction of the total evidence to remove
     * @return list of evidence TO REMOVE by allele-biased down-sampling
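     * <p>For example (illustrative numbers), with 120 pieces of evidence in total and
     * {@code contaminationFraction = 0.05}, {@code (int) (120 * 0.05) = 6} pieces are removed,
     * allocated across alleles by {@link #targetAlleleCounts}.</p>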
*/
    public static <EVIDENCE> List<EVIDENCE> selectAlleleBiasedEvidence(final Map<Allele, List<EVIDENCE>> alleleEvidenceMap, final double contaminationFraction) {
final int totalEvidence = Utils.nonNull(alleleEvidenceMap).values().stream().mapToInt(list -> list.size()).sum();
//no checks here - done on the public level
final int numEvidenceToRemove = (int) (totalEvidence * contaminationFraction);
// make a listing of allele counts
        final List<Allele> alleles = new ArrayList<>(alleleEvidenceMap.keySet());
alleles.remove(Allele.NO_CALL); // ignore the no-call bin
final int[] alleleCounts = alleles.stream().mapToInt(allele -> alleleEvidenceMap.get(allele).size()).toArray();
final int[] targetAlleleCounts = targetAlleleCounts(alleleCounts, numEvidenceToRemove);
return IntStream.range(0, alleles.size()).filter(i -> alleleCounts[i] > targetAlleleCounts[i])
                .boxed()
.flatMap(i -> downsampleElements(alleleEvidenceMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i]).stream())
.collect(Collectors.toList());
}
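    /**
     * Selects {@code numElementsToRemove} elements of {@code evidence} without replacement
     * and returns them. Returns the whole list when asked to remove at least as many
     * elements as it contains, and an empty list when asked to remove none.
     */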
    private static <EVIDENCE> List<EVIDENCE> downsampleElements(final List<EVIDENCE> evidence, final int numElementsToRemove) {
if ( numElementsToRemove == 0 ) { //remove none
return Collections.emptyList();
} else if ( numElementsToRemove >= evidence.size()) { //remove all
return Collections.unmodifiableList(evidence);
} else {
return Collections.unmodifiableList(Arrays.stream(MathUtils.sampleIndicesWithoutReplacement(evidence.size(), numElementsToRemove))
.mapToObj(evidence::get)
.collect(Collectors.toList()));
}
}
/**
     * Creates a sample-contamination map from a file.
     * The format is tab-separated with no header;
     * each line is: sampleID contaminationFraction
     *
     * @param file tab-separated file with two columns: sample ID and contamination fraction
     * @param defaultContaminationFraction default contamination fraction, used for samples that do not appear in the file
     * @param sampleIDs set of samples of interest (entries for other samples are ignored and logged), or null to disable this filtering
* @param logger for logging output
* @return sample-contamination Map. The returned map is a {@link DefaultedMap} that defaults to the defaultContaminationFraction for unspecified samples
* @throws UserException if there's an IO problem reading the file.
* @throws UserException if the file is malformed
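     * <p>Example (a sketch; the file contents, sample names, and variable names are illustrative):</p>
     * <pre>{@code
     * // contamination.tab (tab-separated):
     * //   NA12878    0.02
     * //   NA12891    0.10
     * Set<String> samples = new HashSet<>(Arrays.asList("NA12878", "NA12891", "NA12892"));
     * DefaultedMap<String, Double> fractions =
     *     loadContaminationFile(new File("contamination.tab"), 0.0, samples, logger);
     * fractions.get("NA12878"); // 0.02, from the file
     * fractions.get("NA12892"); // 0.0, the default: not listed in the file
     * }</pre>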
*/
    public static DefaultedMap<String, Double> loadContaminationFile(final File file, final double defaultContaminationFraction, final Set<String> sampleIDs, final Logger logger) {
        final DefaultedMap<String, Double> sampleContamination = new DefaultedMap<>(defaultContaminationFraction);
        final Set<String> nonSamplesInContaminationFile = new LinkedHashSet<>();
try ( final XReadLines reader = new XReadLines(Utils.nonNull(file).toPath(), true) ){
for (final String line : reader) {
if (line.isEmpty()) {
continue;
}
                final String[] fields = line.split("\t");
if (fields.length != 2){
throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line);
}
if (fields[0].isEmpty() || fields[1].isEmpty()) {
throw new UserException.MalformedFile("Contamination file can not have empty strings in either column. Offending line:\n" + line);
}
final double contamination;
try {
contamination = Double.parseDouble(fields[1]);
} catch (final NumberFormatException e) {
throw new UserException.MalformedFile("Contamination file contains unparsable double in the second field. Offending line: " + line);
}
                final String sampleName = fields[0];
if (sampleContamination.containsKey(sampleName)) {
throw new UserException.MalformedFile("Contamination file contains duplicate entries for input name " + sampleName);
}
if (contamination < 0.0 || contamination > 1.0){
throw new UserException.MalformedFile("Contamination file contains unacceptable contamination value (must be 0<=x<=1): " + line);
}
if (sampleIDs == null || sampleIDs.contains(sampleName)) {
sampleContamination.put(sampleName, contamination);
} else {
nonSamplesInContaminationFile.add(sampleName);
}
}
            // log the samples that were found in the contamination file
if (! sampleContamination.isEmpty()) {
logger.info(String.format("The following samples were found in the Contamination file and will be processed at the contamination level therein: %s", sampleContamination.keySet().toString()));
//output to the user info lines telling which samples are NOT in the Contamination File
if(sampleIDs!=null){
final Set samplesNotInContaminationFile = Sets.difference(sampleIDs, sampleContamination.keySet());
if (! samplesNotInContaminationFile.isEmpty()) {
logger.info(String.format("The following samples were NOT found in the Contamination file and will be processed at the default contamination level: %s", samplesNotInContaminationFile.toString()));
}
}
}
            // log entries in the contamination file that did not match any requested sample ID
            if (! nonSamplesInContaminationFile.isEmpty()) {
                logger.info(String.format("The following entries were found in the Contamination file but did not match any sample ID. They will be ignored: %s", nonSamplesInContaminationFile));
}
return sampleContamination;
        } catch (final IOException e) {
throw new UserException.CouldNotReadInputFile("I/O Error while reading sample-contamination file " + file.getAbsolutePath() + ": " + e.getMessage());
}
}
}