picard.fingerprint.CalculateFingerprintMetrics Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of picard Show documentation
Show all versions of picard Show documentation
A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF.
/*
* The MIT License
*
* Copyright (c) 2019 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.fingerprint;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.util.MathUtils;
import org.apache.commons.math3.random.MersenneTwister;
import org.apache.commons.math3.random.RandomDataGenerator;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.stat.inference.ChiSquareTest;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
import picard.util.MathUtil;
import java.io.File;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static picard.fingerprint.HaplotypeProbabilities.Genotype.HET_ALLELE12;
import static picard.fingerprint.HaplotypeProbabilities.Genotype.HOM_ALLELE1;
import static picard.fingerprint.HaplotypeProbabilities.Genotype.HOM_ALLELE2;
/**
* Calculates various metrics on a sample fingerprint, indicating whether the fingerprint satisfies the assumptions we have.
* For example, if too many sites are heterozygous, that would get flagged.
*
* @author Yossi Farjoun
*/
@CommandLineProgramProperties(
summary = CalculateFingerprintMetrics.USAGE_SUMMARY + CalculateFingerprintMetrics.USAGE_DETAILS,
oneLineSummary = CalculateFingerprintMetrics.USAGE_SUMMARY,
programGroup = DiagnosticsAndQCProgramGroup.class
)
@DocumentedFeature
public class CalculateFingerprintMetrics extends CommandLineProgram {
static final String USAGE_DETAILS =
"This tools collects various statistics that pertain to a single fingerprint (not the comparison, or " +
"\'fingerprinting\' of two distinct samples) and reports the results in a metrics file. " +
"" +
"The statistics collected are p-values, where the null-hypothesis is that the fingerprint is collected from " +
"a non-contaminated, diploid human, whose genotypes are modelled by the probabilities given in the " +
"HAPLOTYPE_MAP file." +
"
" +
"Please see the FingerprintMetrics " +
"definitions " +
"for a complete description of the metrics produced by this tool.
" +
"
"+
"" +
"
Example
\n" +
"\" +\n" +
"java -jar picard.jar CalculateFingerprintMetrics \\\n" +
" INPUT=sample.bam \\\n" +
" HAPLOTYPE_MAP=fingerprinting_haplotype_database.txt \\\n" +
" OUTPUT=sample.fingerprint_metrics\n" +
"
\n";
static final String USAGE_SUMMARY="Calculate statistics on fingerprints, checking their viability";
@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "One or more input files (SAM/BAM/CRAM or VCF).")
public List INPUT;
@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output file to write (Metrics).")
public File OUTPUT;
@Argument(shortName = "H", doc = "The file lists a set of SNPs, optionally arranged in high-LD blocks, to be used for fingerprinting. See " +
"https://software.broadinstitute.org/gatk/documentation/article?id=9526 for details.")
public File HAPLOTYPE_MAP;
@Argument(doc = "Specificies which data-type should be used as the basic unit. Fingerprints from readgroups can " +
"be \"rolled-up\" to the LIBRARY, SAMPLE, or FILE level before being used." +
" Fingerprints from VCF can be be examined by SAMPLE or FILE.")
public CrosscheckMetric.DataType CALCULATE_BY = CrosscheckMetric.DataType.READGROUP;
@Argument(doc="LOD score threshold for considering a genotype to be definitive.")
public final double GENOTYPE_LOD_THRESHOLD = 3;
@Argument(doc="Number of randomization trials for calculating the DISCRIMINATORY_POWER metric.")
public final int NUMBER_OF_SAMPLING = 100;
// a fixed random seed for reproducibility;
private static final int RANDOM_SEED = 42;
private static final ChiSquareTest chiSquareTest = new ChiSquareTest();
@Override
protected int doWork() {
final List inputPaths = IOUtil.getPaths(INPUT);
IOUtil.assertPathsAreReadable(inputPaths);
IOUtil.assertFileIsReadable(HAPLOTYPE_MAP);
IOUtil.assertFileIsWritable(OUTPUT);
final FingerprintChecker checker = new FingerprintChecker(HAPLOTYPE_MAP);
final MetricsFile metricsFile = getMetricsFile();
final Map fpMap = checker.fingerprintFiles(inputPaths, 1, 1, TimeUnit.DAYS);
final Map mergedFpMap = Fingerprint.mergeFingerprintsBy(fpMap,Fingerprint.getFingerprintIdDetailsStringFunction(CALCULATE_BY));
metricsFile.addAllMetrics(mergedFpMap.values().stream().map(this::getFingerprintMetrics).collect(Collectors.toList()));
metricsFile.write(OUTPUT);
return 0;
}
public FingerprintMetrics getFingerprintMetrics(final Fingerprint fingerprint) {
final RandomGenerator rg = new MersenneTwister(RANDOM_SEED);
//get expectation of counts and expected fractions
double[] genotypeCounts = fingerprint.values().stream()
.map(HaplotypeProbabilities::getPosteriorProbabilities)
.reduce(MathUtil::sum).orElseGet(() -> new double[]{0, 0, 0});
double[] expectedRatios = fingerprint.values().stream()
.map(HaplotypeProbabilities::getPriorProbablities)
.reduce(MathUtil::sum).orElseGet(() -> new double[]{0, 0, 0});
final double[] homVsHetExpect = new double[]{expectedRatios[HOM_ALLELE1.v] + expectedRatios[HOM_ALLELE2.v], expectedRatios[HET_ALLELE12.v]};
final double[] homVsHetCounts = new double[]{genotypeCounts[HOM_ALLELE1.v] + genotypeCounts[HOM_ALLELE2.v], genotypeCounts[HET_ALLELE12.v]};
final double[] homAllele1VsAllele2Expect = new double[]{expectedRatios[HOM_ALLELE1.v], expectedRatios[HOM_ALLELE2.v]};
final double[] homAllele1VsAllele2Counts = new double[]{genotypeCounts[HOM_ALLELE1.v], genotypeCounts[HOM_ALLELE2.v]};
final long[] roundedHom1vsHom2Counts = MathUtil.round(homAllele1VsAllele2Counts);
final long[] roundedHetVsHomCounts = MathUtil.round(homVsHetCounts);
final long[] roundedGenotypeCounts = MathUtil.round(genotypeCounts);
final FingerprintMetrics fingerprintMetrics = new FingerprintMetrics();
fingerprintMetrics.SAMPLE_ALIAS = fingerprint.getSample();
fingerprintMetrics.SOURCE = Optional.ofNullable(fingerprint.getSource()).map(p -> p.toUri().toString()).orElse("");
fingerprintMetrics.INFO = fingerprint.getInfo();
fingerprintMetrics.HAPLOTYPES = fingerprint.values().size();
fingerprintMetrics.HAPLOTYPES_WITH_EVIDENCE = fingerprint.values().stream().filter(HaplotypeProbabilities::hasEvidence).count();
fingerprintMetrics.DEFINITE_GENOTYPES = fingerprint.values().stream().filter(h -> h.getLodMostProbableGenotype() >= GENOTYPE_LOD_THRESHOLD).count();
fingerprintMetrics.NUM_HOM_ALLELE1 = roundedGenotypeCounts[HOM_ALLELE1.v];
fingerprintMetrics.NUM_HOM_ALLELE2 = roundedGenotypeCounts[HOM_ALLELE2.v];
fingerprintMetrics.NUM_HOM_ANY = roundedHetVsHomCounts[1];
fingerprintMetrics.NUM_HET = roundedGenotypeCounts[HET_ALLELE12.v];
fingerprintMetrics.EXPECTED_HOM_ALLELE1 = expectedRatios[HOM_ALLELE1.v];
fingerprintMetrics.EXPECTED_HOM_ALLELE2 = expectedRatios[HOM_ALLELE2.v];
fingerprintMetrics.EXPECTED_HET = expectedRatios[HET_ALLELE12.v];
// calculate p-value
final double chiSquaredTest = chiSquareTest.chiSquareTest(expectedRatios, roundedGenotypeCounts);
fingerprintMetrics.CHI_SQUARED_PVALUE = chiSquaredTest;
fingerprintMetrics.LOG10_CHI_SQUARED_PVALUE = Math.log10(chiSquaredTest);
// calculate LOD (cross-entropy)
fingerprintMetrics.CROSS_ENTROPY_LOD = MathUtil.klDivergance(genotypeCounts, expectedRatios);
// calculate p-value
final double hetsChiSquaredTest = chiSquareTest.chiSquareTest(homVsHetExpect, roundedHetVsHomCounts);
fingerprintMetrics.HET_CHI_SQUARED_PVALUE = hetsChiSquaredTest;
fingerprintMetrics.LOG10_HET_CHI_SQUARED_PVALUE = Math.log10(hetsChiSquaredTest);
// calculate LOD (cross-entropy)
fingerprintMetrics.HET_CROSS_ENTROPY_LOD = MathUtil.klDivergance(homVsHetCounts, homVsHetExpect);
// calculate p-value
final double homsChiSquaredTest = chiSquareTest.chiSquareTest(homAllele1VsAllele2Expect, roundedHom1vsHom2Counts);
fingerprintMetrics.HOM_CHI_SQUARED_PVALUE = homsChiSquaredTest;
fingerprintMetrics.LOG10_HOM_CHI_SQUARED_PVALUE = Math.log10(homsChiSquaredTest);
// calculate LOD (cross-entropy)
fingerprintMetrics.HOM_CROSS_ENTROPY_LOD = MathUtil.klDivergance(homAllele1VsAllele2Counts, homAllele1VsAllele2Expect);;
final MathUtils.RunningStat randomTrials = new MathUtils.RunningStat();
// get a bunch of random permutations of the fingerprint and compare to self
IntStream.range(0, NUMBER_OF_SAMPLING).forEach(i ->
randomTrials.push(FingerprintChecker.calculateMatchResults(fingerprint, randomizeFingerprint(fingerprint, rg)).getLOD()));
fingerprintMetrics.LOD_SELF_CHECK = FingerprintChecker.calculateMatchResults(fingerprint, fingerprint).getLOD();
fingerprintMetrics.DISCRIMINATORY_POWER = fingerprintMetrics.LOD_SELF_CHECK - randomTrials.mean();
return fingerprintMetrics;
}
/** Creates a new fingerprint from the current one by randomizing the probabilities within each haplotype
*
*/
private static Fingerprint randomizeFingerprint(final Fingerprint fingerprint, final RandomGenerator rg) {
final Fingerprint retVal = new Fingerprint(null, null, null);
final RandomDataGenerator rng = new RandomDataGenerator(rg);
fingerprint.forEach((key, hp) -> {
final HaplotypeProbabilitiesFromGenotypeLikelihoods permutedHaplotypeProbabilities = new HaplotypeProbabilitiesFromGenotypeLikelihoods(key);
permutedHaplotypeProbabilities.addToLogLikelihoods(
hp.getRepresentativeSnp(),
hp.getRepresentativeSnp().getAlleles(),
MathUtil.permute(hp.getLogLikelihoods(), rng));
retVal.add(permutedHaplotypeProbabilities);
});
return retVal;
}
}