picard.fingerprint.CheckFingerprint Maven / Gradle / Ivy
/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.fingerprint;
import htsjdk.samtools.BamFileIoUtils;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import picard.PicardException;
import picard.analysis.FingerprintingDetailMetrics;
import picard.analysis.FingerprintingSummaryMetrics;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.Fingerprinting;
import java.io.File;
import java.util.Collections;
import java.util.List;
/**
* Attempts to check the sample identity of the sequence/genotype data in the provided file (SAM/BAM or VCF)
* against a set of known genotypes in the supplied genotype file (in either GELI or VCF format).
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = CheckFingerprint.USAGE_DETAILS,
usageShort = "Computes a fingerprint from the supplied input (SAM/BAM or VCF) file and compares it to the provided genotypes",
programGroup = Fingerprinting.class
)
public class CheckFingerprint extends CommandLineProgram {
static final String USAGE_DETAILS = "Computes a fingerprint from the supplied input file (SAM/BAM or VCF) file and " +
"compares it to the expected fingerprint genotypes provided. The key output is a LOD score " +
"which represents the relative likelihood of the sequence data originating from the same " +
"sample as the genotypes vs. from a random sample. Two outputs are produced: (1) a summary " +
"metrics file that gives metrics at the single sample level (if the input was a VCF) or at the read " +
"level (lane or index within a lane) (if the input was a SAM/BAM) " +
"versus a set of known genotypes for the expected sample, and (2) a detail metrics file that " +
"contains an individual SNP/Haplotype comparison within a fingerprint comparison. The two " +
"files may be specified individually using the SUMMARY_OUTPUT and DETAIL_OUTPUT options. " +
"Alternatively the OUTPUT option may be used instead to give the base of the two output " +
"files, with the summary metrics having a file extension '" + CheckFingerprint.FINGERPRINT_SUMMARY_FILE_SUFFIX + "' " +
"and the detail metrics having a file extension '" + CheckFingerprint.FINGERPRINT_DETAIL_FILE_SUFFIX + "'.";
@Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input file SAM/BAM or VCF. If a VCF is used, " +
"it must have at least one sample. If there are more than one samples in the VCF, the parameter OBSERVED_SAMPLE_ALIAS must " +
"be provided in order to indicate which sample's data to use. If there are no samples in the VCF, an exception will be thrown.")
public File INPUT;
@Option(optional = true, doc = "If the input is a VCF, this parameters used to select which sample's data in the VCF to use.")
public String OBSERVED_SAMPLE_ALIAS;
@Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The base prefix of output files to write. The summary metrics " +
"will have the file extension '" + CheckFingerprint.FINGERPRINT_SUMMARY_FILE_SUFFIX + "' and the detail metrics will have " +
"the extension '" + CheckFingerprint.FINGERPRINT_DETAIL_FILE_SUFFIX + "'.", mutex = {"SUMMARY_OUTPUT", "DETAIL_OUTPUT"})
public String OUTPUT;
@Option(shortName = "S", doc = "The text file to which to write summary metrics.", mutex = {"OUTPUT"})
public File SUMMARY_OUTPUT;
@Option(shortName = "D", doc = "The text file to which to write detail metrics.", mutex = {"OUTPUT"})
public File DETAIL_OUTPUT;
@Option(shortName="G", doc = "File of genotypes (VCF or GELI) to be used in comparison. May contain " +
"any number of genotypes; CheckFingerprint will use only those that are usable for fingerprinting.")
public File GENOTYPES;
@Option(shortName = "SAMPLE_ALIAS", optional=true, doc = "This parameter can be used to specify which sample's genotypes to use from the " +
"expected VCF file (the GENOTYPES file). If it is not supplied, the sample name from the input " +
"(VCF or BAM read group header) will be used.")
public String EXPECTED_SAMPLE_ALIAS;
@Option(shortName="H", doc = "The file lists a set of SNPs, optionally arranged in high-LD blocks, to be used for fingerprinting. See " +
"https://software.broadinstitute.org/gatk/documentation/article?id=9526 for details.")
public File HAPLOTYPE_MAP;
@Option(shortName="LOD", doc = "When counting haplotypes checked and matching, count only haplotypes " +
"where the most likely haplotype achieves at least this LOD.")
public double GENOTYPE_LOD_THRESHOLD = 5;
@Option(optional=true, shortName="IGNORE_RG", doc = "If the input is a SAM/BAM, and this parameter is true, treat the " +
"entire input BAM as one single read group in the calculation, " +
"ignoring RG annotations, and producing a single fingerprint metric for the entire BAM.")
public boolean IGNORE_READ_GROUPS = false;
private final Log log = Log.getInstance(CheckFingerprint.class);
public static final String FINGERPRINT_SUMMARY_FILE_SUFFIX = "fingerprinting_summary_metrics";
public static final String FINGERPRINT_DETAIL_FILE_SUFFIX = "fingerprinting_detail_metrics";
// Stock main method
public static void main(final String[] args) {
new CheckFingerprint().instanceMainWithExit(args);
}
@Override
protected int doWork() {
final File outputDetailMetricsFile, outputSummaryMetricsFile;
if (OUTPUT == null) {
outputDetailMetricsFile = DETAIL_OUTPUT;
outputSummaryMetricsFile = SUMMARY_OUTPUT;
}
else {
if (!OUTPUT.endsWith(".")) OUTPUT = OUTPUT + ".";
outputDetailMetricsFile = new File(OUTPUT + FINGERPRINT_DETAIL_FILE_SUFFIX);
outputSummaryMetricsFile = new File(OUTPUT + FINGERPRINT_SUMMARY_FILE_SUFFIX);
}
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(HAPLOTYPE_MAP);
IOUtil.assertFileIsReadable(GENOTYPES);
IOUtil.assertFileIsWritable(outputDetailMetricsFile);
IOUtil.assertFileIsWritable(outputSummaryMetricsFile);
final FingerprintChecker checker = new FingerprintChecker(HAPLOTYPE_MAP);
List results;
String observedSampleAlias = null;
final boolean isBamOrSamFile = isBamOrSamFile(INPUT);
if (isBamOrSamFile) {
SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(INPUT), SAMSequenceDictionaryExtractor.extractDictionary(GENOTYPES), true);
SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(INPUT), checker.getHeader().getSequenceDictionary(), true);
// Verify that there's only one sample in the SAM/BAM.
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) {
if (observedSampleAlias == null) {
observedSampleAlias = rec.getSample();
}
else if (!observedSampleAlias.equals(rec.getSample())) {
throw new PicardException("INPUT SAM/BAM file must not contain data from multiple samples.");
}
}
CloserUtil.close(in);
// If expected sample alias isn't supplied, assume it's the one from the INPUT file's RGs
if (EXPECTED_SAMPLE_ALIAS == null) {
EXPECTED_SAMPLE_ALIAS = observedSampleAlias;
}
results = checker.checkFingerprints(
Collections.singletonList(INPUT),
Collections.singletonList(GENOTYPES),
EXPECTED_SAMPLE_ALIAS,
IGNORE_READ_GROUPS);
} else { // Input is a VCF
// Note that FingerprintChecker.loadFingerprints() verifies that the VCF's Sequence Dictionaries agree with that of the Haplotye Map File
// Verify that there is only one sample in the VCF
final VCFFileReader fileReader = new VCFFileReader(INPUT, false);
final VCFHeader fileHeader = fileReader.getFileHeader();
if (fileHeader.getNGenotypeSamples() < 1) {
throw new PicardException("INPUT VCF file must contain at least one sample.");
}
if ((fileHeader.getNGenotypeSamples() > 1) && (OBSERVED_SAMPLE_ALIAS == null)) {
throw new PicardException("INPUT VCF file contains multiple samples and yet the OBSERVED_SAMPLE_ALIAS parameter is not set.");
}
// set observedSampleAlias to the parameter, if set. Otherwise, if here, this must be a single sample VCF, get it's sample
observedSampleAlias = (OBSERVED_SAMPLE_ALIAS != null) ? OBSERVED_SAMPLE_ALIAS : fileHeader.getGenotypeSamples().get(0);
// Now verify that observedSampleAlias is, in fact, in the VCF
if (!fileHeader.getGenotypeSamples().contains(observedSampleAlias)) {
throw new PicardException("INPUT VCF file does not contain OBSERVED_SAMPLE_ALIAS: " + observedSampleAlias);
}
if (OBSERVED_SAMPLE_ALIAS == null) {
observedSampleAlias = fileHeader.getGenotypeSamples().get(0);
}
fileReader.close();
// If expected sample alias isn't supplied, assume it's the one from the INPUT file
if (EXPECTED_SAMPLE_ALIAS == null) {
EXPECTED_SAMPLE_ALIAS = observedSampleAlias;
}
results = checker.checkFingerprints(
Collections.singletonList(INPUT),
Collections.singletonList(GENOTYPES),
observedSampleAlias,
EXPECTED_SAMPLE_ALIAS);
}
final MetricsFile summaryFile = getMetricsFile();
final MetricsFile detailsFile = getMetricsFile();
for (final FingerprintResults fpr : results) {
final MatchResults mr = fpr.getMatchResults().first();
final FingerprintingSummaryMetrics metrics = new FingerprintingSummaryMetrics();
metrics.READ_GROUP = fpr.getReadGroup();
metrics.SAMPLE = EXPECTED_SAMPLE_ALIAS;
metrics.LL_EXPECTED_SAMPLE = mr.getSampleLikelihood();
metrics.LL_RANDOM_SAMPLE = mr.getPopulationLikelihood();
metrics.LOD_EXPECTED_SAMPLE = mr.getLOD();
for (final LocusResult lr : mr.getLocusResults()) {
final DiploidGenotype expectedGenotype = lr.getExpectedGenotype();
final DiploidGenotype observedGenotype = lr.getMostLikelyGenotype();
// Update the summary metrics
metrics.HAPLOTYPES_WITH_GENOTYPES++;
if (lr.getLodGenotype() >= GENOTYPE_LOD_THRESHOLD) {
metrics.HAPLOTYPES_CONFIDENTLY_CHECKED++;
if (lr.getExpectedGenotype() == lr.getMostLikelyGenotype()) {
metrics.HAPLOTYPES_CONFIDENTLY_MATCHING++;
}
if (expectedGenotype.isHeterozygous() && observedGenotype.isHomomozygous()) {
metrics.HET_AS_HOM++;
}
if (expectedGenotype.isHomomozygous() && observedGenotype.isHeterozygous()) {
metrics.HOM_AS_HET++;
}
if (expectedGenotype.isHomomozygous() && observedGenotype.isHomomozygous()
&& expectedGenotype.compareTo(observedGenotype) != 0) {
metrics.HOM_AS_OTHER_HOM++;
}
}
// Build the detail metrics
final FingerprintingDetailMetrics details = new FingerprintingDetailMetrics();
details.READ_GROUP = fpr.getReadGroup();
details.SAMPLE = EXPECTED_SAMPLE_ALIAS;
details.SNP = lr.getSnp().getName();
details.SNP_ALLELES = lr.getSnp().getAlleleString();
details.CHROM = lr.getSnp().getChrom();
details.POSITION = lr.getSnp().getPos();
details.EXPECTED_GENOTYPE = expectedGenotype.toString();
details.OBSERVED_GENOTYPE = observedGenotype.toString();
details.LOD = lr.getLodGenotype();
details.OBS_A = lr.getAllele1Count();
details.OBS_B = lr.getAllele2Count();
detailsFile.addMetric(details);
}
summaryFile.addMetric(metrics);
log.info("Read Group: " + metrics.READ_GROUP + " / " + observedSampleAlias + " vs. " + metrics.SAMPLE + ": LOD = " + metrics.LOD_EXPECTED_SAMPLE);
}
summaryFile.write(outputSummaryMetricsFile);
detailsFile.write(outputDetailMetricsFile);
return 0;
}
protected String[] customCommandLineValidation() {
IOUtil.assertFileIsReadable(INPUT);
boolean isBamOrSamFile = isBamOrSamFile(INPUT);
if (!isBamOrSamFile && IGNORE_READ_GROUPS) {
return new String[]{"The parameter IGNORE_READ_GROUPS can only be used with BAM/SAM inputs."};
}
if (isBamOrSamFile && OBSERVED_SAMPLE_ALIAS != null) {
return new String[]{"The parameter OBSERVED_SAMPLE_ALIAS can only be used with a VCF input."};
}
return super.customCommandLineValidation();
}
static boolean isBamOrSamFile(final File f) {
return (BamFileIoUtils.isBamFile(f) || f.getName().endsWith(IOUtil.SAM_FILE_EXTENSION));
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy