All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.variantutils.VariantsToTable Maven / Gradle / Ivy

There is a newer version: 4.6.0.0
Show newest version
package org.broadinstitute.hellbender.tools.walkers.variantutils;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import picard.cmdline.programgroups.VariantEvaluationProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.VcfUtils;

import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.lang.reflect.Array;
import java.util.*;
import java.util.function.Function;

import static org.broadinstitute.hellbender.utils.Utils.split;

/**
 * Extract fields from a VCF file to a tab-delimited table
 *
 * 

* This tool extracts specified fields for each variant in a VCF file to a tab-delimited table, which may be easier * to work with than a VCF. By default, the tool only extracts PASS or . (unfiltered) variants in the VCF file. Filtered variants may be * included in the output by adding the --show-filtered flag. The tool can extract both INFO (i.e. site-level) fields and * FORMAT (i.e. sample-level) fields. *

* *

INFO/site-level fields

*

* Use the `-F` argument to extract INFO fields; each field will occupy a single column in the output * file. The field can be any standard VCF column (e.g. CHROM, ID, QUAL) or any annotation name in the INFO field (e.g. AC, AF). * The tool also supports the following additional fields: *

    *
  • EVENTLENGTH (length of the event)
  • *
  • TRANSITION (1 for a bi-allelic transition (SNP), 0 for bi-allelic transversion (SNP), -1 for INDELs and multi-allelics)
  • *
  • HET (count of het genotypes)
  • *
  • HOM-REF (count of homozygous reference genotypes)
  • *
  • HOM-VAR (count of homozygous variant genotypes)
  • *
  • NO-CALL (count of no-call genotypes)
  • *
  • TYPE (type of variant, possible values are NO_VARIATION, SNP, MNP, INDEL, SYMBOLIC, and MIXED
  • *
  • VAR (count of non-reference genotypes)
  • *
  • NSAMPLES (number of samples)
  • *
  • NCALLED (number of called samples)
  • *
  • MULTI-ALLELIC (is this variant multi-allelic? true/false)
  • *
*

* Use the `-ASF` argument to extract allele-specific/per allele INFO fields and split them appropriately when * splitting multi-allelic variants. *

* *

FORMAT/sample-level fields

*

* Use the `-GF` argument to extract FORMAT/sample-level fields. The tool will create a new column per sample * with the name "SAMPLE_NAME.FORMAT_FIELD_NAME" e.g. NA12877.GQ, NA12878.GQ. *

*

* Use the `-ASGF` argument to extract allele-specific/per allele FORMAT fields and split them appropriately * when splitting multi-allelic variants. If AD is specified as an allele-specific genotype field the ref and alt * counts will be given for each alt. *

* *

Inputs

*
    *
  • A VCF file to convert to a table
  • *
* *

Output

*

* A tab-delimited file containing the values of the requested fields in the VCF file. *

* *

Usage example

*
 *     gatk VariantsToTable \
 *     -V input.vcf \
 *     -F CHROM -F POS -F TYPE -GF AD \
 *     -O output.table
 * 
*

would produce a file that looks like:

*
 *     CHROM  POS        TYPE   HSCX1010N.AD  HSCX1010T.AD
 *     1      31782997   SNP    77,0          53,4
 *     1      40125052   SNP    97,0          92,7
 *     1      65068538   SNP    49,0          35,4
 *     1      111146235  SNP    69,1          77,4
 * 
* *

Notes

*
    *
  • It is common for certain annotations to be absent for some variants. By default, this tool will emit an NA for a missing annotation. If you prefer that the tool fail upon encountering a missing annotation, use the --error-if-missing-data flag.
  • *
  • If multiple samples are present in the VCF, the genotype fields will be ordered alphabetically by sample name.
  • *
  • Filtered sites are ignored by default. To include them in the output, use the --show-filtered flag.
  • *
  • Allele-specific filtering is not yet supported. For PASS sites, all alleles will be given, regardless of their AS_FilterStatus.
  • *
*/ @CommandLineProgramProperties( summary = "Extract specified fields for each variant in a VCF file to a tab-delimited table, which may be easier to work with than a VCF", oneLineSummary = "Extract fields from a VCF file to a tab-delimited table", programGroup = VariantEvaluationProgramGroup.class ) @DocumentedFeature public final class VariantsToTable extends VariantWalker { public final static String SPLIT_MULTI_ALLELIC_LONG_NAME = "split-multi-allelic"; public final static String SPLIT_MULTI_ALLELIC_SHORT_NAME = "SMA"; static final Logger logger = LogManager.getLogger(VariantsToTable.class); @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc="File to which the tab-delimited table is written") private String out = null; /** * Any standard VCF column (CHROM, ID, QUAL) or any annotation name in the INFO field (e.g., -F AC) to include in * the output table. To capture FORMAT field values, see the -GF argument. This argument accepts any number * of inputs e.g. -F CHROM -F POS */ @Argument(fullName="fields", shortName="F", doc="The name of a standard VCF field or an INFO field to include in the output table", optional=true) protected List fieldsToTake = new ArrayList<>(); /** * Any annotation name in the FORMAT field (e.g., GQ, PL) to include in the output table. * This argument accepts any number of inputs e.g. -GF GQ -GF PL */ @Argument(fullName="genotype-fields", shortName="GF", doc="The name of a genotype field to include in the output table", optional=true) private List genotypeFieldsToTake = new ArrayList<>(); @Argument(shortName="ASF", doc="The name of an allele-specific INFO field to be split if present", optional=true) private List asFieldsToTake = new ArrayList<>(); @Argument(shortName="ASGF", doc="The name of an allele-specific FORMAT field to be split if present", optional=true) private List asGenotypeFieldsToTake = new ArrayList<>(); /** * By default this tool only emits values for records where the FILTER field is either PASS or . (unfiltered). * Turn on this flag to emit values regardless of the value of the FILTER field. */ @Advanced @Argument(fullName="show-filtered", shortName="raw", doc="Include filtered records in the output", optional=true) private boolean showFiltered = false; /** * By default, a variant record with multiple ALT alleles will be summarized in one line, with per alt-allele fields * (e.g. allele depth) separated by commas. This may cause difficulty when the table is loaded by an R script, for example. * Use this flag to write multi-allelic records on separate lines of output. Fields that are not allele-specific will be duplicated. */ @Argument(fullName=SPLIT_MULTI_ALLELIC_LONG_NAME, shortName=SPLIT_MULTI_ALLELIC_SHORT_NAME, doc="Split multi-allelic records into multiple lines", optional=true) protected boolean splitMultiAllelic = false; /** * Use this flag to emit each field within a variant on a separate line. The resulting table will have * four columns: RecordID, Sample, Variable, and Value. Variable refers to the field name, Value to the value of the * field. The tool prints "site" under Sample column for an INFO or standard field. * * Example: -F CHROM -GF AD will print the following table * RecordID Sample Variable Value * 1 site CHROM 20 * 1 NA12878 AD 36,28 * 2 site CHROM 20 * 2 NA12878 AD 26,27 * 3 site CHROM 20 */ @Advanced @Argument(fullName="moltenize", shortName="moltenize", doc="Produce molten output", optional=true) private boolean moltenizeOutput = false; /** * By default, this tool will write NA for missing data. * Turn on this flag, and the tool will throw an error and exit if it encounters missing data. */ @Advanced @Argument(fullName="error-if-missing-data", shortName="EMD", doc="Fail on missing data", optional=true) public boolean errorIfMissingData = false; private static final String MISSING_DATA = "NA"; private SortedSet samples; private long nRecords = 0L; private PrintStream outputStream = null; private VCFHeader inputHeader; @Override public void onTraversalStart() { inputHeader = getHeaderForVariants(); outputStream = createPrintStream(); if (genotypeFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()) { samples = Collections.emptySortedSet(); } else { final Map vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants()); samples = VcfUtils.getSortedSampleSet(vcfHeaders, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); // if there are no samples, we don't have to worry about any genotype fields if (samples.isEmpty()) { genotypeFieldsToTake.clear(); asGenotypeFieldsToTake.clear(); logger.warn("There are no samples - the genotype fields will be ignored"); if (fieldsToTake.isEmpty() && asFieldsToTake.isEmpty()){ throw new UserException("There are no samples and no fields - no output will be produced"); } } } if (asGenotypeFieldsToTake.isEmpty() && asFieldsToTake.isEmpty() && !splitMultiAllelic) { logger.warn("Allele-specific fields will only be split if splitting multi-allelic variants is specified (`--" + SPLIT_MULTI_ALLELIC_LONG_NAME + "` or `-" + SPLIT_MULTI_ALLELIC_SHORT_NAME + "`"); } // print out the header if ( moltenizeOutput ) { outputStream.println("RecordID\tSample\tVariable\tValue"); } else { final List fields = new ArrayList<>(); fields.addAll(fieldsToTake); fields.addAll(asFieldsToTake); final String header = new StringBuilder(Utils.join("\t", fields)) .append("\t") .append(createGenotypeHeader()) .toString(); outputStream.println(header); } } private PrintStream createPrintStream() { try { return out != null ? new PrintStream(out) : System.out; } catch ( final FileNotFoundException e ) { throw new UserException.CouldNotCreateOutputFile(out, e); } } @Override public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext ref, final FeatureContext featureContext) { if ( showFiltered || vc.isNotFiltered() ) { nRecords++; final List> records = extractFields(vc); if (moltenizeOutput){ records.forEach(record -> emitMoltenizedOutput(record)); } else { records.forEach(record -> outputStream.println(Utils.join("\t", record))); } } } private static boolean isWildCard(final String s) { return s.endsWith("*"); } private String createGenotypeHeader() { boolean firstEntry = true; final List allGenotypeFieldsToTake = new ArrayList<>(genotypeFieldsToTake); allGenotypeFieldsToTake.addAll(asGenotypeFieldsToTake); final StringBuilder sb = new StringBuilder(); for ( final String sample : samples ) { for ( final String gf : allGenotypeFieldsToTake ) { if ( firstEntry ) { firstEntry = false; } else { sb.append("\t"); } // spaces in sample names are legal but wreak havoc in R data frames sb.append(sample.replace(" ","_")); sb.append('.'); sb.append(gf); } } return sb.toString(); } private void emitMoltenizedOutput(final List record) { int index = 0; for ( final String field : fieldsToTake ) { outputStream.println(String.format("%d\tsite\t%s\t%s", nRecords, field, record.get(index++))); } for ( final String sample : samples ) { for ( final String gf : genotypeFieldsToTake ) { outputStream.println(String.format("%d\t%s\t%s\t%s", nRecords, sample.replace(" ","_"), gf, record.get(index++))); } } } /** * Utility function that returns the list of values for each field in fields from vc. * * @param vc the VariantContext whose field values we can to capture * @return List of lists of field values */ protected List> extractFields(final VariantContext vc) { final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1; final List> records = new ArrayList<>(numRecordsToProduce); for ( int i = 0; i < numRecordsToProduce; i++ ) { records.add(new ArrayList<>()); } for ( final String field : fieldsToTake ) { if ( splitMultiAllelic && field.equals("ALT") ) { // we need to special case the ALT field when splitting out multi-allelic records addFieldValue(splitAltAlleles(vc), records); } else if ( getters.containsKey(field) ) { addFieldValue(getters.get(field).apply(vc), records); } else if ( vc.hasAttribute(field) ) { addFieldValue(vc.getAttribute(field, null), records); } else if ( isWildCard(field) ) { final SortedSet wildVals = new TreeSet<>(); for ( final Map.Entry elt : vc.getAttributes().entrySet()) { if ( elt.getKey().startsWith(field.substring(0, field.length() - 1)) ) { wildVals.add(elt.getValue().toString()); } } final String val = wildVals.isEmpty() ? MISSING_DATA : Utils.join(",", wildVals); addFieldValue(val, records); } else { handleMissingData(errorIfMissingData, field, records, vc); } } for ( final String field : asFieldsToTake) { if (vc.hasAttribute(field)) { if (splitMultiAllelic) { addAlleleSpecificFieldValue(Arrays.asList(vc.getAttributeAsString(field, ".").replace("[", "").replace("]", "").split(",")), records, inputHeader.getInfoHeaderLine(field).getCountType()); } else { addFieldValue(vc.getAttributeAsString(field, ".").replace("[","").replace("]",""), records); } } else { handleMissingData(errorIfMissingData, field, records, vc); } } if ( !genotypeFieldsToTake.isEmpty() || !asGenotypeFieldsToTake.isEmpty() ) { addGenotypeFieldsToRecords(vc, records, errorIfMissingData); } return records; } private void addGenotypeFieldsToRecords(final VariantContext vc, final List> records, final boolean errorIfMissingData) { for ( final String sample : samples ) { for ( final String gf : genotypeFieldsToTake ) { if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAnyAttribute(gf) ) { if (VCFConstants.GENOTYPE_KEY.equals(gf)) { addFieldValue(vc.getGenotype(sample).getGenotypeString(true), records); } else { /** * TODO - If gf == "FT" and the GT record is not filtered, Genotype.getAnyAttribute == null. Genotype.hasAnyAttribute should be changed so it * returns false for this condition. Presently, it always returns true. Once this is fixed, then only the "addFieldValue" statement will * remain in the following logic block. */ if (vc.getGenotype(sample).getAnyAttribute(gf) != null) { addFieldValue(vc.getGenotype(sample).getAnyAttribute(gf), records); } else { handleMissingData(errorIfMissingData, gf, records, vc); } } } else { handleMissingData(errorIfMissingData, gf, records, vc); } } for ( final String field : asGenotypeFieldsToTake) { if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAnyAttribute(field) ) { if (splitMultiAllelic) { if (VCFConstants.GENOTYPE_ALLELE_DEPTHS.equals(field)) { List altDepths = new ArrayList<>(); int[] allDepths = vc.getGenotype(sample).getAD(); for (int i = 1; i < allDepths.length; i++) { altDepths.add(allDepths[0] + "," + allDepths[i]); } addFieldValue(altDepths, records); } else { addAlleleSpecificFieldValue(split(vc.getGenotype(sample).getExtendedAttribute(field).toString(), ','), records, inputHeader.getFormatHeaderLine(field).getCountType()); } } else { final String value = vc.getGenotype(sample).getAnyAttribute(field).toString(); if (field.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) { addFieldValue(value.replace("[","").replace("]","").replaceAll("\\s",""),records); } else { addFieldValue(value, records); } } } else { handleMissingData(errorIfMissingData, field, records, vc); } } } } private static void handleMissingData(final boolean errorIfMissingData, final String field, final List> records, final VariantContext vc) { if (errorIfMissingData) { throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc)); } else { addFieldValue(MISSING_DATA, records); } } private static void addFieldValue(final Object val, final List> result) { final int numResultRecords = result.size(); // if we're trying to create a single output record, add it if ( numResultRecords == 1 ) { result.get(0).add(prettyPrintObject(val)); } // if this field is a list of the proper size, add the appropriate entry to each record else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) { final List list = (List)val; for ( int i = 0; i < numResultRecords; i++ ) { result.get(i).add(list.get(i).toString()); } } // otherwise, add the original value to all of the records else { final String valStr = prettyPrintObject(val); for ( final List record : result ) { record.add(valStr); } } } /** * Handle per-allele/allele-specific annotations as described in the header * @param val the annotation value * @param result the cummulative output * @param alleleCount scalar, R-type or A-type values */ private static void addAlleleSpecificFieldValue(final Object val, final List> result, final VCFHeaderLineCount alleleCount) { if (val instanceof List && alleleCount.equals(VCFHeaderLineCount.R)) { final List myList = (List) val; addFieldValue(new ArrayList<>(myList.subList(1, myList.size())), result); } else { addFieldValue(val, result); } } private static String prettyPrintObject(final Object val) { if ( val == null ) { return ""; } if ( val instanceof List ) { return prettyPrintObject(((List) val).toArray()); } if ( !val.getClass().isArray() ) { return val.toString(); } final int length = Array.getLength(val); if ( length == 0 ) { return ""; } final StringBuilder sb = new StringBuilder(prettyPrintObject(Array.get(val, 0))); for ( int i = 1; i < length; i++ ) { sb.append(","); sb.append(prettyPrintObject(Array.get(val, i))); } return sb.toString(); } // ---------------------------------------------------------------------------------------------------- // // Getting values from VC by name. // // ---------------------------------------------------------------------------------------------------- private final Map> getters = new LinkedHashMap<>(); { // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT getters.put("CHROM", vc -> vc.getContig()); getters.put("POS", vc -> Integer.toString(vc.getStart())); getters.put("REF", vc -> vc.getReference().getDisplayString()); getters.put("ALT", vc -> { final StringBuilder x = new StringBuilder(); final int n = vc.getAlternateAlleles().size(); if ( n == 0 ) { return "."; } for ( int i = 0; i < n; i++ ) { if ( i != 0 ) { x.append(","); } x.append(vc.getAlternateAllele(i)); } return x.toString(); }); getters.put("EVENTLENGTH", vc -> { int maxLength = 0; for ( final Allele a : vc.getAlternateAlleles() ) { final int length = a.length() - vc.getReference().length(); if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } } return Integer.toString(maxLength); }); getters.put("QUAL", vc -> Double.toString(vc.getPhredScaledQual())); getters.put("TRANSITION", vc -> { if ( vc.isSNP() && vc.isBiallelic() ) { return GATKVariantContextUtils.isTransition(vc) ? "1" : "0"; } else { return "-1"; } }); getters.put("FILTER", vc -> vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters())); getters.put("ID", vc -> vc.getID()); getters.put("HET", vc -> Integer.toString(vc.getHetCount())); getters.put("HOM-REF", vc -> Integer.toString(vc.getHomRefCount())); getters.put("HOM-VAR", vc -> Integer.toString(vc.getHomVarCount())); getters.put("NO-CALL", vc -> Integer.toString(vc.getNoCallCount())); getters.put("TYPE", vc -> vc.getType().toString()); getters.put("VAR", vc -> Integer.toString(vc.getHetCount() + vc.getHomVarCount())); getters.put("NSAMPLES", vc -> Integer.toString(vc.getNSamples())); getters.put("NCALLED", vc -> Integer.toString(vc.getNSamples() - vc.getNoCallCount())); getters.put("MULTI-ALLELIC", vc -> Boolean.toString(vc.getAlternateAlleles().size() > 1)); } private static Object splitAltAlleles(final VariantContext vc) { final int numAltAlleles = vc.getAlternateAlleles().size(); if ( numAltAlleles == 1 ) { return vc.getAlternateAllele(0); } return vc.getAlternateAlleles(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy