
// htsjdk.variant.vcf.VCFHeader Maven / Gradle / Ivy
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package htsjdk.variant.vcf;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.variantcontext.VariantContextComparator;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
* A class to represent a VCF header
*
* @author aaron
* NOTE: This class stores header lines in lots of places. The original author noted that this should
* be cleaned up at some point in the future (jgentry - 5/2013)
*/
public class VCFHeader implements Serializable {
    public static final long serialVersionUID = 1L;

    /** The mandatory fixed columns of a VCF record, in the order they are declared here. */
    public enum HEADER_FIELDS {
        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
    }

    // Master collection of every header line, kept in insertion order.
    private final Set<VCFHeaderLine> mMetaData = new LinkedHashSet<>();

    // Type-specific lookup tables. Each keeps only the FIRST line seen for a given ID/key,
    // so mMetaData may hold lines that are absent from these tables.
    private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new LinkedHashMap<>();
    private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new LinkedHashMap<>();
    private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new LinkedHashMap<>();
    private final Map<String, VCFHeaderLine> mOtherMetaData = new LinkedHashMap<>();
    private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<>();

    // the genotype sample (column) names, in order of appearance
    private final List<String> mGenotypeSampleNames = new ArrayList<>();

    // the character string that indicates meta data
    public static final String METADATA_INDICATOR = "##";

    // the header string indicator
    public static final String HEADER_INDICATOR = "#";

    public static final String SOURCE_KEY = "source";
    public static final String REFERENCE_KEY = "reference";
    public static final String CONTIG_KEY = "contig";
    public static final String INTERVALS_KEY = "intervals";
    public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals";
    public static final String INTERVAL_MERGING_KEY = "interval_merging";
    public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule";
    public static final String INTERVAL_PADDING_KEY = "interval_padding";

    // were the input samples sorted originally (or are we sorting them)?
    private boolean samplesWereAlreadySorted = true;

    // cache for efficient conversion of VCF -> VariantContext; both stay null unless the
    // header was built through a constructor that takes sample names
    private ArrayList<String> sampleNamesInOrder = null;
    private HashMap<String, Integer> sampleNameToOffset = null;

    private boolean writeEngineHeaders = true;
    private boolean writeCommandLine = true;

    /**
     * Create an empty VCF header with no header lines and no samples
     */
    public VCFHeader() {
        this(Collections.<VCFHeaderLine>emptySet(), Collections.<String>emptySet());
    }

    /**
     * Create a VCF header from a set of meta data lines, with no samples.
     * Lines whose key is a VCF version tag are dropped. A PL format line is added
     * automatically when a deprecated GL format line is present without one.
     *
     * Note: the sample-name caches returned by {@link #getSampleNamesInOrder()} and
     * {@link #getSampleNameToOffset()} are NOT built by this constructor and remain null.
     *
     * @param metaData the meta data associated with this header
     */
    public VCFHeader(final Set<VCFHeaderLine> metaData) {
        mMetaData.addAll(metaData);
        removeVCFVersionLines(mMetaData);
        createLookupEntriesForAllHeaderLines();
        checkForDeprecatedGenotypeLikelihoodsKey();
    }

    /**
     * Creates a copy of the given VCFHeader with its own collections of header lines and
     * sample names. The header line objects themselves are shared with {@code toCopy}
     * rather than cloned, and the writeEngineHeaders / writeCommandLine flags are not
     * copied (the new header starts with their default values).
     *
     * @param toCopy the header to copy
     */
    public VCFHeader(final VCFHeader toCopy) {
        this(toCopy.mMetaData, toCopy.mGenotypeSampleNames);
    }

    /**
     * Create a VCF header, given a set of meta data lines and a set of sample names.
     * The samples are taken in the iteration order of the provided set.
     *
     * @param metaData            the meta data associated with this header
     * @param genotypeSampleNames the sample names
     */
    public VCFHeader(final Set<VCFHeaderLine> metaData, final Set<String> genotypeSampleNames) {
        this(metaData, new ArrayList<String>(genotypeSampleNames));
    }

    /**
     * Create a VCF header, given a set of meta data lines and a list of sample names in
     * order of appearance.
     *
     * @param metaData            the meta data associated with this header
     * @param genotypeSampleNames the sample names, in column order
     * @throws TribbleException.InvalidHeader if the same sample name occurs more than once
     */
    public VCFHeader(final Set<VCFHeaderLine> metaData, final List<String> genotypeSampleNames) {
        this(metaData);

        // a set collapses duplicates, so a size mismatch means at least one repeated name
        if ( genotypeSampleNames.size() != new HashSet<String>(genotypeSampleNames).size() ) {
            throw new TribbleException.InvalidHeader("BUG: VCF header has duplicate sample names");
        }

        mGenotypeSampleNames.addAll(genotypeSampleNames);
        samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames);
        buildVCFReaderMaps(genotypeSampleNames);
    }

    /**
     * Tell this VCF header to use pre-calculated sample name ordering and the
     * sample-name-to-offset map. This assumes that all VariantContext created
     * using this header (i.e., read by the VCFCodec) will have genotypes
     * occurring in the same order.
     *
     * After this call {@code sampleNameToOffset} maps each name to its index of appearance,
     * while {@code sampleNamesInOrder} holds the same names sorted by natural String order.
     *
     * @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterate in order of appearance
     */
    private void buildVCFReaderMaps(final Collection<String> genotypeSampleNamesInAppearenceOrder) {
        sampleNamesInOrder = new ArrayList<>(genotypeSampleNamesInAppearenceOrder.size());
        sampleNameToOffset = new HashMap<>(genotypeSampleNamesInAppearenceOrder.size());

        int i = 0;
        for (final String name : genotypeSampleNamesInAppearenceOrder) {
            sampleNamesInOrder.add(name);
            sampleNameToOffset.put(name, i++);
        }
        Collections.sort(sampleNamesInOrder);
    }

    /**
     * Adds a new line to the VCFHeader. If there is an existing header line of the
     * same type with the same key, the new line is not added and the existing line
     * is preserved.
     *
     * @param headerLine header line to attempt to add
     */
    public void addMetaDataLine(final VCFHeaderLine headerLine) {
        // Try to create a lookup entry for the new line. If this succeeds (because there was
        // no line of this type with the same key), add the line to our master list of header
        // lines in mMetaData.
        if ( addMetadataLineLookupEntry(headerLine) ) {
            mMetaData.add(headerLine);
            checkForDeprecatedGenotypeLikelihoodsKey();
        }
    }

    /**
     * @return an unmodifiable view of all of the VCF header lines of the ##contig form in order,
     *         or an empty list if none were present
     */
    public List<VCFContigHeaderLine> getContigLines() {
        return Collections.unmodifiableList(contigMetaData);
    }

    /**
     * Returns the contigs in this VCF file as a SAMSequenceDictionary. Returns null if contigs lines are
     * not present in the header. Throws SAMException if one or more contig lines do not have length
     * information (raised by the contig line conversion, which is not defined in this class).
     */
    public SAMSequenceDictionary getSequenceDictionary() {
        final List<VCFContigHeaderLine> contigHeaderLines = this.getContigLines();
        if (contigHeaderLines.isEmpty()) {
            return null;
        }

        final List<SAMSequenceRecord> sequenceRecords = new ArrayList<>(contigHeaderLines.size());
        for (final VCFContigHeaderLine contigHeaderLine : contigHeaderLines) {
            sequenceRecords.add(contigHeaderLine.getSAMSequenceRecord());
        }
        return new SAMSequenceDictionary(sequenceRecords);
    }

    /**
     * Completely replaces the contig records in this header with those in the given SAMSequenceDictionary.
     * The new contig lines are appended after all remaining header lines.
     *
     * @param dictionary the sequence dictionary providing the new contigs; must not be null
     */
    public void setSequenceDictionary(final SAMSequenceDictionary dictionary) {
        // Build the replacement lines first, so that a null dictionary or a failure while
        // creating a contig line leaves the current header state untouched.
        final List<VCFContigHeaderLine> newContigLines = new ArrayList<>();
        for (final SAMSequenceRecord record : dictionary.getSequences()) {
            newContigLines.add(new VCFContigHeaderLine(record, record.getAssembly()));
        }

        // Drop the old contig lines from the master set as well as from the contig list.
        final List<VCFHeaderLine> toRemove = new ArrayList<>();
        for (final VCFHeaderLine line : mMetaData) {
            if (line instanceof VCFContigHeaderLine) {
                toRemove.add(line);
            }
        }
        mMetaData.removeAll(toRemove);
        contigMetaData.clear();

        contigMetaData.addAll(newContigLines);
        mMetaData.addAll(newContigLines);
    }

    /**
     * @return a new comparator for variant contexts built from this header's contig lines
     */
    public VariantContextComparator getVCFRecordComparator() {
        return new VariantContextComparator(this.getContigLines());
    }

    /**
     * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present
     */
    public List<VCFFilterHeaderLine> getFilterLines() {
        final List<VCFFilterHeaderLine> filters = new ArrayList<>();
        for (final VCFHeaderLine line : mMetaData) {
            if ( line instanceof VCFFilterHeaderLine ) {
                filters.add((VCFFilterHeaderLine) line);
            }
        }
        return filters;
    }

    /**
     * @return all of the VCF header lines that implement {@link VCFIDHeaderLine}, in their original
     *         file order, or an empty list if none were present
     */
    public List<VCFIDHeaderLine> getIDHeaderLines() {
        final List<VCFIDHeaderLine> idLines = new ArrayList<>();
        for (final VCFHeaderLine line : mMetaData) {
            if (line instanceof VCFIDHeaderLine) {
                idLines.add((VCFIDHeaderLine) line);
            }
        }
        return idLines;
    }

    /**
     * Remove all lines with a VCF version tag from the provided set of header lines
     */
    private void removeVCFVersionLines( final Set<VCFHeaderLine> headerLines ) {
        // collect first, remove afterwards, to avoid modifying the set while iterating it
        final List<VCFHeaderLine> toRemove = new ArrayList<>();
        for (final VCFHeaderLine line : headerLines) {
            if (VCFHeaderVersion.isFormatString(line.getKey())) {
                toRemove.add(line);
            }
        }
        headerLines.removeAll(toRemove);
    }

    /**
     * Creates lookup table entries for all header lines in mMetaData. Lines that duplicate the
     * key of an earlier line get no lookup entry but are left in mMetaData.
     */
    private void createLookupEntriesForAllHeaderLines() {
        for (final VCFHeaderLine line : mMetaData) {
            addMetadataLineLookupEntry(line);
        }
    }

    /**
     * Add a single header line to the appropriate type-specific lookup table (but NOT to the master
     * list of lines in mMetaData -- this must be done separately if desired).
     *
     * If a header line is present that has the same key as an existing line, it will not be added. A warning
     * will be shown if this occurs when GeneralUtils.DEBUG_MODE_ENABLED is true, otherwise this will occur
     * silently.
     *
     * @param line header line to attempt to add to its type-specific lookup table
     * @return true if the line was added to the appropriate lookup table, false if there was an existing
     *         line with the same key and the new line was not added
     */
    private boolean addMetadataLineLookupEntry(final VCFHeaderLine line) {
        if ( line instanceof VCFInfoHeaderLine ) {
            final VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine) line;
            return addMetaDataLineMapLookupEntry(mInfoMetaData, infoLine.getID(), infoLine);
        } else if ( line instanceof VCFFormatHeaderLine ) {
            final VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine) line;
            return addMetaDataLineMapLookupEntry(mFormatMetaData, formatLine.getID(), formatLine);
        } else if ( line instanceof VCFFilterHeaderLine ) {
            final VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine) line;
            return addMetaDataLineMapLookupEntry(mFilterMetaData, filterLine.getID(), filterLine);
        } else if ( line instanceof VCFContigHeaderLine ) {
            return addContigMetaDataLineLookupEntry((VCFContigHeaderLine) line);
        } else {
            // anything else is indexed by its key rather than by an ID
            return addMetaDataLineMapLookupEntry(mOtherMetaData, line.getKey(), line);
        }
    }

    /**
     * Add a contig header line to the lookup list for contig lines (contigMetaData). If there's
     * already a contig line with the same ID, does not add the line.
     *
     * Note: does not add the contig line to the master list of header lines in mMetaData --
     * this must be done separately if desired.
     *
     * @param line contig header line to add
     * @return true if line was added to the list of contig lines, otherwise false
     */
    private boolean addContigMetaDataLineLookupEntry(final VCFContigHeaderLine line) {
        for (final VCFContigHeaderLine existing : contigMetaData) {
            // if we are trying to add a contig for the same ID
            if (existing.getID().equals(line.getID())) {
                if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
                    System.err.println("Found duplicate VCF contig header lines for " + line.getID() + "; keeping the first only" );
                }
                // do not add this contig if it exists
                return false;
            }
        }

        contigMetaData.add(line);
        return true;
    }

    /**
     * Add a header line to the provided map at a given key. If the key already exists, it will not be replaced.
     * If it does already exist and GeneralUtils.DEBUG_MODE_ENABLED is true, it will issue warnings about duplicates,
     * otherwise it will silently leave the existing key/line pair as is.
     *
     * Note: does not add the header line to the master list of header lines in mMetaData --
     * this must be done separately if desired.
     *
     * @param map  a map from each key to the associated VCFHeaderLine
     * @param key  the key to insert this line at
     * @param line the line to insert at this key
     * @param <T>  a type of vcf header line that extends VCFHeaderLine
     * @return true if the line was added to the map, false if it was not added because there's already a line with that key
     */
    private <T extends VCFHeaderLine> boolean addMetaDataLineMapLookupEntry(final Map<String, T> map, final String key, final T line) {
        if ( map.containsKey(key) ) {
            if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
                System.err.println("Found duplicate VCF header lines for " + key + "; keeping the first only" );
            }
            return false;
        }

        map.put(key, line);
        return true;
    }

    /**
     * Check for the presence of a format line with the deprecated key {@link VCFConstants#GENOTYPE_LIKELIHOODS_KEY}.
     * If one is present, and there isn't a format line with the key {@link VCFConstants#GENOTYPE_PL_KEY}, adds
     * a new format line with the key {@link VCFConstants#GENOTYPE_PL_KEY}.
     */
    private void checkForDeprecatedGenotypeLikelihoodsKey() {
        if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.GENOTYPE_PL_KEY) ) {
            if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
                System.err.println("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no "
                        + VCFConstants.GENOTYPE_PL_KEY + " field.  We now only manage PL fields internally"
                        + " automatically adding a corresponding PL field to your VCF header");
            }
            // addMetaDataLine calls back into this method, but by then the PL line exists,
            // so the recursion stops after one level.
            addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
        }
    }

    /**
     * get the header fields in order they're presented in the input file (which is now required to be
     * the order presented in the spec).
     *
     * @return a set of the header fields, in order
     */
    public Set<HEADER_FIELDS> getHeaderFields() {
        return new LinkedHashSet<HEADER_FIELDS>(Arrays.asList(HEADER_FIELDS.values()));
    }

    /**
     * get the meta data associated with this header, in the order the lines were added,
     * preceded by a VCF 4.2 version line
     *
     * @return an unmodifiable set of the meta data
     */
    public Set<VCFHeaderLine> getMetaDataInInputOrder() {
        return makeGetMetaDataSet(mMetaData);
    }

    /**
     * get the meta data associated with this header, sorted by the natural ordering of
     * the header lines, preceded by a VCF 4.2 version line
     *
     * @return an unmodifiable set of the meta data
     */
    public Set<VCFHeaderLine> getMetaDataInSortedOrder() {
        return makeGetMetaDataSet(new TreeSet<VCFHeaderLine>(mMetaData));
    }

    /**
     * Builds the externally visible meta data set: a VCF 4.2 version line followed by the
     * given lines in their iteration order.
     *
     * @param headerLinesInSomeOrder the header lines, already in the desired order
     * @return an unmodifiable, insertion-ordered set starting with the version line
     */
    private static Set<VCFHeaderLine> makeGetMetaDataSet(final Set<VCFHeaderLine> headerLinesInSomeOrder) {
        final Set<VCFHeaderLine> lines = new LinkedHashSet<>();
        lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString()));
        lines.addAll(headerLinesInSomeOrder);
        return Collections.unmodifiableSet(lines);
    }

    /**
     * Get the first VCFHeaderLine (in input order) whose key equals key.
     *
     * @param key the header line key to look for
     * @return the matching header line, or null if no such line exists
     */
    public VCFHeaderLine getMetaDataLine(final String key) {
        for (final VCFHeaderLine line : mMetaData) {
            if ( line.getKey().equals(key) ) {
                return line;
            }
        }

        return null;
    }

    /**
     * get the genotyping sample names
     *
     * Note: this returns the header's internal list itself, not a copy.
     *
     * @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
     */
    public List<String> getGenotypeSamples() {
        return mGenotypeSampleNames;
    }

    /** @return the number of genotype sample columns */
    public int getNGenotypeSamples() {
        return mGenotypeSampleNames.size();
    }

    /**
     * do we have genotyping data?
     *
     * @return true if we have genotyping columns, false otherwise
     */
    public boolean hasGenotypingData() {
        return getNGenotypeSamples() > 0;
    }

    /**
     * were the input samples sorted originally?
     *
     * @return true if the input samples were sorted originally, false otherwise
     */
    public boolean samplesWereAlreadySorted() {
        return samplesWereAlreadySorted;
    }

    /** @return the column count: the fixed fields, plus one extra column and one per sample when samples are present */
    public int getColumnCount() {
        return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
    }

    /**
     * Returns the INFO HeaderLines in their original ordering
     */
    public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
        return mInfoMetaData.values();
    }

    /**
     * Returns the FORMAT HeaderLines in their original ordering
     */
    public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
        return mFormatMetaData.values();
    }

    /**
     * @param id the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFInfoHeaderLine getInfoHeaderLine(final String id) {
        return mInfoMetaData.get(id);
    }

    /**
     * @param id the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFFormatHeaderLine getFormatHeaderLine(final String id) {
        return mFormatMetaData.get(id);
    }

    /**
     * @param id the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFFilterHeaderLine getFilterHeaderLine(final String id) {
        return mFilterMetaData.get(id);
    }

    /** @return true if an INFO line with the given id is present */
    public boolean hasInfoLine(final String id) {
        return getInfoHeaderLine(id) != null;
    }

    /** @return true if a FORMAT line with the given id is present */
    public boolean hasFormatLine(final String id) {
        return getFormatHeaderLine(id) != null;
    }

    /** @return true if a FILTER line with the given id is present */
    public boolean hasFilterLine(final String id) {
        return getFilterHeaderLine(id) != null;
    }

    /**
     * @param key the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFHeaderLine getOtherHeaderLine(final String key) {
        return mOtherMetaData.get(key);
    }

    /**
     * Returns the other HeaderLines in their original ordering
     */
    public Collection<VCFHeaderLine> getOtherHeaderLines() {
        return mOtherMetaData.values();
    }

    /**
     * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output.
     * @return true if additional engine headers will be written to the VCF
     */
    public boolean isWriteEngineHeaders() {
        return writeEngineHeaders;
    }

    /**
     * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output.
     * @param writeEngineHeaders true if additional engine headers will be written to the VCF
     */
    public void setWriteEngineHeaders(final boolean writeEngineHeaders) {
        this.writeEngineHeaders = writeEngineHeaders;
    }

    /**
     * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF.
     * @return true if the command line will be written to the VCF
     */
    public boolean isWriteCommandLine() {
        return writeCommandLine;
    }

    /**
     * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF.
     * @param writeCommandLine true if the command line will be written to the VCF
     */
    public void setWriteCommandLine(final boolean writeCommandLine) {
        this.writeCommandLine = writeCommandLine;
    }

    /**
     * @return the sample names sorted by natural String order, or null if this header was
     *         created without going through a constructor that takes sample names
     */
    public ArrayList<String> getSampleNamesInOrder() {
        return sampleNamesInOrder;
    }

    /**
     * @return a map from each sample name to its index of appearance, or null if this header was
     *         created without going through a constructor that takes sample names
     */
    public HashMap<String, Integer> getSampleNameToOffset() {
        return sampleNameToOffset;
    }

    @Override
    public String toString() {
        final StringBuilder b = new StringBuilder();
        b.append("[VCFHeader:");
        for ( final VCFHeaderLine line : mMetaData ) {
            b.append("\n\t").append(line);
        }
        return b.append("\n]").toString();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy