/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package htsjdk.variant.variantcontext.writer;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.tribble.index.IndexCreator;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.bcf2.BCF2Type;
import htsjdk.variant.bcf2.BCF2Utils;
import htsjdk.variant.bcf2.BCFVersion;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.LazyGenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFUtils;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
* VariantContextWriter that emits BCF2 binary encoding
*
* Overall structure of this writer is complex for efficiency reasons
*
* -- The BCF2Writer manages the low-level BCF2 encoder, the mappings
* from contigs and strings to offsets, the VCF header, and holds the
* lower-level encoders that map from VC and Genotype fields to their
* specific encoders. This class also writes out the standard BCF2 fields
* like POS, contig, the size of info and genotype data, QUAL, etc. It
 * has loops over the INFO and GENOTYPE fields to encode each individual datum
 * with the generic field encoders, but the actual encoding work is
 * done by the FieldWriter classes themselves
*
 * -- BCF2FieldWriters are specialized classes for writing out SITE and
 * genotype information for specific SITE/GENOTYPE fields (like AC for
 * sites and GQ for genotypes). These are objects in their own right because
 * they manage all of the complexity of relating the types declared in the VCF header
 * with the proper encoding in BCF2 as well as the Java type representing the value.
 * Relating all three of these pieces of information together
 * is the main complexity challenge in the encoder. The piece of code
 * that determines which FieldWriters to associate with each SITE and
 * GENOTYPE field is the BCF2FieldWriterManager. These FieldWriters
 * are specialized for specific combinations of encoders (see below)
 * and contexts (genotypes) for efficiency, so they smartly manage
 * the writing of PLs (encoded as int[]) directly into the lowest
 * level BCF2Encoder.
*
* -- At the third level is the BCF2FieldEncoder, relatively simple
* pieces of code that handle the task of determining the right
* BCF2 type for specific field values, as well as reporting back
* information such as the number of elements used to encode it
* (simple for atomic values like Integer but complex for PLs
* or lists of strings)
*
* -- At the lowest level is the BCF2Encoder itself. This provides
* just the limited encoding methods specified by the BCF2 specification. This encoder
* doesn't do anything but make it possible to conveniently write out valid low-level
* BCF2 constructs.
*
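 * A minimal usage sketch (illustrative only; the names outFile, out, refDict, vcfHeader
 * and variants are assumed inputs, and real callers typically obtain writers through
 * VariantContextWriterBuilder rather than constructing this package-private class directly):
 * <pre>{@code
 * final BCF2Writer writer = new BCF2Writer(outFile, out, refDict,
 *                                          true,    // enableOnTheFlyIndexing
 *                                          false);  // doNotWriteGenotypes
 * writer.writeHeader(vcfHeader);            // the header must be written before any records
 * for (final VariantContext vc : variants) {
 *     writer.add(vc);
 * }
 * writer.close();                           // flushes the underlying stream
 * }</pre>
 *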
* @author Mark DePristo
* @since 06/12
*/
class BCF2Writer extends IndexingVariantContextWriter {
public static final int MAJOR_VERSION = 2;
public static final int MINOR_VERSION = 1;
    private static final boolean ALLOW_MISSING_CONTIG_LINES = false;
private final OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support
private VCFHeader header;
    private final Map<String, Integer> contigDictionary = new HashMap<>();
    private final Map<String, Integer> stringDictionaryMap = new LinkedHashMap<>();
private final boolean doNotWriteGenotypes;
private String[] sampleNames = null;
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
/**
* cached results for whether we can write out raw genotypes data.
*/
private VCFHeader lastVCFHeaderOfUnparsedGenotypes = null;
private boolean canPassOnUnparsedGenotypeDataForLastVCFHeader = false;
// is the header or body written to the output stream?
private boolean outputHasBeenWritten;
public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict,
final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
this(IOUtil.toPath(location), output, refDict, enableOnTheFlyIndexing, doNotWriteGenotypes);
}
public BCF2Writer(final Path location, final OutputStream output, final SAMSequenceDictionary refDict,
final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
this.outputStream = getOutputStream();
this.doNotWriteGenotypes = doNotWriteGenotypes;
}
public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict,
final IndexCreator indexCreator,
final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
this(IOUtil.toPath(location), output, refDict, indexCreator, enableOnTheFlyIndexing,
doNotWriteGenotypes);
}
public BCF2Writer(final Path location, final OutputStream output, final SAMSequenceDictionary refDict,
final IndexCreator indexCreator,
final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing, indexCreator);
this.outputStream = getOutputStream();
this.doNotWriteGenotypes = doNotWriteGenotypes;
}
// --------------------------------------------------------------------------------
//
// Interface functions
//
// --------------------------------------------------------------------------------
@Override
public void writeHeader(VCFHeader header) {
setHeader(header);
try {
// write out the header into a byte stream, get its length, and write everything to the file
final ByteArrayOutputStream capture = new ByteArrayOutputStream();
final OutputStreamWriter writer = new OutputStreamWriter(capture);
this.header = VCFWriter.writeHeader(this.header, writer, VCFWriter.getVersionLine(), "BCF2 stream");
            writer.append('\0'); // the header text is null-terminated
writer.close();
final byte[] headerBytes = capture.toByteArray();
new BCFVersion(MAJOR_VERSION, MINOR_VERSION).write(outputStream);
BCF2Type.INT32.write(headerBytes.length, outputStream);
outputStream.write(headerBytes);
outputHasBeenWritten = true;
} catch (IOException e) {
throw new RuntimeIOException("BCF2 stream: Got IOException while trying to write BCF2 header", e);
}
}
@Override
public void add( VariantContext vc ) {
if ( doNotWriteGenotypes )
vc = new VariantContextBuilder(vc).noGenotypes().make();
vc = vc.fullyDecode(header, false);
super.add(vc); // allow on the fly indexing
try {
final byte[] infoBlock = buildSitesData(vc);
final byte[] genotypesBlock = buildSamplesData(vc);
// write the two blocks to disk
writeBlock(infoBlock, genotypesBlock);
outputHasBeenWritten = true;
}
catch ( IOException e ) {
throw new RuntimeIOException("Error writing record to BCF2 file: " + vc.toString(), e);
}
}
@Override
public void close() {
try {
outputStream.flush();
}
catch ( IOException e ) {
throw new RuntimeIOException("Failed to flush BCF2 file");
}
super.close();
}
@Override
public void setHeader(final VCFHeader header) {
if (outputHasBeenWritten) {
throw new IllegalStateException("The header cannot be modified after the header or variants have been written to the output stream.");
}
// make sure the header is sorted correctly
this.header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : new VCFHeader(
header.getMetaDataInSortedOrder(), header.getGenotypeSamples());
// create the config offsets map
if ( this.header.getContigLines().isEmpty() ) {
if ( ALLOW_MISSING_CONTIG_LINES ) {
if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
System.err.println("No contig dictionary found in header, falling back to reference sequence dictionary");
}
createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null));
} else {
throw new IllegalStateException("Cannot write BCF2 file with missing contig lines");
}
} else {
createContigDictionary(this.header.getContigLines());
}
// set up the map from dictionary string values -> offset
        final ArrayList<String> dict = BCF2Utils.makeDictionary(this.header);
for ( int i = 0; i < dict.size(); i++ ) {
stringDictionaryMap.put(dict.get(i), i);
}
sampleNames = this.header.getGenotypeSamples().toArray(new String[this.header.getNGenotypeSamples()]);
// setup the field encodings
fieldManager.setup(this.header, encoder, stringDictionaryMap);
}
// --------------------------------------------------------------------------------
//
// implicit block
//
    // The first four values of each BCF record are inline, untyped (raw) encoded data:
//
// 4 byte integer chrom offset
// 4 byte integer start
// 4 byte integer ref length
// 4 byte float qual
//
// --------------------------------------------------------------------------------
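    // Worked example (hypothetical values): for a biallelic SNP at chr20:10 with QUAL 29.5
    // and three INFO fields, buildSitesData() below would emit, in order:
    //   - the dictionary offset of "chr20"          as a raw INT32
    //   - 9     (1-based start 10 made 0-based)     as a raw INT32
    //   - 1     (ref length = 10 - 10 + 1)          as a raw INT32
    //   - 29.5f                                     as a raw FLOAT
    //   - (2 << 16) | 3                             packing n_allele and n_info
    // followed by the FORMAT-field/sample count word and the ID, allele, FILTER and INFO blocks.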
private byte[] buildSitesData( VariantContext vc ) throws IOException {
        final Integer contigIndex = contigDictionary.get(vc.getContig());
        if ( contigIndex == null )
throw new IllegalStateException(String.format("Contig %s not found in sequence dictionary from reference", vc.getContig()));
// note use of encodeRawValue to not insert the typing byte
encoder.encodeRawValue(contigIndex, BCF2Type.INT32);
// pos. GATK is 1 based, BCF2 is 0 based
encoder.encodeRawValue(vc.getStart() - 1, BCF2Type.INT32);
// ref length. GATK is closed, but BCF2 is open so the ref length is GATK end - GATK start + 1
// for example, a SNP is in GATK at 1:10-10, which has ref length 10 - 10 + 1 = 1
encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCF2Type.INT32);
// qual
if ( vc.hasLog10PError() )
encoder.encodeRawFloat((float) vc.getPhredScaledQual());
else
encoder.encodeRawMissingValue(BCF2Type.FLOAT);
// info fields
final int nAlleles = vc.getNAlleles();
final int nInfo = vc.getAttributes().size();
final int nGenotypeFormatFields = getNGenotypeFormatFields(vc);
final int nSamples = header.getNGenotypeSamples();
encoder.encodeRawInt((nAlleles << 16) | (nInfo & 0x0000FFFF), BCF2Type.INT32);
        encoder.encodeRawInt((nGenotypeFormatFields << 24) | (nSamples & 0x00FFFFFF), BCF2Type.INT32); // low 24 bits hold n_sample, high 8 bits n_fmt
buildID(vc);
buildAlleles(vc);
buildFilter(vc);
buildInfo(vc);
return encoder.getRecordBytes();
}
/**
     * Can we safely write out the raw (undecoded) genotype bytes of an input VC?
     *
     * The result is cached per header: if the lazy data's header is the same object as
     * lastVCFHeaderOfUnparsedGenotypes we simply return the previous answer. Otherwise we
     * use BCF2Utils to compare the lazy data's header with our header (expensive) and cache
     * the result.
     *
     * @param lazyData the undecoded genotype data along with the header it was read against
     * @return true if the raw genotype bytes can be passed through unchanged
*/
private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) {
if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) {
            // not cached for this header -- recompute and update the cache
canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header);
lastVCFHeaderOfUnparsedGenotypes = lazyData.header;
}
return canPassOnUnparsedGenotypeDataForLastVCFHeader;
}
private BCF2Codec.LazyData getLazyData(final VariantContext vc) {
if ( vc.getGenotypes().isLazyWithData() ) {
final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();
if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData &&
canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) {
return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData();
} else {
lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long
}
}
return null;
}
/**
     * Get the number of genotype FORMAT fields as efficiently as possible.
     *
     * If this is a lazy BCF2 object we just grab the field count from the lazy data,
     * otherwise we compute the VCF genotype keys from the actual (decoded) data.
     *
     * @param vc the record being written
     * @return the number of FORMAT fields to emit for this record
*/
private int getNGenotypeFormatFields(final VariantContext vc) {
final BCF2Codec.LazyData lazyData = getLazyData(vc);
return lazyData != null ? lazyData.nGenotypeFields : vc.calcVCFGenotypeKeys(header).size();
}
private void buildID( VariantContext vc ) throws IOException {
encoder.encodeTypedString(vc.getID());
}
private void buildAlleles( VariantContext vc ) throws IOException {
for ( Allele allele : vc.getAlleles() ) {
final byte[] s = allele.getDisplayBases();
if ( s == null )
throw new IllegalStateException("BUG: BCF2Writer encountered null padded allele" + allele);
encoder.encodeTypedString(s);
}
}
private void buildFilter( VariantContext vc ) throws IOException {
if ( vc.isFiltered() ) {
encodeStringsByRef(vc.getFilters());
} else if ( vc.filtersWereApplied() ) {
encodeStringsByRef(Collections.singleton(VCFConstants.PASSES_FILTERS_v4));
} else {
encoder.encodeTypedMissing(BCF2Type.INT8);
}
}
private void buildInfo( VariantContext vc ) throws IOException {
        for ( Map.Entry<String, Object> infoFieldEntry : vc.getAttributes().entrySet() ) {
final String field = infoFieldEntry.getKey();
final BCF2FieldWriter.SiteWriter writer = fieldManager.getSiteFieldWriter(field);
if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "INFO");
writer.start(encoder, vc);
writer.site(encoder, vc);
writer.done(encoder, vc);
}
}
private byte[] buildSamplesData(final VariantContext vc) throws IOException {
final BCF2Codec.LazyData lazyData = getLazyData(vc); // has critical side effects
if ( lazyData != null ) {
// we never decoded any data from this BCF file, so just pass it back
return lazyData.bytes;
}
// we have to do work to convert the VC into a BCF2 byte stream
        final List<String> genotypeFields = vc.calcVCFGenotypeKeys(header);
for ( final String field : genotypeFields ) {
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT");
assert writer != null;
writer.start(encoder, vc);
for ( final String name : sampleNames ) {
Genotype g = vc.getGenotype(name);
if ( g == null ) g = GenotypeBuilder.createMissing(name, writer.nValuesPerGenotype);
writer.addGenotype(encoder, vc, g);
}
writer.done(encoder, vc);
}
return encoder.getRecordBytes();
}
/**
     * Throws a meaningful error when a field (INFO or FORMAT) is encountered while writing
     * out a record but no corresponding header line has been defined for it.
     *
     * @param vc the record being written
     * @param field the offending field key
     * @param fieldType "INFO" or "FORMAT", used only in the error message
*/
private void errorUnexpectedFieldToWrite(final VariantContext vc, final String field, final String fieldType) {
throw new IllegalStateException("Found field " + field + " in the " + fieldType + " fields of VariantContext at " +
vc.getContig() + ":" + vc.getStart() + " from " + vc.getSource() + " but this hasn't been defined in the VCFHeader");
}
// --------------------------------------------------------------------------------
//
// Low-level block encoding
//
// --------------------------------------------------------------------------------
/**
     * Write the already-encoded site and genotype byte blocks to the output stream as a
     * length-prefixed block of data. After this call the encoder stream will be ready to
     * start a new data block
*
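     * A sketch of the resulting on-disk framing, as implied by the writes below (terminology
     * mirrors the BCF2 spec's l_shared / l_indiv fields; illustrative only):
     * <pre>
     *   [ INT32: site block length ][ INT32: genotype block length ]
     *   [ site block bytes ....... ][ genotype block bytes ....... ]
     * </pre>
     *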
* @throws IOException
*/
private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException {
BCF2Type.INT32.write(infoBlock.length, outputStream);
BCF2Type.INT32.write(genotypesBlock.length, outputStream);
outputStream.write(infoBlock);
outputStream.write(genotypesBlock);
}
    private BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
        final List<Integer> offsets = new ArrayList<>(strings.size());
        // look up the dictionary offset of each string; the smallest integer type that fits all offsets is chosen below
for ( final String string : strings ) {
final Integer got = stringDictionaryMap.get(string);
if ( got == null ) throw new IllegalStateException("Format error: could not find string " + string + " in header as required by BCF");
final int offset = got;
offsets.add(offset);
}
final BCF2Type type = BCF2Utils.determineIntegerType(offsets);
encoder.encodeTyped(offsets, type);
return type;
}
/**
* Create the contigDictionary from the contigLines extracted from the VCF header
*
     * @param contigLines the contig header lines, in the order that determines their offsets
*/
    private void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
int offset = 0;
for ( VCFContigHeaderLine contig : contigLines )
contigDictionary.put(contig.getID(), offset++);
}
}