All Downloads are FREE. Search and download functionalities are using the official Maven repository.

htsjdk.variant.variantcontext.writer.BCF2Writer Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/*
* Copyright (c) 2012 The Broad Institute
* 
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
* 
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* 
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package htsjdk.variant.variantcontext.writer;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.tribble.index.IndexCreator;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.bcf2.BCF2Type;
import htsjdk.variant.bcf2.BCF2Utils;
import htsjdk.variant.bcf2.BCFVersion;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.LazyGenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFUtils;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * VariantContextWriter that emits BCF2 binary encoding
 *
 * Overall structure of this writer is complex for efficiency reasons
 *
 * -- The BCF2Writer manages the low-level BCF2 encoder, the mappings
 * from contigs and strings to offsets, the VCF header, and holds the
 * lower-level encoders that map from VC and Genotype fields to their
 * specific encoders.  This class also writes out the standard BCF2 fields
 * like POS, contig, the size of info and genotype data, QUAL, etc.  It
 * has loops over the INFO and GENOTYPES to encode each individual datum
 * with the generic field encoders, but the actual encoding work is
 * done by the FieldWriters classes themselves
 *
 * -- BCF2FieldWriter are specialized classes for writing out SITE and
 * genotype information for specific SITE/GENOTYPE fields (like AC for
 * sites and GQ for genotypes).  These are objects in themselves because
 * they manage all of the complexity of relating the types in the VCF header
 * with the proper encoding in BCF as well as the type representing this
 * in java.  Relating all three of these pieces of information together
 * is the main complexity challenge in the encoder.  The piece of code
 * that determines which FieldWriters to associate with each SITE and
 * GENOTYPE field is the BCF2FieldWriterManager.  These FieldWriters
 * are specialized for specific combinations of encoders (see below)
 * and contexts (genotypes) for efficiency, so they smartly manage
 * the writing of PLs (encoded as int[]) directly into the lowest
 * level BCFEncoder.
 *
 * -- At the third level is the BCF2FieldEncoder, relatively simple
 * pieces of code that handle the task of determining the right
 * BCF2 type for specific field values, as well as reporting back
 * information such as the number of elements used to encode it
 * (simple for atomic values like Integer but complex for PLs
 * or lists of strings)
 *
 * -- At the lowest level is the BCF2Encoder itself.  This provides
 * just the limited encoding methods specified by the BCF2 specification.  This encoder
 * doesn't do anything but make it possible to conveniently write out valid low-level
 * BCF2 constructs.
 *
 * @author Mark DePristo
 * @since 06/12
 */
class BCF2Writer extends IndexingVariantContextWriter {
    public static final int MAJOR_VERSION = 2;
    public static final int MINOR_VERSION = 1;

    final private static boolean ALLOW_MISSING_CONTIG_LINES = false;

    private final OutputStream outputStream;      // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support
    private VCFHeader header;
    // contig name -> offset of that contig in the header's contig dictionary
    private final Map<String, Integer> contigDictionary = new HashMap<>();
    // header dictionary string -> its offset; LinkedHashMap preserves the dictionary order
    private final Map<String, Integer> stringDictionaryMap = new LinkedHashMap<>();
    private final boolean doNotWriteGenotypes;
    private String[] sampleNames = null;

    private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
    final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();

    /**
     * cached results for whether we can write out raw genotypes data.
     */
    private VCFHeader lastVCFHeaderOfUnparsedGenotypes = null;
    private boolean canPassOnUnparsedGenotypeDataForLastVCFHeader = false;

    // has the header or any record already been written to the output stream?
    private boolean outputHasBeenWritten;


    public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict,
                      final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
        this(IOUtil.toPath(location), output, refDict, enableOnTheFlyIndexing, doNotWriteGenotypes);
    }

    public BCF2Writer(final Path location, final OutputStream output, final SAMSequenceDictionary refDict,
        final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
        super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
        this.outputStream = getOutputStream();
        this.doNotWriteGenotypes = doNotWriteGenotypes;
    }

    public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict,
        final IndexCreator indexCreator,
        final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
        this(IOUtil.toPath(location), output, refDict, indexCreator, enableOnTheFlyIndexing,
            doNotWriteGenotypes);
    }

    public BCF2Writer(final Path location, final OutputStream output, final SAMSequenceDictionary refDict,
                      final IndexCreator indexCreator,
                      final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
        super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing, indexCreator);
        this.outputStream = getOutputStream();
        this.doNotWriteGenotypes = doNotWriteGenotypes;
    }

    // --------------------------------------------------------------------------------
    //
    // Interface functions
    //
    // --------------------------------------------------------------------------------

    /**
     * Write the BCF2 magic/version, the length-prefixed VCF text header, and remember the
     * header for use when encoding records.
     *
     * @param header the header describing the records that will follow
     */
    @Override
    public void writeHeader(VCFHeader header) {
        setHeader(header);

        try {
            // write out the header into a byte stream, get its length, and write everything to the file
            final ByteArrayOutputStream capture = new ByteArrayOutputStream();
            // NOTE(review): uses the platform default charset, as the upstream VCFWriter does —
            // confirm whether an explicit charset is required before changing
            final OutputStreamWriter writer = new OutputStreamWriter(capture);
            this.header = VCFWriter.writeHeader(this.header, writer, VCFWriter.getVersionLine(), "BCF2 stream");
            writer.append('\0'); // the header is null terminated by a byte
            writer.close();

            final byte[] headerBytes = capture.toByteArray();
            new BCFVersion(MAJOR_VERSION, MINOR_VERSION).write(outputStream);
            BCF2Type.INT32.write(headerBytes.length, outputStream);
            outputStream.write(headerBytes);
            outputHasBeenWritten = true;
        } catch (IOException e) {
            throw new RuntimeIOException("BCF2 stream: Got IOException while trying to write BCF2 header", e);
        }
    }

    /**
     * Encode a single VariantContext as a BCF2 record (sites block + genotypes block)
     * and write it to the output stream.
     *
     * @param vc the variant to write; fully decoded here unless its lazy genotype
     *           bytes can be passed through untouched
     */
    @Override
    public void add( VariantContext vc ) {
        if ( doNotWriteGenotypes )
            vc = new VariantContextBuilder(vc).noGenotypes().make();
        vc = vc.fullyDecode(header, false);

        super.add(vc); // allow on the fly indexing

        try {
            final byte[] infoBlock = buildSitesData(vc);
            final byte[] genotypesBlock = buildSamplesData(vc);

            // write the two blocks to disk
            writeBlock(infoBlock, genotypesBlock);
            outputHasBeenWritten = true;
        }
        catch ( IOException e ) {
            throw new RuntimeIOException("Error writing record to BCF2 file: " + vc.toString(), e);
        }
    }

    @Override
    public void close() {
        try {
            outputStream.flush();
        }
        catch ( IOException e ) {
            // preserve the underlying cause so callers can diagnose the I/O failure
            throw new RuntimeIOException("Failed to flush BCF2 file", e);
        }
        super.close();
    }

    /**
     * Install the header this writer will encode against: sorts the metadata, builds the
     * contig and string dictionaries, captures the sample names, and configures the field
     * writers.  Must be called before any output has been written.
     *
     * @param header the header to install
     * @throws IllegalStateException if output has already been written, or if the header
     *         has no contig lines and {@link #ALLOW_MISSING_CONTIG_LINES} is false
     */
    @Override
    public void setHeader(final VCFHeader header) {
        if (outputHasBeenWritten) {
            throw new IllegalStateException("The header cannot be modified after the header or variants have been written to the output stream.");
        }
        // make sure the header is sorted correctly
        this.header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : new VCFHeader(
                header.getMetaDataInSortedOrder(), header.getGenotypeSamples());
        // create the contig offsets map
        if ( this.header.getContigLines().isEmpty() ) {
            if ( ALLOW_MISSING_CONTIG_LINES ) {
                if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
                    System.err.println("No contig dictionary found in header, falling back to reference sequence dictionary");
                }
                createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null));
            } else {
                throw new IllegalStateException("Cannot write BCF2 file with missing contig lines");
            }
        } else {
            createContigDictionary(this.header.getContigLines());
        }
        // set up the map from dictionary string values -> offset
        final ArrayList<String> dict = BCF2Utils.makeDictionary(this.header);
        for ( int i = 0; i < dict.size(); i++ ) {
            stringDictionaryMap.put(dict.get(i), i);
        }

        sampleNames = this.header.getGenotypeSamples().toArray(new String[this.header.getNGenotypeSamples()]);
        // setup the field encodings
        fieldManager.setup(this.header, encoder, stringDictionaryMap);

    }

    // --------------------------------------------------------------------------------
    //
    // implicit block
    //
    // The first four records of BCF are inline untyped encoded data of:
    //
    // 4 byte integer chrom offset
    // 4 byte integer start
    // 4 byte integer ref length
    // 4 byte float qual
    //
    // --------------------------------------------------------------------------------

    /**
     * Encode the sites-level data (CHROM/POS/rlen/QUAL, counts, ID, alleles, FILTER, INFO)
     * for vc into the encoder and return the resulting record bytes.
     *
     * @param vc the variant to encode
     * @return the encoded sites block
     * @throws IOException if the encoder fails
     * @throws IllegalStateException if vc's contig is not in the contig dictionary
     */
    private byte[] buildSitesData( VariantContext vc ) throws IOException {
        // Map.get returns null for an unknown contig; keep the boxed Integer so we can
        // detect that instead of NPEing on unboxing
        final Integer contigIndex = contigDictionary.get(vc.getContig());
        if ( contigIndex == null )
            throw new IllegalStateException(String.format("Contig %s not found in sequence dictionary from reference", vc.getContig()));

        // note use of encodeRawValue to not insert the typing byte
        encoder.encodeRawValue(contigIndex, BCF2Type.INT32);

        // pos.  GATK is 1 based, BCF2 is 0 based
        encoder.encodeRawValue(vc.getStart() - 1, BCF2Type.INT32);

        // ref length.  GATK is closed, but BCF2 is open so the ref length is GATK end - GATK start + 1
        // for example, a SNP is in GATK at 1:10-10, which has ref length 10 - 10 + 1 = 1
        encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCF2Type.INT32);

        // qual
        if ( vc.hasLog10PError() )
            encoder.encodeRawFloat((float) vc.getPhredScaledQual());
        else
            encoder.encodeRawMissingValue(BCF2Type.FLOAT);

        // info fields
        final int nAlleles = vc.getNAlleles();
        final int nInfo = vc.getAttributes().size();
        final int nGenotypeFormatFields = getNGenotypeFormatFields(vc);
        final int nSamples = header.getNGenotypeSamples();

        encoder.encodeRawInt((nAlleles << 16) | (nInfo & 0x0000FFFF), BCF2Type.INT32);
        // n_sample occupies the low 24 bits of this word; the previous 0x00FFFFF mask
        // (20 bits) would have corrupted records with >= 2^20 samples
        encoder.encodeRawInt((nGenotypeFormatFields << 24) | (nSamples & 0x00FFFFFF), BCF2Type.INT32);

        buildID(vc);
        buildAlleles(vc);
        buildFilter(vc);
        buildInfo(vc);

        return encoder.getRecordBytes();
    }


    /**
     * Can we safely write on the raw (undecoded) genotypes of an input VC?
     *
     * The result is cached: when lazyData.header == lastVCFHeaderOfUnparsedGenotypes we
     * return the previous answer; otherwise we use BCF2Utils to compare the VC header
     * with our header (expensive) and cache the result.
     *
     * @param lazyData the undecoded genotype data attached to an input VC
     * @return true if the raw bytes can be passed through unchanged
     */
    private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) {
        if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) {
            // not cached for this header -- compute and cache it
            canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header);
            lastVCFHeaderOfUnparsedGenotypes = lazyData.header;
        }

        return canPassOnUnparsedGenotypeDataForLastVCFHeader;
    }

    /**
     * Return vc's undecoded BCF2 lazy genotype data if it can safely be passed through,
     * otherwise null.  Side effect: if the lazy data cannot be passed through, it is
     * decoded here so stale raw bytes are not kept around.
     *
     * @param vc the variant whose genotypes may be lazy
     * @return the pass-through-able lazy data, or null
     */
    private BCF2Codec.LazyData getLazyData(final VariantContext vc) {
        if ( vc.getGenotypes().isLazyWithData() ) {
            final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();

            if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData &&
                    canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) {
                return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData();
            } else {
                lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long
            }
        }

        return null;
    }

    /**
     * Try to get the nGenotypeFields as efficiently as possible.
     *
     * If this is a lazy BCF2 object just grab the field count from there,
     * otherwise do the whole counting by types test in the actual data
     *
     * @param vc the variant being encoded
     * @return the number of genotype FORMAT fields for vc
     */
    private int getNGenotypeFormatFields(final VariantContext vc) {
        final BCF2Codec.LazyData lazyData = getLazyData(vc);
        return lazyData != null ? lazyData.nGenotypeFields : vc.calcVCFGenotypeKeys(header).size();
    }

    private void buildID( VariantContext vc ) throws IOException {
        encoder.encodeTypedString(vc.getID());
    }

    private void buildAlleles( VariantContext vc ) throws IOException {
        for ( final Allele allele : vc.getAlleles() ) {
            final byte[] s = allele.getDisplayBases();
            if ( s == null )
                throw new IllegalStateException("BUG: BCF2Writer encountered null padded allele" + allele);
            encoder.encodeTypedString(s);
        }
    }

    private void buildFilter( VariantContext vc ) throws IOException {
        if ( vc.isFiltered() ) {
            encodeStringsByRef(vc.getFilters());
        } else if ( vc.filtersWereApplied() ) {
            // filters were applied and none failed -> explicit PASS
            encodeStringsByRef(Collections.singleton(VCFConstants.PASSES_FILTERS_v4));
        } else {
            // filters never applied -> missing
            encoder.encodeTypedMissing(BCF2Type.INT8);
        }
    }

    private void buildInfo( VariantContext vc ) throws IOException {
        for ( final Map.Entry<String, Object> infoFieldEntry : vc.getAttributes().entrySet() ) {
            final String field = infoFieldEntry.getKey();
            final BCF2FieldWriter.SiteWriter writer = fieldManager.getSiteFieldWriter(field);
            if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "INFO");
            writer.start(encoder, vc);
            writer.site(encoder, vc);
            writer.done(encoder, vc);
        }
    }

    /**
     * Encode the genotypes block for vc.  If the VC carries undecoded BCF2 lazy data
     * whose header matches ours, the raw bytes are returned directly without re-encoding.
     *
     * @param vc the variant whose genotypes to encode
     * @return the encoded genotypes block
     * @throws IOException if the encoder fails
     */
    private byte[] buildSamplesData(final VariantContext vc) throws IOException {
        final BCF2Codec.LazyData lazyData = getLazyData(vc);  // has critical side effects
        if ( lazyData != null ) {
            // we never decoded any data from this BCF file, so just pass it back
            return lazyData.bytes;
        }

        // we have to do work to convert the VC into a BCF2 byte stream
        final List<String> genotypeFields = vc.calcVCFGenotypeKeys(header);
        for ( final String field : genotypeFields ) {
            final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
            if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT");

            assert writer != null; // errorUnexpectedFieldToWrite always throws

            writer.start(encoder, vc);
            for ( final String name : sampleNames ) {
                Genotype g = vc.getGenotype(name);
                if ( g == null ) g = GenotypeBuilder.createMissing(name, writer.nValuesPerGenotype);
                writer.addGenotype(encoder, vc, g);
            }
            writer.done(encoder, vc);
        }
        return encoder.getRecordBytes();
    }

    /**
     * Throws a meaningful error message when a field (INFO or FORMAT) is found when writing out a file
     * but there's no header line for it.
     *
     * @param vc the variant containing the unexpected field
     * @param field the field name with no header line
     * @param fieldType "INFO" or "FORMAT", for the error message
     */
    private void errorUnexpectedFieldToWrite(final VariantContext vc, final String field, final String fieldType) {
        throw new IllegalStateException("Found field " + field + " in the " + fieldType + " fields of VariantContext at " +
                vc.getContig() + ":" + vc.getStart() + " from " + vc.getSource() + " but this hasn't been defined in the VCFHeader");
    }

    // --------------------------------------------------------------------------------
    //
    // Low-level block encoding
    //
    // --------------------------------------------------------------------------------

    /**
     * Write the data in the encoder to the outputstream as a length encoded
     * block of data.  After this call the encoder stream will be ready to
     * start a new data block
     *
     * @throws IOException if writing to the output stream fails
     */
    private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException {
        BCF2Type.INT32.write(infoBlock.length, outputStream);
        BCF2Type.INT32.write(genotypesBlock.length, outputStream);
        outputStream.write(infoBlock);
        outputStream.write(genotypesBlock);
    }

    /**
     * Encode the given strings as typed references into the header string dictionary.
     *
     * @param strings the strings to encode by dictionary offset
     * @return the integer type used to encode the offsets
     * @throws IOException if the encoder fails
     * @throws IllegalStateException if a string is not in the header dictionary
     */
    private BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
        final List<Integer> offsets = new ArrayList<>(strings.size());

        // resolve every string to its dictionary offset up front
        for ( final String string : strings ) {
            final Integer got = stringDictionaryMap.get(string);
            if ( got == null ) throw new IllegalStateException("Format error: could not find string " + string + " in header as required by BCF");
            offsets.add(got);
        }

        // pick the smallest integer type that can hold all offsets
        final BCF2Type type = BCF2Utils.determineIntegerType(offsets);
        encoder.encodeTyped(offsets, type);
        return type;
    }

    /**
     * Create the contigDictionary from the contigLines extracted from the VCF header
     *
     * @param contigLines the contig header lines, in dictionary order
     */
    private void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
        int offset = 0;
        for ( final VCFContigHeaderLine contig : contigLines )
            contigDictionary.put(contig.getID(), offset++);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy