// NOTE(review): scraped web-page banner, not Java source — commented out so the file parses.
// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// htsjdk.variant.bcf2.BCF2Codec Maven / Gradle / Ivy
//
// There is a newer version: 4.1.3
// Show newest version
/*
* Copyright (c) 2012 The Broad Institute
* 
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
* 
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* 
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package htsjdk.variant.bcf2;

import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.BinaryFeatureCodec;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodecHeader;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.readers.*;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.LazyGenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.VariantContextUtils;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineType;

import java.io.*;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Decode BCF2 files
 *
 * Reads the BCF2 magic/version and the embedded textual VCF header, then decodes
 * records one at a time: sites data eagerly, genotype data lazily on demand.
 */
public class BCF2Codec extends BinaryFeatureCodec<VariantContext> {
    protected final static int ALLOWED_MAJOR_VERSION = 2;
    protected final static int ALLOWED_MINOR_VERSION = 1;
    public static final BCFVersion ALLOWED_BCF_VERSION = new BCFVersion(ALLOWED_MAJOR_VERSION, ALLOWED_MINOR_VERSION);

    /** sizeof a BCF header (magic + major/minor version bytes). Used when trying to detect when a stream starts with a BCF header */
    public static final int SIZEOF_BCF_HEADER =  BCFVersion.MAGIC_HEADER_START.length + 2*Byte.BYTES;

    // version read from the stream; set by readHeader()
    private BCFVersion bcfVersion = null;

    // VCF header parsed out of the BCF2 header block; set by readHeader()
    private VCFHeader header = null;

    /**
     * Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field
     */
    private final ArrayList<String> contigNames = new ArrayList<>();

    /**
     * Maps header string names (encoded in VCF) into strings found in the BCF header
     *
     * Initialized when processing the header
     */
    private ArrayList<String> dictionary;

    /**
     * Our decoder that reads low-level objects from the BCF2 records
     */
    private final BCF2Decoder decoder = new BCF2Decoder();

    /**
     * Provides some sanity checking on the header (cap of 128 MB on the declared header size)
     */
    private final static int MAX_HEADER_SIZE = 0x08000000;

    /**
     * Genotype field decoders that are initialized when the header is read
     */
    private BCF2GenotypeFieldDecoders gtFieldDecoders = null;

    /**
     * A cached array of GenotypeBuilders for efficient genotype decoding.
     *
     * Caching it allows us to avoid recreating this intermediate data
     * structure each time we decode genotypes
     */
    private GenotypeBuilder[] builders = null;

    // for error handling: 1-based record counter and current 1-based position
    private int recordNo = 0;
    private int pos = 0;

    // ----------------------------------------------------------------------
    //
    // Feature codec interface functions
    //
    // ----------------------------------------------------------------------

    /**
     * {@inheritDoc}
     *
     * BCF2 stores the location inside the compressed record body, so there is no
     * cheaper location-only path: this simply performs a full {@link #decode}.
     */
    @Override
    public Feature decodeLoc( final PositionalBufferedStream inputStream ) {
        return decode(inputStream);
    }

    /**
     * Decode the next BCF2 record from the stream into a fully decoded VariantContext.
     *
     * Each record is two length-prefixed blocks: the sites block (decoded eagerly)
     * and the genotype block (captured as bytes and decoded lazily on demand).
     *
     * @param inputStream stream positioned at the start of a BCF2 record
     * @return the decoded VariantContext
     * @throws TribbleException if an I/O error occurs while reading the record
     */
    @Override
    public VariantContext decode( final PositionalBufferedStream inputStream ) {
        try {
            recordNo++;  // used only for error reporting
            final VariantContextBuilder builder = new VariantContextBuilder();

            // each record starts with the byte sizes of its two blocks
            final int sitesBlockSize = decoder.readBlockSize(inputStream);
            final int genotypeBlockSize = decoder.readBlockSize(inputStream);

            // sites block: CHROM/POS/END, then QUAL, ID, alleles, FILTER and INFO
            decoder.readNextBlock(sitesBlockSize, inputStream);
            decodeSiteLoc(builder);
            final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder);

            // genotype block: buffered whole; parsed only if genotypes are requested
            decoder.readNextBlock(genotypeBlockSize, inputStream);
            createLazyGenotypesDecoder(info, builder);
            return builder.fullyDecoded(true).make();
        } catch ( IOException e ) {
            throw new TribbleException("Failed to read BCF file", e);
        }
    }

    /**
     * {@inheritDoc}
     *
     * @return the Feature type produced by this codec, always {@link VariantContext}
     */
    @Override
    public Class<VariantContext> getFeatureType() {
        return VariantContext.class;
    }

    /**
     * Validate the actual version against the supported version to determine compatibility. Throws a
     * TribbleException if the actualVersion is not compatible with the supportedVersion. Subclasses can override
     * this to provide a custom version compatibility policy, but allowing something other than the
     * supported version is dangerous and should be done with great care.
     *
     * The default policy is to require an exact version match.
     * @param supportedVersion the current BCF implementation version
     * @param actualVersion the actual version
     * @throws TribbleException if the version policy determines that {@code actualVersion} is not compatible
     * with {@code supportedVersion}
     */
    protected void validateVersionCompatibility(final BCFVersion supportedVersion, final BCFVersion actualVersion) {
        // Compare against the supplied supportedVersion (not the static constants or the
        // bcfVersion field) so the method honors its parameters as documented and is
        // safe to call before readHeader() has populated any state.
        if ( actualVersion.getMajorVersion() != supportedVersion.getMajorVersion() ) {
            error("BCF2Codec can only process BCF2 files, this file has major version " + actualVersion.getMajorVersion());
        }

        // require the minor version to be an exact match and reject minor versions from the future
        if ( actualVersion.getMinorVersion() != supportedVersion.getMinorVersion() ) {
            error("BCF2Codec can only process BCF2 files with minor version = " + supportedVersion.getMinorVersion() + " but this file has minor version " + actualVersion.getMinorVersion());
        }
    }

    /**
     * Read and parse the BCF2 header: magic + version, the embedded textual VCF header,
     * then initialize the contig table, string dictionary, and genotype decoding state.
     *
     * @param inputStream stream positioned at the very start of the BCF2 file
     * @return the parsed header plus the stream position of the first record byte
     * @throws TribbleException if the stream is not valid BCF2 or the header is malformed
     */
    @Override
    public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) {
        try {
            // note that this reads the magic as well, and so does double duty
            bcfVersion = BCFVersion.readBCFVersion(inputStream);
            if ( bcfVersion == null ) {
                error("Input stream does not contain a BCF encoded file; BCF magic header info not found");
            }

            validateVersionCompatibility(BCF2Codec.ALLOWED_BCF_VERSION, bcfVersion);
            if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
                System.err.println("Parsing data stream with BCF version " + bcfVersion);
            }

            final int headerSizeInBytes = BCF2Type.INT32.read(inputStream);

            if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 128 MB
                error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);

            final byte[] headerBytes = new byte[headerSizeInBytes];
            if ( inputStream.read(headerBytes) != headerSizeInBytes )
                error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);

            // the embedded header is ordinary VCF text; delegate its parsing to the VCF codec
            try (final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes))) {
                final LineIterator lineIterator = new LineIteratorImpl(new SynchronousLineReader(bps));
                final VCFCodec headerParser = new VCFCodec();
                this.header = (VCFHeader) headerParser.readActualHeader(lineIterator);
            }
        } catch ( IOException e ) {
            // preserve the underlying cause rather than discarding it
            throw new TribbleException("I/O error while reading BCF2 header", e);
        }

        // create the contig offsets
        if ( ! header.getContigLines().isEmpty() ) {
            contigNames.clear();
            for ( final VCFContigHeaderLine contig : header.getContigLines()) {
                if ( contig.getID() == null || contig.getID().equals("") )
                    error("found a contig with an invalid ID " + contig);
                contigNames.add(contig.getID());
            }
        } else {
            error("Didn't find any contig lines in BCF2 file header");
        }

        // create the string dictionary
        dictionary = parseDictionary(header);

        // prepare the genotype field decoders
        gtFieldDecoders = new BCF2GenotypeFieldDecoders(header);

        // create and initialize the genotype builder array
        final int nSamples = header.getNGenotypeSamples();
        builders = new GenotypeBuilder[nSamples];
        for ( int i = 0; i < nSamples; i++ ) {
            builders[i] = new GenotypeBuilder(header.getGenotypeSamples().get(i));
        }

        // position right before next line (would be right before first real record byte at end of header)
        return new FeatureCodecHeader(header, inputStream.getPosition());
    }

    /**
     * Probe a path to see whether it looks like a BCF2 file this codec can read:
     * it must begin with the BCF magic bytes and carry a matching major version.
     *
     * @param path candidate file path
     * @return true iff the file starts with a BCF header of the supported major version
     */
    @Override
    public boolean canDecode( final String path ) {
        try (final InputStream candidate = Files.newInputStream(IOUtil.getPath(path))) {
            final BCFVersion detected = BCFVersion.readBCFVersion(candidate);
            if ( detected == null ) {
                return false;  // no BCF magic at the start of the stream
            }
            return detected.getMajorVersion() == ALLOWED_MAJOR_VERSION;
        } catch ( final IOException ignored ) {
            // unreadable path or stream: simply not decodable by this codec
            return false;
        }
    }

    // --------------------------------------------------------------------------------
    //
    // implicit block
    //
    // The first four records of BCF are inline untype encoded data of:
    //
    // 4 byte integer chrom offset
    // 4 byte integer start
    // 4 byte integer ref length
    // 4 byte float qual
    //
    // --------------------------------------------------------------------------------

    /**
     * Decode the implicit (untyped) location values at the start of the sites block
     * and store them in the builder: contig, start, and stop.
     *
     * @param builder destination for the decoded CHROM/start/stop values
     * @throws IOException if the decoder cannot read the underlying record bytes
     */
    private final void decodeSiteLoc(final VariantContextBuilder builder) throws IOException {
        // CHROM is stored as an offset into the header's contig lines
        final int contigOffset = decoder.decodeInt(BCF2Type.INT32);
        final String contig = lookupContigName(contigOffset);
        builder.chr(contig);

        this.pos = decoder.decodeInt(BCF2Type.INT32) + 1; // GATK is one based, BCF2 is zero-based
        final int refLength = decoder.decodeInt(BCF2Type.INT32);
        builder.start((long)pos);
        builder.stop((long)(pos + refLength - 1)); // minus one because GATK has closed intervals but BCF2 is open
    }

    /**
     * Decode the remaining sites-level data (QUAL, packed counts, ID, alleles,
     * FILTER, INFO) from this class's decoder into the builder.
     *
     * @param builder destination for the decoded site fields
     * @return the counts and alleles needed later to decode the genotype block
     * @throws IOException if the decoder cannot read the underlying record bytes
     */
    private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) throws IOException {
        final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT);
        if ( qual != null ) {
            builder.log10PError(((Double)qual) / -10.0);
        }

        // n_allele_info packs n_allele (high 16 bits) with n_info (low 16 bits);
        // n_fmt_sample packs n_fmt (high 8 bits) with n_sample (low 24 bits)
        final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32);
        final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32);
        final int nAlleles = nAlleleInfo >>> 16;          // unsigned shift: counts are never negative
        final int nInfo = nAlleleInfo & 0x0000FFFF;
        final int nFormatFields = nFormatSamples >>> 24;
        final int nSamples = nFormatSamples & 0x00FFFFFF; // full 24-bit n_sample (was 0x00FFFFF, silently dropping 4 bits)

        if ( header.getNGenotypeSamples() != nSamples )
            error("Reading BCF2 files with different numbers of samples per record " +
                    "is not currently supported.  Saw " + header.getNGenotypeSamples() +
                    " samples in header but have a record with " + nSamples + " samples");

        decodeID(builder);
        final List<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
        decodeFilter(builder);
        decodeInfo(builder, nInfo);

        final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles);
        if ( ! info.isValid() )
            error("Sites info is malformed: " + info);
        return info;
    }

    /**
     * Immutable holder for the per-record counts and alleles decoded from the sites
     * block, needed later to decode the record's genotype block.
     */
    protected final static class SitesInfoForDecoding {
        final int nFormatFields;   // number of FORMAT fields in the genotype block
        final int nSamples;        // number of samples in the genotype block
        final List<Allele> alleles;

        private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final List<Allele> alleles) {
            this.nFormatFields = nFormatFields;
            this.nSamples = nSamples;
            this.alleles = alleles;
        }

        /** @return true iff counts are non-negative and the first allele is the reference allele */
        public boolean isValid() {
            return nFormatFields >= 0 &&
                    nSamples >= 0 &&
                    alleles != null && ! alleles.isEmpty() && alleles.get(0).isReference();
        }

        @Override
        public String toString() {
            return String.format("nFormatFields = %d, nSamples = %d, alleles = %s", nFormatFields, nSamples, alleles);
        }
    }

    /**
     * Decode the ID field of the current record and store it in the builder,
     * falling back to the missing-ID marker when no ID is present.
     *
     * @param builder destination for the decoded ID
     */
    private void decodeID( final VariantContextBuilder builder ) throws IOException {
        final String id = (String)decoder.decodeTypedValue();
        if ( id != null ) {
            builder.id(id);
        } else {
            builder.noID();
        }
    }

    /**
     * Decode the alleles from this BCF2 record and put the results in builder.
     * The first allele encoded is always the reference allele.
     *
     * @param builder destination for the decoded alleles
     * @param pos the record's 1-based start position (unused here; kept for interface stability)
     * @param nAlleles number of alleles encoded in the record
     * @return the decoded alleles, reference first
     * @throws IOException if the decoder cannot read the underlying record bytes
     */
    private List<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) throws IOException {
        // TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes
        final List<Allele> alleles = new ArrayList<>(nAlleles);
        String ref = null;

        for ( int i = 0; i < nAlleles; i++ ) {
            final String alleleBases = (String)decoder.decodeTypedValue();

            // the first allele in a BCF2 record is the reference allele
            final boolean isRef = i == 0;
            final Allele allele = Allele.create(alleleBases, isRef);
            if ( isRef ) ref = alleleBases;

            alleles.add(allele);
        }
        assert ref != null;

        builder.alleles(alleles);

        assert !ref.isEmpty();

        return alleles;
    }

    /**
     * Decode the FILTER field of the current record and store the result in the builder.
     *
     * The encoded value is either missing (unfiltered), a single dictionary offset
     * (fast path), or a vector of dictionary offsets naming each filter.
     *
     * @param builder destination for the decoded filter state
     * @throws IOException if the decoder cannot read the underlying record bytes
     */
    private void decodeFilter( final VariantContextBuilder builder ) throws IOException {
        final Object value = decoder.decodeTypedValue();

        if ( value == null )
            builder.unfiltered();
        else {
            if ( value instanceof Integer ) {
                // fast path for single integer result
                final String filterString = getDictionaryString((Integer)value);
                if ( VCFConstants.PASSES_FILTERS_v4.equals(filterString))
                    builder.passFilters();
                else
                    builder.filter(filterString);
            } else {
                // restore the element type lost in the raw cast so int unboxing compiles
                for ( final int offset : (List<Integer>)value )
                    builder.filter(getDictionaryString(offset));
            }
        }
    }

    /**
     * Loop over the info field key / value pairs in this BCF2 record and decode them into the builder
     *
     * @param builder destination for the decoded INFO attributes
     * @param numInfoFields number of INFO key/value pairs encoded in the record
     * @throws IOException if the decoder cannot read the underlying record bytes
     */
    private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) throws IOException {
        if ( numInfoFields == 0 )
            // fast path, don't bother doing any work if there are no fields
            return;

        final Map<String, Object> infoFieldEntries = new HashMap<>(numInfoFields);
        for ( int i = 0; i < numInfoFields; i++ ) {
            final String key = getDictionaryString();
            Object value = decoder.decodeTypedValue();
            final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, key);
            if ( metaData.getType() == VCFHeaderLineType.Flag ) value = true; // special case for flags
            infoFieldEntries.put(key, value);
        }

        builder.attributes(infoFieldEntries);
    }

    // --------------------------------------------------------------------------------
    //
    // Decoding Genotypes
    //
    // --------------------------------------------------------------------------------

    /**
     * Create the lazy loader for the genotypes data, and store it in the builder
     * so that the VC will be able to decode on demand the genotypes data
     *
     * @param siteInfo counts and alleles decoded from the sites block
     * @param builder destination for the lazy genotypes context
     */
    private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo,
                                             final VariantContextBuilder builder ) {
        if (siteInfo.nSamples > 0) {
            final LazyGenotypesContext.LazyParser lazyParser =
                    new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders);

            // capture the raw genotype block bytes; they are only parsed if genotypes are requested
            final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes());
            final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, header.getNGenotypeSamples());

            // did we resort the sample names?  If so, we need to load the genotype data
            if ( !header.samplesWereAlreadySorted() )
                lazy.decode();

            builder.genotypesNoValidation(lazy);
        }
    }

    /**
     * Container for the still-encoded genotype bytes of one record, together with
     * the header and FORMAT-field count needed to decode them on demand.
     */
    public static class LazyData {
        final public VCFHeader header;
        // number of FORMAT fields encoded in the genotype block
        final public int nGenotypeFields;
        // raw BCF2-encoded genotype block bytes
        final public byte[] bytes;

        public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes) {
            this.header = header;
            this.nGenotypeFields = nGenotypeFields;
            this.bytes = bytes;
        }
    }

    /**
     * Read the next typed value from the decoder as a dictionary offset and
     * translate it into the corresponding header string.
     */
    private final String getDictionaryString() throws IOException {
        return getDictionaryString((Integer) decoder.decodeTypedValue());
    }

    /**
     * Translate a dictionary offset (as encoded in the BCF file) into the
     * corresponding string from the header dictionary.
     *
     * @param offset index into the header string dictionary
     * @return the dictionary string at {@code offset}
     */
    protected final String getDictionaryString(final int offset) {
        return dictionary.get(offset);
    }

    /**
     * Translate the contig offset as encoded in the BCF file into the actual string
     * name of the contig from the header's contig dictionary
     *
     * @param contigOffset index into the contig-name table built by readHeader()
     * @return the contig name at {@code contigOffset}
     */
    private final String lookupContigName( final int contigOffset ) {
        return contigNames.get(contigOffset);
    }

    /**
     * Build the header string dictionary used to translate BCF2 offsets back into
     * VCF header strings.
     *
     * @param header the parsed VCF header
     * @return a non-empty dictionary of header strings
     */
    private final ArrayList<String> parseDictionary(final VCFHeader header) {
        final ArrayList<String> dict = BCF2Utils.makeDictionary(header);

        // if we got here we never found a dictionary, or there are no elements in the dictionary
        if ( dict.isEmpty() )
            error("Dictionary header element was absent or empty");

        return dict;
    }

    /**
     * @return the VCFHeader we found in this BCF2 file; null until {@link #readHeader} has run
     */
    protected VCFHeader getHeader() {
        return header;
    }

    /**
     * Look up the genotype (FORMAT) field decoder for a given field name.
     *
     * @param field the FORMAT field name
     * @return the decoder to use for {@code field}
     */
    protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) {
        return gtFieldDecoders.getDecoder(field);
    }

    /**
     * Throw a TribbleException annotated with the current record number and
     * position so parse failures can be located in the input.
     *
     * @param message description of the failure
     * @throws RuntimeException always (a TribbleException)
     */
    protected void error(final String message) throws RuntimeException {
        throw new TribbleException(String.format("%s, at record %d with position %d:", message, recordNo, pos));
    }
    
    /**
     * Try to read a BCFVersion from an uncompressed BufferedInputStream without
     * consuming it. The stream's buffer must be large enough to contain
     * {@link #SIZEOF_BCF_HEADER}.
     *
     * @param uncompressedBufferedInput the uncompressed input stream
     * @return the BCFVersion if it can be decoded, or null if not found.
     * @throws IOException
     */
    public static BCFVersion tryReadBCFVersion(final BufferedInputStream uncompressedBufferedInput) throws IOException {
        // remember the current position so the probe can be undone
        uncompressedBufferedInput.mark(SIZEOF_BCF_HEADER);
        final BCFVersion probedVersion = BCFVersion.readBCFVersion(uncompressedBufferedInput);
        // rewind so the caller sees the stream exactly as it was handed to us
        uncompressedBufferedInput.reset();
        return probedVersion;
    }
    
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy  (scraped page footer, commented out so the file parses)