All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.utils.variant.writers.GVCFBlockCombiner Maven / Gradle / Ivy

There is a newer version: 4.6.0.0
Show newest version
package org.broadinstitute.hellbender.utils.variant.writers;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.iterators.PushPullTransformer;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.Queue;

import static htsjdk.variant.vcf.VCFConstants.MAX_GENOTYPE_QUAL;
import static org.broadinstitute.hellbender.utils.variant.writers.GVCFWriter.GVCF_BLOCK;

/**
 * Combines variants into GVCF blocks.
 */
public class GVCFBlockCombiner implements PushPullTransformer {
    final RangeMap> gqPartitions;
    final int defaultPloidy;
    final boolean floorBlocks;
    final Queue toOutput = new ArrayDeque<>();

    /**
     * fields updated on the fly during GVCFWriter operation
     */
    private int nextAvailableStart = -1;
    private String contigOfNextAvailableStart = null;
    private String sampleName = null;

    GVCFBlock currentBlock = null;

    public GVCFBlockCombiner(List gqPartitions, int defaultPloidy, boolean floorBlocks) {
        this.gqPartitions = parsePartitions(gqPartitions);
        this.defaultPloidy = defaultPloidy;
        this.floorBlocks = floorBlocks;
    }

    /**
     * Create {@link HomRefBlock}s which will collectively accept variants of any genotype quality
     *
     * Each individual block covers a band of genotype qualities with the splits between bands occurring at values in {@code gqPartitions}.
     * There will be {@code gqPartitions.size() +1} bands produced covering the entire possible range of genotype qualities from 0 to {@link VCFConstants#MAX_GENOTYPE_QUAL}.
     *
     * Note that this has to return a RangeMap with concrete types because Numbers aren't Comparable
     *
     * @param gqPartitions proposed GQ partitions
     * @return a list of HomRefBlocks accepting bands of genotypes qualities split at the points specified in gqPartitions
     */
    @VisibleForTesting
    RangeMap> parsePartitions(final List gqPartitions) {
        Utils.nonEmpty(gqPartitions);
        Utils.containsNoNull(gqPartitions, "The list of GQ partitions contains a null integer");
        final RangeMap> result = TreeRangeMap.create();
        int lastThreshold = 0;
        for (final Number num : gqPartitions) {
            final int value = num.intValue();
            if (value < 0) {
                throw new IllegalArgumentException("The list of GQ partitions contains a non-positive integer.");
            } else if (value > MAX_GENOTYPE_QUAL + 1) {
                throw new IllegalArgumentException(String.format("The value %d in the list of GQ partitions is greater than VCFConstants.MAX_GENOTYPE_QUAL + 1 = %d.", value, MAX_GENOTYPE_QUAL + 1));
            } else if (value < lastThreshold) {
                throw new IllegalArgumentException(String.format("The list of GQ partitions is out of order. Previous value is %d but the next is %d.", lastThreshold, value));
            } else if (value == lastThreshold) {
                throw new IllegalArgumentException(String.format("The value %d appears more than once in the list of GQ partitions.", value));
            }

            result.put(Range.closedOpen(lastThreshold, value), Range.closedOpen(lastThreshold, value));
            lastThreshold = value;
        }

        if (lastThreshold <= MAX_GENOTYPE_QUAL) {
            result.put(Range.closedOpen(lastThreshold, MAX_GENOTYPE_QUAL + 1), Range.closedOpen(lastThreshold,MAX_GENOTYPE_QUAL + 1));
        }

        return result;
    }

    public void addRangesToHeader(VCFHeader header) {
        Utils.nonNull(header, "header cannot be null");

        header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
        header.addMetaDataLine(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.MIN_DP_FORMAT_KEY));

        for (final Range partition : gqPartitions.asMapOfRanges().keySet()) {
            header.addMetaDataLine(rangeToVCFHeaderLine(partition));
        }
    }

    static VCFHeaderLine rangeToVCFHeaderLine(Range genotypeQualityBand) {
        // Need to uniquify the key for the header line using the min/max GQ, since
        // VCFHeader does not allow lines with duplicate keys.
        final String key = String.format(GVCF_BLOCK+"%d-%d", genotypeQualityBand.lowerEndpoint(), genotypeQualityBand.upperEndpoint());
        return new VCFHeaderLine(key, "minGQ=" + genotypeQualityBand.lowerEndpoint() + "(inclusive),maxGQ=" + genotypeQualityBand.upperEndpoint() + "(exclusive)");
    }

    /**
     * Add hom-ref site from vc to this gVCF hom-ref state tracking, emitting any pending states if appropriate
     *
     * @param vc a non-null VariantContext
     * @param g  a non-null genotype from VariantContext
     * @return a VariantContext to be emitted, or null if non is appropriate
     */
    protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) {

        if (nextAvailableStart != -1) {
            //there's a use case here related to ReblockGVCFs for overlapping deletions on different haplotypes
            if ( vc.getStart() <= nextAvailableStart && vc.getContig().equals(contigOfNextAvailableStart) ) {
                if (vc.getEnd() <= nextAvailableStart) {
                    return null;
                }
            }
            // otherwise, reset to non-relevant
            nextAvailableStart = -1;
            contigOfNextAvailableStart = null;
        }

        final VariantContext result;
        if (genotypeCanBeMergedInCurrentBlock(g)) {
            currentBlock.add(vc.getStart(), vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getStart()), g);
            result = null;
        } else {
            result = currentBlock != null ? currentBlock.toVariantContext(sampleName, floorBlocks): null;
            currentBlock = createNewBlock(vc, g);
        }
        return result;
    }

    boolean genotypeCanBeMergedInCurrentBlock(final Genotype g) {
        final HomRefBlock currentHomRefBlock = (HomRefBlock)currentBlock;
        return currentHomRefBlock != null
                && currentHomRefBlock.withinBounds(Math.min(g.getGQ(), MAX_GENOTYPE_QUAL))
                && currentHomRefBlock.getPloidy() == g.getPloidy()
                && (currentHomRefBlock.getMinPLs() == null || !g.hasPL() || (currentHomRefBlock.getMinPLs().length == g.getPL().length));
    }

    /**
     * Helper function to create a new HomRefBlock from a variant context and current genotype
     *
     * @param vc the VariantContext at the site where want to start the band
     * @param g  the genotype of the sample from vc that should be used to initialize the block
     * @return a newly allocated and initialized block containing g already
     */
    GVCFBlock createNewBlock(final VariantContext vc, final Genotype g) {
        // figure out the GQ limits to use based on the GQ of g
        final int gq = Math.min(g.getGQ(), MAX_GENOTYPE_QUAL);
        final Range partition = gqPartitions.get(gq);

        if( partition == null) {
            throw new GATKException("GQ " + g + " from " + vc + " didn't fit into any partition");
        }

        // create the block, add g to it, and return it for use
        final HomRefBlock block = new HomRefBlock(vc, partition.lowerEndpoint(), partition.upperEndpoint(), defaultPloidy);
        block.add(vc.getStart(), vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getStart()), g);
        return block;
    }

    /**
     * Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null
     */
    private void emitCurrentBlock() {
        if (currentBlock != null) {
            toOutput.add(currentBlock.toVariantContext(sampleName, floorBlocks));
            this.currentBlock = null;
        }
    }

    @Override
    public void submit(VariantContext vc) {
        Utils.nonNull(vc);
        Utils.validateArg(vc.hasGenotypes(), "GVCF assumes that the VariantContext has genotypes");
        Utils.validateArg(vc.getGenotypes().size() == 1, () -> "GVCF assumes that the VariantContext has exactly one genotype but saw " + vc.getGenotypes().size());

        if (sampleName == null) {
            sampleName = vc.getGenotype(0).getSampleName();
        }

        if (currentBlock != null && !currentBlock.isContiguous(vc)) {
            // we've made a non-contiguous step (across interval, onto another chr), so finalize
            emitCurrentBlock();
        }

        final Genotype g = vc.getGenotype(0);
        if (g.isHomRef() && vc.hasAlternateAllele(Allele.NON_REF_ALLELE) && vc.isBiallelic()) {
            // create bands
            final VariantContext maybeCompletedBand = addHomRefSite(vc, g);
            if (maybeCompletedBand != null) {
                toOutput.add(maybeCompletedBand);
            }
        } else {
            // g is variant, so flush the bands and emit vc
            emitCurrentBlock();
            nextAvailableStart = vc.getEnd();
            contigOfNextAvailableStart = vc.getContig();
            toOutput.add(vc);
        }
    }

    @Override
    public boolean hasFinalizedItems() {
        return !toOutput.isEmpty();
    }

    @Override
    public List consumeFinalizedItems() {
        List result = new ArrayList<>(toOutput);
        toOutput.clear();
        return result;
    }

    @Override
    public void signalEndOfInput() {
        emitCurrentBlock();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy