All downloads are free. Search and download functionality uses the official Maven repository.

org.broadinstitute.hellbender.utils.variant.writers.ShardingVCFWriter Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.utils.variant.writers;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.FileExtensions;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Locale;

/**
 * Variant writer that splits output across multiple VCFs given the maximum number of records per file. Before using
 * {@link #add}, the header should be set using either {@link #setHeader} or {@link #writeHeader}, which may only
 * be called once and will determine whether headers are written to all shards.
 *
 * <p>Each shard is written to the path produced by {@link #getShardFilename}, i.e.
 * {@code <basePath>.shard_<zero-padded 5-digit index><SHARD_INDEX_SUFFIX>}. Shard indices start at 0.
 *
 * @author Mark Walker <[email protected]>
 */
public class ShardingVCFWriter implements VariantContextWriter {

    /** Text inserted between the base path and the shard index in every shard file name. */
    public static final String SHARD_INDEX_PREFIX = ".shard_";
    /** File extension appended after the shard index in every shard file name. */
    public static final String SHARD_INDEX_SUFFIX = FileExtensions.COMPRESSED_VCF;

    /** Writer for the shard currently being filled; replaced each time a new shard is started. */
    private VariantContextWriter writer;
    /** Header shared by all shards, or null until {@link #setHeader} or {@link #writeHeader} has been called. */
    private VCFHeader header;
    private final int maxVariantsPerShard;
    private final Path basePath;
    private final SAMSequenceDictionary dictionary;
    private final boolean createMD5;
    private final Options[] options;

    /** Current shard */
    private int shardIndex;
    /** Number of records written to current shard */
    private int shardSize;
    /** Whether to write header, or null if header is undefined */
    private Boolean enableWriteHeader;

    /**
     * Create a new sharding VCF writer. The writer for the first shard (index 0) is created immediately.
     *
     * @param basePath              base path of the output VCFs. The shard designation and file extension will be added.
     * @param maxVariantsPerShard   max number of records per file (last shard may have less); must be positive
     * @param dictionary            sequence dictionary for this writer
     * @param createMD5             enable MD5 file creation
     * @param options               vcf writer options
     */
    public ShardingVCFWriter(final Path basePath,
                             final int maxVariantsPerShard,
                             final SAMSequenceDictionary dictionary,
                             final boolean createMD5,
                             final Options... options) {
        Utils.nonNull(basePath);
        Utils.validateArg(maxVariantsPerShard > 0, "maxVariantsPerShard must be positive");
        // Strip an extension (from FileExtensions.VCF_LIST) so that the shard designation can be appended cleanly.
        this.basePath = IOUtils.removeExtension(basePath, FileExtensions.VCF_LIST);
        this.maxVariantsPerShard = maxVariantsPerShard;
        this.dictionary = dictionary;
        this.createMD5 = createMD5;
        this.options = options;

        // Initialize first shard.
        // NOTE: createNewWriter() is overridable and runs before any subclass constructor body, so overrides must
        // rely only on the state assigned above.
        this.shardIndex = 0;
        this.shardSize = 0;
        this.writer = createNewWriter();
    }

    /**
     * Closes the current shard and initializes a new sharded file with the next index.
     *
     * <p>The header must already have been defined. This is checked before any state is changed, so a failed call
     * leaves the current shard open and untouched.
     */
    protected void createNextShard() {
        // Fail fast: checking after the swap would already have closed the current shard and opened a new,
        // header-less one by the time the error is raised.
        Utils.nonNull(header, "Attempted to create new shard before header has been set");
        writer.close();
        shardIndex++;
        shardSize = 0;
        writer = createNewWriter();
        initializeShardHeader();
    }

    /**
     * Initializes shard header depending on which header function (set or write) was used. Does nothing if
     * neither has been called yet.
     */
    protected void initializeShardHeader() {
        if (enableWriteHeader == null) {
            return;
        }
        if (enableWriteHeader) {
            writer.writeHeader(header);
        } else {
            writer.setHeader(header);
        }
    }

    /**
     * Creates a writer for the shard identified by the current shard index.
     *
     * @return the new writer
     */
    protected VariantContextWriter createNewWriter() {
        final Path outPath = Paths.get(getShardFilename(basePath, shardIndex));
        return GATKVariantContextUtils.createVCFWriter(
                outPath,
                dictionary,
                createMD5,
                options);
    }

    /**
     * Gets filepath for the given shard and base path. The index is zero-padded to at least 5 digits and is
     * always rendered with ASCII digits, independent of the JVM's default locale.
     *
     * @param basePath   path without extension
     * @param shardIndex index of the shard
     * @return path as String
     */
    public static String getShardFilename(final Path basePath, final int shardIndex) {
        // Locale.ROOT: "%d" is locale-sensitive and could otherwise emit non-ASCII digits in file names.
        return String.format(Locale.ROOT, "%s%s%05d%s", basePath, SHARD_INDEX_PREFIX, shardIndex, SHARD_INDEX_SUFFIX);
    }

    /**
     * Defines header for the writer. The header will not be written to any shards. May only be called once and only
     * if {@link #writeHeader} has not been called.
     *
     * @param header header to use; must not be null
     */
    @Override
    public void setHeader(final VCFHeader header) {
        // A null header would leave this.header unset and silently defeat the "only once" check below.
        Utils.nonNull(header, "header cannot be null");
        Utils.validate(this.header == null, "Cannot redefine header");
        this.header = header;
        enableWriteHeader = Boolean.FALSE;
        writer.setHeader(header);
    }

    /**
     * Defines header for the writer that will be written to all shards. May only be called once and only
     * if {@link #setHeader} has not been called.
     *
     * @param header header to use; must not be null
     */
    @Override
    public void writeHeader(final VCFHeader header) {
        // A null header would leave this.header unset and silently defeat the "only once" check below.
        Utils.nonNull(header, "header cannot be null");
        Utils.validate(this.header == null, "Cannot redefine header");
        this.header = header;
        enableWriteHeader = Boolean.TRUE;
        writer.writeHeader(header);
    }

    /**
     * Closes the writer of the current shard. Earlier shards were already closed when their successor was started.
     */
    @Override
    public void close() {
        writer.close();
    }

    /**
     * Reports the error state of the current shard's writer only.
     *
     * @return the result of {@code checkError()} on the current shard's writer
     */
    @Override
    public boolean checkError() {
        return writer.checkError();
    }

    /**
     * Adds variant to writer, starting a new shard first if the current one already holds the maximum number of
     * records. Note that a header must be assigned first.
     *
     * @param vc variant to write
     */
    @Override
    public void add(final VariantContext vc) {
        if (shardSize >= maxVariantsPerShard) {
            createNextShard();
        }
        writer.add(vc);
        shardSize++;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy