All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencb.biodata.tools.variant.VcfManager Maven / Gradle / Ivy

There is a newer version: 3.3.0
Show newest version
/*
 * 
 *
 */

package org.opencb.biodata.tools.variant;

import htsjdk.samtools.util.CloseableIterator;
import htsjdk.tribble.index.IndexFactory;
import htsjdk.tribble.index.tabix.TabixFormat;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFFileReader;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.protobuf.VariantProto;
import org.opencb.biodata.tools.variant.filters.VariantFilters;
import org.opencb.biodata.tools.variant.iterators.VariantContextToAvroVariantVcfIterator;
import org.opencb.biodata.tools.variant.iterators.VariantContextToProtoVariantVcfIterator;
import org.opencb.biodata.tools.variant.iterators.VariantContextVcfIterator;
import org.opencb.biodata.tools.variant.iterators.VcfIterator;
import org.opencb.commons.utils.FileUtils;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import static htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder.OutputType.BLOCK_COMPRESSED_VCF;

/**
 * Created by jtarraga on 29/11/16.
 */
public class VcfManager {

    private Path input;

    private Path dataPath;
    private Path indexPath;

    private VCFFileReader vcfReader;

    private static final int DEFAULT_MAX_NUM_RECORDS = 50000;

    public VcfManager(Path input) throws IOException {
        FileUtils.checkFile(input);
        this.input = input;
        this.dataPath = input;
    }

    /**
     * Creates a index file for the VCF input file.
     * @return The path of the index file.
     * @throws IOException
     */
    public Path createIndex() throws IOException {
        return createIndex(null);
    }

    /**
     * Creates a VCF index file.
     * @param indexPath The index created.
     * @return
     * @throws IOException
     */
    public Path createIndex(Path indexPath) throws IOException {
        VCFFileReader reader = new VCFFileReader(input.toFile(), false);

        // compress vcf if necessary, .gz
        if (!dataPath.getFileName().endsWith(".gz")) {
            dataPath = Paths.get(input + ".gz");
            System.out.println("Creating compressed file: " + dataPath);
            VariantContextWriter writer = new VariantContextWriterBuilder()
                    .setOutputFile(dataPath.toFile())
                    .setOutputFileType(BLOCK_COMPRESSED_VCF)
                    .build();

            writer.writeHeader(reader.getFileHeader());
            for (VariantContext vc: reader) {
                if (vc != null) {
                    writer.add(vc);
                }
            }
            writer.close();
        }


        // and then create the tabix index, .tbi
        if (indexPath == null) {
            this.indexPath = Paths.get(dataPath + ".tbi");
        } else {
            this.indexPath = indexPath;
        }


        System.out.println("Creating index file: " + this.indexPath);
        IndexFactory.createTabixIndex(dataPath.toFile(), new VCFCodec(), TabixFormat.VCF, reader.getFileHeader().getSequenceDictionary())
                .write(this.indexPath.toFile());
        reader.close();
        return this.indexPath;
    }

    /**
     * This method aims to provide a very simple, safe and quick way of accessing to a small fragment of the VCF file.
     * This must not be used in production for reading big data files. It returns a maximum of 10,000 variant records.
     *
     * @param region @return
     * @throws IOException
     */
    public List query(Region region) throws Exception {
        return query(region, null, new VariantOptions(), VariantContext.class);
    }

    public List query(Region region, VariantOptions options) throws Exception {
        return query(region, null, options, VariantContext.class);
    }

    public List query(Region region, VariantFilters filters, VariantOptions options) throws Exception {
        return query(region, filters, options, VariantContext.class);
    }

//    public List query() throws Exception {
//        return query(null, null, new VariantOptions(), VariantContext.class);
//    }

    public List query(VariantFilters filters) throws Exception {
        return query(null, filters, null, VariantContext.class);
    }

    public List query(VariantFilters filters, VariantOptions options) throws Exception {
        return query(null, filters, options, VariantContext.class);
    }

    public  List query(VariantFilters filters, VariantOptions options, Class clazz) throws Exception {
        return query(null, filters, options, clazz);
    }

    public  List query(Region region, VariantFilters filters, VariantOptions options, Class clazz) throws Exception {
        open();

        if (options == null) {
            options = new VariantOptions();
        }

        // Number of returned records, if not set then MAX_NUM_RECORDS is returned
        int maxNumberRecords = DEFAULT_MAX_NUM_RECORDS;
        if (options.getLimit() > 0) {  // && VariantOptions.getLimit() <= MAX_NUM_RECORDS
            maxNumberRecords = options.getLimit();
        }

        List results = new ArrayList<>(maxNumberRecords);
        VcfIterator vcfIterator = (region != null)
                ? iterator(region, filters, clazz)
                : iterator(filters, clazz);

        while (vcfIterator.hasNext() && results.size() < maxNumberRecords) {
            results.add(vcfIterator.next());
        }
        vcfIterator.close();
        return results;
    }

    /**
     * This method aims to provide a very simple, safe and quick way to iterate through VCF files.
     *
     */

    public VcfIterator iterator() {
        return iterator(null, VariantContext.class);
    }

    public VcfIterator iterator(VariantFilters filters) {
        return iterator(filters, VariantContext.class);
    }

    public  VcfIterator iterator(VariantFilters filters, Class clazz) {
        open();

        CloseableIterator variantContextIterator = vcfReader.iterator();
        return getVariantIterator(filters, clazz, variantContextIterator);
    }

    public VcfIterator iterator(Region region) {
        return iterator(region, null, VariantContext.class);
    }

    public VcfIterator iterator(Region region, VariantFilters filters) {
        return iterator(region, filters, VariantContext.class);
    }

    public  VcfIterator iterator(Region region, VariantFilters filters, Class clazz) {
        open();

        CloseableIterator variantContextIterator =
                vcfReader.query(region.getChromosome(), region.getStart(), region.getEnd());
        return getVariantIterator(filters, clazz, variantContextIterator);
    }

    private  VcfIterator getVariantIterator(VariantFilters filters, Class clazz,
                                                  CloseableIterator variantContextIterator) {
        if (Variant.class == clazz) { // AVRO
            return (VcfIterator) new VariantContextToAvroVariantVcfIterator(variantContextIterator, filters);
        } else if (VariantProto.Variant.class == clazz) { // PROTOCOL BUFFER
            return (VcfIterator) new VariantContextToProtoVariantVcfIterator(variantContextIterator, filters);
        } else if (VariantContext.class == clazz) {
            return (VcfIterator) new VariantContextVcfIterator(variantContextIterator, filters);
        } else {
            throw new IllegalArgumentException("Unknown variant class " + clazz);
        }
    }

    public void close() throws IOException {
        if (vcfReader != null) {
            vcfReader.close();
        }
    }

    private void open() {
        if (vcfReader == null) {
            if (indexPath != null) {
                vcfReader = new VCFFileReader(dataPath.toFile(), indexPath.toFile());
            } else {
                vcfReader = new VCFFileReader(dataPath.toFile());
            }
        }
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy