All downloads are free. Search and download functionality uses the official Maven repository.

org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader Maven / Gradle / Ivy

/*
 * 
 *
 */

package org.opencb.biodata.tools.variant;

import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.LineIteratorImpl;
import htsjdk.tribble.readers.LineReader;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.opencb.biodata.formats.variant.io.VariantReader;
import org.opencb.biodata.formats.variant.vcf4.FullVcfCodec;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantNormalizer;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.tools.variant.converters.avro.VCFHeaderToAvroVcfHeaderConverter;
import org.opencb.biodata.tools.variant.converters.avro.VariantContextToVariantConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.util.*;
import java.util.function.BiConsumer;

/**
 * Reads a VCF file using the library HTSJDK.
 *
 * Optionally, normalizes the variants.
 *
 * Created on 16/05/16.
 *
 * @author Jacobo Coll <[email protected]>
 */
public class VariantVcfHtsjdkReader implements VariantReader {

    private final Logger logger = LoggerFactory.getLogger(VariantVcfHtsjdkReader.class);

    private final VariantSource source;
    private final InputStream inputStream;
    private final VariantNormalizer normalizer;
    private FullVcfCodec codec;
    private VCFHeader header;
    private VariantContextToVariantConverter converter;
    private LineIterator lineIterator;
    private List headerLines;
    private Set> malformHandlerSet = new HashSet<>();
    private boolean failOnError = false;

    public VariantVcfHtsjdkReader(InputStream inputStream, VariantSource source) {
        this(inputStream, source, null);
    }

    public VariantVcfHtsjdkReader(InputStream inputStream, VariantSource source, VariantNormalizer normalizer) {
        this.source = source;
        this.inputStream = inputStream;
        this.normalizer = normalizer;
    }

    public VariantVcfHtsjdkReader registerMalformatedVcfHandler(BiConsumer handler) {
        this.malformHandlerSet.add(handler);
        return this;
    }

    public VariantVcfHtsjdkReader setFailOnError(boolean failOnError) {
        this.failOnError = failOnError;
        return this;
    }

    @Override
    public boolean open() {
        return true;
    }

    @Override
    public boolean pre() {
        codec = new FullVcfCodec();
        lineIterator = codec.makeSourceFromStream(inputStream);

        // Read the header
        headerLines = new LinkedList<>();
        while (lineIterator.hasNext()) {
            String line = lineIterator.peek();
            if (line.startsWith(VCFHeader.HEADER_INDICATOR)) {
                headerLines.add(line);
                lineIterator.next();
            } else {
                break;
            }
        }

        // Parse the header
        header = (VCFHeader) codec.readActualHeader(new LineIteratorImpl(new LineReader() {
            Iterator iterator = headerLines.iterator();
            @Override
            public String readLine() throws IOException {
                if (iterator.hasNext()) {
                    return iterator.next();
                } else {
                    return null;
                }
            }
            @Override public void close() {}
        }));

        // Create converters and fill VariantSource
        converter = new VariantContextToVariantConverter(source.getStudyId(), source.getFileId(), header.getSampleNamesInOrder());
        source.setHeader(new VCFHeaderToAvroVcfHeaderConverter().convert(header));
        source.setSamples(header.getSampleNamesInOrder());
        return true;
    }

    @Override
    public List read(int batchSize) {
        List variantContexts = new ArrayList<>(batchSize);
        while (lineIterator.hasNext() && variantContexts.size() < batchSize) {
            String line = lineIterator.next();
            if (line.startsWith("#") || line.trim().isEmpty()) {
                continue;
            }
            try {
                variantContexts.add(codec.decode(line));
            } catch (RuntimeException e) {
                logMalformatedLine(line, e);
                if (failOnError) {
                    throw e;
                }
//                if (e.getMessage().startsWith("The provided VCF file is malformed at approximately line number")) {
//                } else {
//                    throw e;
//                }
            }
        }

        List variants = converter.apply(variantContexts);

        if (normalizer != null) {
            variants = normalizer.apply(variants);
        }

        return variants;
    }

    private void logMalformatedLine(String line, RuntimeException exception) {
        logger.warn(exception.getMessage());
        for (BiConsumer consumer : this.malformHandlerSet) {
            consumer.accept(line, exception);
        }
    }

    @Override
    public boolean post() {
        return true;
    }

    @Override
    public boolean close() {
        try {
            inputStream.close();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        return true;
    }

    @Override
    public List getSampleNames() {
        return header.getSampleNamesInOrder();
    }

    @Override
    public String getHeader() {
        return String.join("\n", headerLines);
    }

    public VariantSource getSource() {
        return source;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy