
// org.opencb.biodata.tools.variant.VcfFileReader (Maven / Gradle / Ivy)
package org.opencb.biodata.tools.variant;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.LineIteratorImpl;
import htsjdk.tribble.readers.LineReader;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.opencb.biodata.formats.variant.vcf4.FullVcfCodec;
import org.opencb.commons.io.DataReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.*;
import java.util.function.BiConsumer;
/**
* Created by joaquin on 9/27/16.
*/
public class VcfFileReader implements DataReader {
private final Logger logger = LoggerFactory.getLogger(VcfFileReader.class);
private String inputFilename;
private InputStream inputStream;
private FullVcfCodec codec;
private VCFHeader header;
private LineIterator lineIterator;
private List headerLines;
private boolean lazy;
private Set> malformHandlerSet = new HashSet<>();
public VcfFileReader registerMalformatedVcfHandler(BiConsumer handler) {
this.malformHandlerSet.add(handler);
return this;
}
public VcfFileReader(String inputFilename, boolean lazy) {
this.inputFilename = inputFilename;
this.lazy = lazy;
}
@Override
public boolean open() {
try {
inputStream = new FileInputStream(new File(inputFilename));
codec = new FullVcfCodec();
lineIterator = codec.makeSourceFromStream(inputStream);
// Read the header
headerLines = new LinkedList<>();
while (lineIterator.hasNext()) {
String line = lineIterator.peek();
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) {
headerLines.add(line);
lineIterator.next();
} else {
break;
}
}
// Parse the header
header = (VCFHeader) codec.readActualHeader(new LineIteratorImpl(new LineReader() {
Iterator iterator = headerLines.iterator();
@Override
public String readLine() throws IOException {
if (iterator.hasNext()) {
return iterator.next();
} else {
return null;
}
}
@Override
public void close() {
}
}));
} catch (Exception e) {
e.printStackTrace();
return false;
}
return true;
}
@Override
public List read(int batchSize) {
List variantContexts = new ArrayList<>(batchSize);
while (lineIterator.hasNext() && variantContexts.size() < batchSize) {
String line = lineIterator.next();
if (line.startsWith("#") || line.trim().isEmpty()) {
continue;
}
try {
VariantContext variantContext = codec.decode(line);
// Lazy processing management
if (!lazy && variantContext.getGenotypes().isLazyWithData()) {
variantContext.getGenotype(variantContext.getGenotypes().size() - 1);
}
variantContext.getGenotypes();
variantContexts.add(variantContext);
} catch (TribbleException e) {
if (e.getMessage().startsWith("The provided VCF file is malformed at approximately line number")) {
logMalformatedLine(line, e);
} else {
throw e;
}
}
}
return variantContexts;
}
private void logMalformatedLine(String line, RuntimeException error) {
logger.warn(error.getMessage());
for (BiConsumer consumer : this.malformHandlerSet) {
consumer.accept(line, error);
}
}
@Override
public boolean close() {
try {
inputStream.close();
} catch (IOException e) {
throw new UncheckedIOException(e);
}
return true;
}
public VCFHeader getVcfHeader() {
if (header == null) {
open();
close();
}
return header;
}
public boolean isLazy() {
return lazy;
}
public VcfFileReader setLazy(boolean lazy) {
this.lazy = lazy;
return this;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy