org.geneweaver.io.reader.FastVCFReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gweaver-stream-io Show documentation
Show all versions of gweaver-stream-io Show documentation
The IO bundle for Geneweaver.
package org.geneweaver.io.reader;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.beanutils.BeanMap;
import org.geneweaver.domain.Entity;
import org.geneweaver.domain.VariantCall;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A reader for VCF files as a stream already exists in this Maven dependency:
*
* groupId: com.github.samtools
* artifactId: htsjdk
* version: 2.24.1
*
* However, it is quite a large dependency to add to a small package like this one.
* More importantly, this simple reader is designed to do just what we need when building
* the geneweaver graph. This means that long lines of individual information are not parsed
* or split, meaning the stream processing this file can go *fast*.
* @author gerrim
*
*/
public class FastVCFReader extends LineIteratorReader {
private static Logger logger = LoggerFactory.getLogger(FastVCFReader.class);
/**
* Initialises this reader from the given request.
*
* Delegates to the superclass {@code setup(request)} and then sets the
* delimiter to a single tab character.
*
* @param request the request handed to the superclass {@code setup}
* @return this reader instance
* @throws ReaderException declared by this method; presumably raised when the
*         superclass setup fails (confirm against LineIteratorReader)
*/
@SuppressWarnings("unchecked")
@Override
public FastVCFReader init(ReaderRequest request) throws ReaderException {
super.setup(request);
// Applied after setup, so the tab delimiter is what is in effect when init returns.
setDelimiter("\t"); // Must be a tab only
return this;
}
private List headerNames;
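/**
* Creates an object from a single line of the file.
*
* On the first call the column names are read from the last header line
* (with its first character removed) by splitting it on the delimiter.
*
* @param line the line to create the object from
* @return the object created from the line
* @throws ReaderException if no header lines are available (other causes may exist)
*/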
@SuppressWarnings("unchecked")
@Override
protected N create(String line) throws ReaderException {
if (headerNames==null) {
if (header==null || header.isEmpty()) {
throw new ReaderException("VCF files must have a header!");
}
String headLine = header.get(header.size()-1);
String[] names = headLine.substring(1).split(getDelimiter());
// Something like: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, HG00096, ...
headerNames = Arrays.asList(names);
}
VariantCall bean = new VariantCall();
BeanMap d = new BeanMap(bean);
// Splitting these long lines is slow and we do not need the
// individual values, therefore we do not split instead we
// substring the line for each delimiter
String sline = line;
for(int i=0;i