org.pharmgkb.parser.vcf.VcfParser Maven / Gradle / Ivy
Show all versions of vcf-parser Show documentation
package org.pharmgkb.parser.vcf;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.math.BigDecimal;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import org.apache.commons.lang3.StringUtils;
import org.pharmgkb.parser.vcf.model.BaseMetadata;
import org.pharmgkb.parser.vcf.model.ContigMetadata;
import org.pharmgkb.parser.vcf.model.FormatMetadata;
import org.pharmgkb.parser.vcf.model.IdDescriptionMetadata;
import org.pharmgkb.parser.vcf.model.InfoMetadata;
import org.pharmgkb.parser.vcf.model.VcfMetadata;
import org.pharmgkb.parser.vcf.model.VcfPosition;
import org.pharmgkb.parser.vcf.model.VcfSample;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * This class parses a VCF file.
 * <p>
 * Instances are created through {@link Builder}.  Header lines ({@code ##} metadata and the {@code #} column line)
 * are read into a {@link VcfMetadata}; every data line is split into its fixed columns and handed to the configured
 * {@link VcfLineParser}.
 *
 * @author Mark Woon
 */
public class VcfParser implements Closeable {
  private static final Logger sf_logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
  // pre-compiled splitters for the delimiters used on a VCF line
  private static final Pattern sf_tabSplitter = Pattern.compile("\t");
  private static final Pattern sf_commaSplitter = Pattern.compile(",");
  private static final Pattern sf_colonSplitter = Pattern.compile(":");
  private static final Pattern sf_semicolonSplitter = Pattern.compile(";");

  /** When true, data lines whose ID column does not match {@code VcfUtils.RSID_PATTERN} are skipped. */
  private final boolean m_rsidsOnly;
  private final BufferedReader m_reader;
  /** Null until {@link #parseMetadata()} has run. */
  private VcfMetadata m_vcfMetadata;
  private final VcfLineParser m_vcfLineParser;
  /** Number of lines read from the reader so far. */
  private int m_lineNumber;
  /** Set once the reader has returned end-of-stream from {@link #parseNextLine()}. */
  private boolean m_alreadyFinished;


  private VcfParser(@Nonnull BufferedReader reader, boolean rsidsOnly, @Nonnull VcfLineParser lineParser) {
    m_reader = reader;
    m_rsidsOnly = rsidsOnly;
    m_vcfLineParser = lineParser;
  }


  /**
   * Parses metadata only.
   * This method should be used if only the metadata is needed; otherwise, {@link #parse()} is preferred.
   * <p>
   * Reads lines until the column header line (the first line starting with a single {@code #}) has been consumed or
   * the stream ends.  Lines that do not start with {@code #} are skipped.
   *
   * @return the parsed metadata
   * @throws IllegalStateException if the metadata has already been parsed
   * @throws VcfFormatException if a header line cannot be parsed
   */
  public @Nonnull VcfMetadata parseMetadata() throws IOException {

    if (m_vcfMetadata != null) {
      throw new IllegalStateException("Metadata has already been parsed.");
    }
    VcfMetadata.Builder mdBuilder = new VcfMetadata.Builder();
    String line;
    while ((line = m_reader.readLine()) != null) {
      m_lineNumber++;
      if (line.startsWith("##")) {
        try {
          parseMetadata(mdBuilder, line);
        } catch (VcfFormatException ex) {
          ex.addMetadata(m_lineNumber, "metadata");
          throw ex;
        } catch (RuntimeException e) {
          throw new VcfFormatException(m_lineNumber, "metadata", e);
        }
      } else if (line.startsWith("#")) {
        try {
          parseColumnInfo(mdBuilder, line);
        } catch (VcfFormatException ex) {
          ex.addMetadata(m_lineNumber, "column (# header)");
          throw ex;
        } catch (RuntimeException e) {
          throw new VcfFormatException(m_lineNumber, "column (# header)", e);
        }
        // the column line is the last header line; data lines follow
        break;
      }
    }
    m_vcfMetadata = mdBuilder.build();

    // check sample lists: samples named on the column line vs. samples declared in the metadata
    if (m_vcfMetadata.getNumSamples() == m_vcfMetadata.getSamples().size()) {
      for (int i = 0; i < m_vcfMetadata.getNumSamples(); i++) {
        String sampleName = m_vcfMetadata.getSampleName(i);
        if (!m_vcfMetadata.getSamples().containsKey(sampleName)) {
          sf_logger.warn("Sample {} is missing in the metadata", sampleName);
        }
      }
    } else {
      sf_logger.warn("There are {} samples in the header but {} in the metadata", m_vcfMetadata.getNumSamples(),
          m_vcfMetadata.getSamples().size());
    }
    return m_vcfMetadata;
  }


  /**
   * Gets VCF metadata (if it has already been parsed).
   *
   * @return the metadata, or null if neither {@link #parseMetadata()} nor {@link #parseNextLine()} has been called
   */
  public @Nullable VcfMetadata getMetadata() {
    return m_vcfMetadata;
  }


  /**
   * Parses the entire VCF file (including the metadata).
   * <p>
   * This is the preferred way to read a VCF file.
   *
   * @throws VcfFormatException if a VCF formatting error is found
   */
  public void parse() throws IOException {
    boolean hasNext = true;
    while (hasNext) {
      hasNext = parseNextLine();
    }
  }


  /**
   * Parses just the next data line available, also reading all the metadata if it has not been read.
   * This is a specialized method; in general calling {@link #parse()} to parse the entire stream is preferred.
   * <p>
   * Lines starting with {@code #} are skipped, as are lines excluded by the rsids-only setting; in both cases
   * {@code true} is returned without invoking the {@link VcfLineParser}.
   *
   * @return Whether another line may be available to read; false if and only if the end of the stream was reached
   * @throws IllegalStateException If the stream was already fully parsed
   * @throws VcfFormatException If a VCF formatting error is found
   */
  public boolean parseNextLine() throws IOException, VcfFormatException {

    if (m_alreadyFinished) {
      // prevents user errors from causing infinite loops
      throw new IllegalStateException("Already finished reading the stream");
    }
    if (m_vcfMetadata == null) {
      parseMetadata();
    }

    String line = m_reader.readLine();
    if (line == null) {
      m_alreadyFinished = true;
      return false;
    }
    m_lineNumber++;
    if (line.startsWith("#")) {
      return true;
    }

    try {
      if (StringUtils.stripToNull(line) == null) {
        throw new VcfFormatException("Empty line", m_lineNumber);
      }
      List<String> data = toList(sf_tabSplitter, line);
      if (data.size() != m_vcfMetadata.getNumColumns()) {
        throw new VcfFormatException("Data line does not have expected number of columns (got " + data.size() +
            " vs. " + m_vcfMetadata.getNumColumns() + ")", m_lineNumber);
      }

      // CHROM
      String chromosome = data.get(0);

      // POS
      long position;
      try {
        position = Long.parseLong(data.get(1));
      } catch (NumberFormatException e) {
        throw new VcfFormatException("POS '" + data.get(1) + "' is not a number");
      }

      // ID
      List<String> ids = null;
      if (!data.get(2).equals(".")) {
        if (m_rsidsOnly && !VcfUtils.RSID_PATTERN.matcher(data.get(2)).find()) {
          return true;
        }
        ids = toList(sf_semicolonSplitter, data.get(2));
      } else if (m_rsidsOnly) {
        return true;
      }

      // REF
      String ref = data.get(3);

      // ALT (the emptiness check must look at the ALT column itself, not at INFO)
      List<String> alt = null;
      if (!data.get(4).isEmpty() && !data.get(4).equals(".")) {
        alt = toList(sf_commaSplitter, data.get(4));
      }

      // QUAL
      BigDecimal quality = null;
      if (!data.get(5).isEmpty() && !data.get(5).equals(".")) {
        try {
          quality = new BigDecimal(data.get(5));
        } catch (NumberFormatException e) {
          throw new VcfFormatException("QUAL '" + data.get(5) + "' is not a number");
        }
      }

      // FILTER (left null when the value is exactly PASS)
      List<String> filters = null;
      if (!data.get(6).equals("PASS")) {
        filters = toList(sf_semicolonSplitter, data.get(6));
      }

      // INFO: semicolon-separated entries, each either "key" (flag) or "key=v1,v2,..."
      ListMultimap<String, String> info = null;
      if (!data.get(7).equals("") && !data.get(7).equals(".")) {
        info = ArrayListMultimap.create();
        List<String> props = toList(sf_semicolonSplitter, data.get(7));
        for (String prop : props) {
          int idx = prop.indexOf('=');
          if (idx == -1) {
            // flag entry: recorded with an empty value
            info.put(prop, "");
          } else {
            String key = prop.substring(0, idx);
            String value = prop.substring(idx + 1);
            info.putAll(key, toList(sf_commaSplitter, value));
          }
        }
      }

      // FORMAT
      List<String> format = null;
      if (data.size() >= 9 && data.get(8) != null) {
        format = toList(sf_colonSplitter, data.get(8));
      }

      VcfPosition pos = new VcfPosition(chromosome, position, ids, ref, alt,
          quality, filters, info, format);

      // samples: every column after FORMAT
      List<VcfSample> samples = new ArrayList<>();
      for (int x = 9; x < data.size(); x++) {
        samples.add(new VcfSample(format, toList(sf_colonSplitter, data.get(x))));
      }

      m_vcfLineParser.parseLine(m_vcfMetadata, pos, samples);
      return true;

    } catch (VcfFormatException ex) {
      ex.addMetadata(m_lineNumber, "data");
      throw ex;
    } catch (RuntimeException e) {
      throw new VcfFormatException(m_lineNumber, "data", e);
    }
  }


  /**
   * Closes the underlying reader.  Any failure while closing is logged and not propagated.
   */
  @Override
  public void close() {
    try {
      m_reader.close();
    } catch (Exception ex) {
      sf_logger.info("Error closing reader", ex);
    }
  }


  /**
   * Parses a metadata line (starts with ##).
   * <p>
   * The text between {@code ##} and the first {@code =} is the property name; everything after that {@code =} is
   * the value.  Structured properties are converted to model objects, everything else is stored as a raw property.
   *
   * @throws VcfFormatException if the line has no {@code =} or a structured value is not wrapped in angle brackets
   */
  private void parseMetadata(@Nonnull VcfMetadata.Builder mdBuilder, @Nonnull String line) {

    int idx = line.indexOf('=');
    if (idx == -1) {
      // without this check substring() would fail with an unhelpful StringIndexOutOfBoundsException
      throw new VcfFormatException("Metadata line is missing '=': '" + line + "'");
    }
    String propName = line.substring(2, idx).trim();
    String propValue = line.substring(idx + 1).trim();

    sf_logger.debug("{} : {}", propName, propValue);

    switch (propName) {
      case "fileformat":
        mdBuilder.setFileFormat(propValue);
        break;

      case "ALT":
      case "FILTER":
      case "INFO":
      case "FORMAT":
      case "contig":
      case "SAMPLE":
      case "PEDIGREE":
        parseMetadataProperty(mdBuilder, propName, removeAngleBrackets(propValue));
        break;

      case "assembly":
      case "pedigreeDB":
      default:
        mdBuilder.addRawProperty(propName, propValue);
    }
  }


  /**
   * Removes the angle brackets surrounding a string.
   *
   * @return the text between the leading {@code <} and the trailing {@code >}
   * @throws VcfFormatException If angle brackets are not present
   */
  private static @Nonnull String removeAngleBrackets(@Nonnull String string) throws VcfFormatException {
    if (string.startsWith("<") && string.endsWith(">")) {
      return string.substring(1, string.length() - 1);
    }
    throw new VcfFormatException("Angle brackets not present for: '" + string + "'");
  }


  /**
   * Converts metadata name-value pair into object.
   * <p>
   * The property name is matched case-insensitively; names that match none of the known kinds are ignored.
   *
   * @param value the property value with its surrounding angle brackets already removed
   */
  private void parseMetadataProperty(@Nonnull VcfMetadata.Builder mdBuilder,
      @Nonnull String propName, @Nonnull String value) {

    Map<String, String> props = VcfUtils.extractPropertiesFromLine(value);
    // Locale.ROOT keeps the lower-casing independent of the default locale (e.g. Turkish dotless i)
    switch (propName.toLowerCase(Locale.ROOT)) {
      case "alt":
        mdBuilder.addAlt(new IdDescriptionMetadata(props, true));
        break;
      case "filter":
        mdBuilder.addFilter(new IdDescriptionMetadata(props, true));
        break;
      case "info":
        mdBuilder.addInfo(new InfoMetadata(props));
        break;
      case "format":
        mdBuilder.addFormat(new FormatMetadata(props));
        break;
      case "contig":
        mdBuilder.addContig(new ContigMetadata(props));
        break;
      case "sample":
        mdBuilder.addSample(new IdDescriptionMetadata(props, true));
        break;
      case "pedigree":
        mdBuilder.addPedigree(new BaseMetadata(props));
        break;
    }
  }


  /**
   * Splits a string with the given pattern into a new, mutable list.
   *
   * @return the pieces produced by {@link Pattern#split(CharSequence)}; an empty list if {@code string} is null
   */
  private static @Nonnull List<String> toList(@Nonnull Pattern pattern, @Nullable String string) {
    if (string == null) {
      // honor the @Nullable contract instead of letting Pattern.split() throw a NullPointerException
      return new ArrayList<>();
    }
    String[] array = pattern.split(string);
    List<String> list = new ArrayList<>(array.length);
    Collections.addAll(list, array);
    return list;
  }


  /**
   * Gets the number of lines that have been read from the input so far.
   */
  public int getLineNumber() {
    return m_lineNumber;
  }


  /**
   * Parses the column header line (starts with a single #) and records the column names.
   *
   * @throws VcfFormatException if the line has fewer than 8 tab-delimited columns
   */
  private void parseColumnInfo(@Nonnull VcfMetadata.Builder mdBuilder, @Nonnull String line) {
    List<String> cols = Arrays.asList(sf_tabSplitter.split(line));
    if (cols.size() < 8) {
      throw new VcfFormatException("Header line does not have mandatory (tab-delimited) columns", m_lineNumber);
    }
    mdBuilder.setColumns(cols);
  }



  /**
   * Builder for {@link VcfParser}.  A {@link VcfLineParser} and exactly one input source (file or reader) must be
   * provided before calling {@link #build()}.
   */
  public static class Builder {
    private BufferedReader m_reader;
    private Path m_vcfFile;
    private boolean m_rsidsOnly;
    private VcfLineParser m_vcfLineParser;

    /**
     * Provides the {@link Path} to the VCF file to parse.
     *
     * @throws IllegalStateException if a reader has already been provided
     * @throws IllegalArgumentException if the path does not end with {@code .vcf}
     */
    public Builder fromFile(@Nonnull Path dataFile) {
      Preconditions.checkNotNull(dataFile);
      if (m_reader != null) {
        throw new IllegalStateException("Already loading from reader");
      }
      if (!dataFile.toString().endsWith(".vcf")) {
        throw new IllegalArgumentException("Not a VCF file (doesn't end with .vcf extension): " + dataFile);
      }
      m_vcfFile = dataFile;
      return this;
    }

    /**
     * Provides a {@link BufferedReader} to the beginning of the VCF file to parse.
     *
     * @throws IllegalStateException if a file has already been provided
     */
    public Builder fromReader(@Nonnull BufferedReader reader) {
      Preconditions.checkNotNull(reader);
      if (m_vcfFile != null) {
        throw new IllegalStateException("Already loading from file");
      }
      m_reader = reader;
      return this;
    }

    /**
     * Tells parser to ignore data lines that are not associated with an RSID.
     */
    public Builder rsidsOnly() {
      m_rsidsOnly = true;
      return this;
    }

    /**
     * Provides the {@link VcfLineParser} that will receive every parsed data line.
     */
    public Builder parseWith(@Nonnull VcfLineParser lineParser) {
      Preconditions.checkNotNull(lineParser);
      m_vcfLineParser = lineParser;
      return this;
    }

    /**
     * Creates the parser, opening the file if one was provided via {@link #fromFile(Path)}.
     *
     * @throws IllegalStateException if no {@link VcfLineParser} or no input source has been provided
     * @throws IOException if the file cannot be opened
     */
    public VcfParser build() throws IOException {
      if (m_vcfLineParser == null) {
        throw new IllegalStateException("Missing VcfLineParser");
      }
      if (m_vcfFile != null) {
        m_reader = Files.newBufferedReader(m_vcfFile);
      }
      if (m_reader == null) {
        throw new IllegalStateException("Must specify either file or reader to parse");
      }
      return new VcfParser(m_reader, m_rsidsOnly, m_vcfLineParser);
    }
  }
}