All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.pharmgkb.parser.vcf.VcfParser Maven / Gradle / Ivy

There is a newer version: 0.3.1
Show newest version
package org.pharmgkb.parser.vcf;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.math.BigDecimal;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.io.Closeables;
import org.pharmgkb.parser.vcf.model.BaseMetadata;
import org.pharmgkb.parser.vcf.model.ContigMetadata;
import org.pharmgkb.parser.vcf.model.FormatMetadata;
import org.pharmgkb.parser.vcf.model.IdDescriptionMetadata;
import org.pharmgkb.parser.vcf.model.InfoMetadata;
import org.pharmgkb.parser.vcf.model.VcfMetadata;
import org.pharmgkb.parser.vcf.model.VcfPosition;
import org.pharmgkb.parser.vcf.model.VcfSample;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * This class parses a VCF file.
 * <p>
 * Instances are created through {@link Builder}.  Metadata lines ({@code ##...}) and the column header line
 * ({@code #...}) are collected into a {@link VcfMetadata}; every data line is converted into a {@link VcfPosition}
 * plus a list of {@link VcfSample}s and handed to the configured {@link VcfLineParser}.
 *
 * @author Mark Woon
 */
public class VcfParser implements Closeable {
  private static final Logger sf_logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  private static final Pattern sf_tabSplitter = Pattern.compile("\t");
  private static final Pattern sf_commaSplitter = Pattern.compile(",");
  private static final Pattern sf_colonSplitter = Pattern.compile(":");
  private static final Pattern sf_semicolonSplitter = Pattern.compile(";");

  private final boolean m_rsidsOnly;
  private final BufferedReader m_reader;
  private VcfMetadata m_vcfMetadata;
  private final VcfLineParser m_vcfLineParser;

  /** Number of lines read from the reader so far (metadata, header and data lines). */
  private int m_lineNumber;
  /** Set once the reader has returned end-of-stream. */
  private boolean m_alreadyFinished;



  private VcfParser(@Nonnull BufferedReader reader, boolean rsidsOnly, @Nonnull VcfLineParser lineParser) {
    m_reader = reader;
    m_rsidsOnly = rsidsOnly;
    m_vcfLineParser = lineParser;
  }


  /**
   * Parses metadata only.
   * This method should be used if only the metadata is needed; otherwise, {@link #parse()} is preferred.
   * <p>
   * Reads lines up to and including the column header line (the first line starting with a single {@code #}).
   *
   * @return the parsed metadata
   * @throws IllegalStateException if the metadata has already been parsed
   * @throws IllegalArgumentException if a metadata or column header line cannot be parsed
   * @throws IOException if reading from the underlying reader fails
   */
  public @Nonnull VcfMetadata parseMetadata() throws IOException {

    if (m_vcfMetadata != null) {
      throw new IllegalStateException("Metadata has already been parsed.");
    }
    VcfMetadata.Builder mdBuilder = new VcfMetadata.Builder();
    String line;
    while ((line = m_reader.readLine()) != null) {
      m_lineNumber++;
      if (line.startsWith("##")) {
        try {
          parseMetadata(mdBuilder, line);
        } catch (RuntimeException e) {
          throw new IllegalArgumentException("Error parsing metadata on line #" + m_lineNumber + ": " + line, e);
        }
      } else if (line.startsWith("#")) {
        try {
          parseColumnInfo(mdBuilder, line);
        } catch (RuntimeException e) {
          throw new IllegalArgumentException("Error parsing column (# header) on line #" + m_lineNumber + ": " + line, e);
        }
        // the column header is the last line before the data lines
        break;
      }
    }
    m_vcfMetadata = mdBuilder.build();

    // check sample lists: samples named in the header should match the ##SAMPLE metadata entries
    if (m_vcfMetadata.getNumSamples() == m_vcfMetadata.getSamples().size()) {
      for (int i = 0; i < m_vcfMetadata.getNumSamples(); i++) {
        String sampleName = m_vcfMetadata.getSampleName(i);
        if (!m_vcfMetadata.getSamples().containsKey(sampleName)) {
          sf_logger.warn("Sample {} is missing in the metadata", sampleName);
        }
      }
    } else {
      sf_logger.warn("There are {} samples in the header but {} in the metadata", m_vcfMetadata.getNumSamples(),
          m_vcfMetadata.getSamples().size());
    }

    return m_vcfMetadata;
  }


  /**
   * Gets VCF metadata (if it has already been parsed).
   *
   * @return the metadata, or null if neither {@link #parseMetadata()} nor a parse method has been called yet
   */
  public @Nullable VcfMetadata getMetadata() {
    return m_vcfMetadata;
  }


  /**
   * Parses the entire VCF file (including the metadata).
   *
   * This is the preferred way to read a VCF file.
   * The underlying reader is closed when this method returns, whether parsing succeeded or failed.
   *
   * @throws IllegalStateException if the stream was already fully parsed
   * @throws IllegalArgumentException if a line cannot be parsed
   * @throws IOException if reading from the underlying reader fails
   */
  public void parse() throws IOException {
    try {
      boolean hasNext = true;
      while (hasNext) {
        hasNext = parseNextLine();
      }
    } finally {
      // release the reader even if a line fails to parse
      Closeables.closeQuietly(m_reader);
    }
  }

  /**
   * Parses just the next data line available, also reading all the metadata if it has not been read.
   * This is a specialized method; in general calling {@link #parse()} to parse the entire stream is preferred.
   * <p>
   * When the parser was built with {@link Builder#rsidsOnly()}, data lines whose ID column is missing or does not
   * match {@code VcfUtils.RSID_PATTERN} are read and counted but not passed to the {@link VcfLineParser}.
   *
   * @return Whether another line may be available to read; false if and only if the end of the stream was reached
   * @throws IllegalStateException If the stream was already fully parsed (i.e. this method already returned false)
   * @throws IllegalArgumentException If the line cannot be parsed
   * @throws IOException If reading from the underlying reader fails
   */
  public boolean parseNextLine() throws IOException {

    // must be checked before reading: once the stream is exhausted readLine() only ever returns null again
    if (m_alreadyFinished) {
      // prevents user errors from causing infinite loops
      throw new IllegalStateException("Already finished reading the stream");
    }

    if (m_vcfMetadata == null) {
      parseMetadata();
    }

    String line = m_reader.readLine();
    if (line == null) {
      m_alreadyFinished = true;
      return false;
    }

    // counted exactly once per line read, so error messages report the real line number
    m_lineNumber++;

    try {

      List<String> data = toList(sf_tabSplitter, line);

      // CHROM
      String chromosome = data.get(0);

      // POS
      long position;
      try {
        position = Long.parseLong(data.get(1));
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException("Error parsing VCF data line #" + m_lineNumber + ": POS " +
            data.get(1) + " is not a number", e);
      }

      // ID
      List<String> ids = null;
      if (!data.get(2).equals(".")) {
        if (m_rsidsOnly && !VcfUtils.RSID_PATTERN.matcher(data.get(2)).find()) {
          return true;
        }
        ids = toList(sf_semicolonSplitter, data.get(2));
      } else if (m_rsidsOnly) {
        return true;
      }

      // REF
      String ref = data.get(3);

      // ALT ("." or empty means no alternate alleles)
      List<String> alt = null;
      if (!data.get(4).isEmpty() && !data.get(4).equals(".")) {
        alt = toList(sf_commaSplitter, data.get(4));
      }

      // QUAL
      BigDecimal quality = null;
      if (!data.get(5).isEmpty() && !data.get(5).equals(".")) {
        try {
          quality = new BigDecimal(data.get(5));
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException("Error parsing VCF data line #" + m_lineNumber + ": QUAL " +
              data.get(5) + " is not a number", e);
        }
      }

      // FILTER
      List<String> filters = null;
      if (!data.get(6).equals("PASS")) {
        filters = toList(sf_semicolonSplitter, data.get(6));
      }

      // INFO
      ListMultimap<String, String> info = parseInfo(data.get(7));

      // FORMAT (optional column; only present when there is genotype data)
      List<String> format = null;
      if (data.size() >= 9) {
        format = toList(sf_colonSplitter, data.get(8));
      }

      // samples
      VcfPosition pos = new VcfPosition(chromosome, position, ids, ref, alt,
          quality, filters, info, format);
      List<VcfSample> samples = new ArrayList<>();
      for (int x = 9; x < data.size(); x++) {
        samples.add(new VcfSample(format, toList(sf_colonSplitter, data.get(x))));
      }

      m_vcfLineParser.parseLine(m_vcfMetadata, pos, samples);

    } catch (IllegalArgumentException ex) {
      // already carries a descriptive message
      throw ex;
    } catch (RuntimeException e) {
      throw new IllegalArgumentException("Error parsing VCF data line #" + m_lineNumber + ": " + line, e);
    }
    return true;
  }

  /**
   * Converts the INFO column into a multimap of key to values.
   * Flags (entries without {@code =}) are stored with a single empty-string value; comma-separated values are
   * stored as individual entries.
   *
   * @return the parsed properties, or null if the column is empty or {@code .}
   */
  private static @Nullable ListMultimap<String, String> parseInfo(@Nonnull String field) {
    if (field.isEmpty() || field.equals(".")) {
      return null;
    }
    ListMultimap<String, String> info = ArrayListMultimap.create();
    for (String prop : toList(sf_semicolonSplitter, field)) {
      int idx = prop.indexOf('=');
      if (idx == -1) {
        info.put(prop, "");
      } else {
        String key = prop.substring(0, idx);
        String value = prop.substring(idx + 1);
        info.putAll(key, toList(sf_commaSplitter, value));
      }
    }
    return info;
  }

  /**
   * Closes the underlying reader, ignoring any error raised while closing.
   */
  @Override
  public void close() {
    Closeables.closeQuietly(m_reader);
  }


  /**
   * Parses a metadata line (starts with ##).
   *
   * @throws IllegalArgumentException if the line has no {@code =} or a structured value lacks angle brackets
   */
  private void parseMetadata(@Nonnull VcfMetadata.Builder mdBuilder, @Nonnull String line) {

    int idx = line.indexOf('=');
    if (idx == -1) {
      throw new IllegalArgumentException("Metadata line is not in key=value form: " + line);
    }
    String propName = line.substring(2, idx).trim();
    String propValue = line.substring(idx + 1).trim();

    sf_logger.debug("{} : {}", propName, propValue);

    switch (propName) {
      case "fileformat":
        mdBuilder.setFileFormat(propValue);
        break;

      case "ALT":
      case "FILTER":
      case "INFO":
      case "FORMAT":
      case "contig":
      case "SAMPLE":
      case "PEDIGREE":
        parseMetadataProperty(mdBuilder, propName, removeAngleBrackets(propValue));
        break;

      case "assembly":
      case "pedigreeDB":
      default:
        // anything without a dedicated model class is kept as a raw name/value pair
        mdBuilder.addRawProperty(propName, propValue);
    }
  }

  /**
   * Removes the angle brackets around a string.
   *
   * @return the text between the leading {@code <} and the trailing {@code >}
   * @throws IllegalArgumentException If angle brackets are not present
   */
  private static @Nonnull String removeAngleBrackets(@Nonnull String string) throws IllegalArgumentException {
    if (string.startsWith("<") && string.endsWith(">")) {
      return string.substring(1, string.length() - 1);
    }
    throw new IllegalArgumentException("Angle brackets not present for: " + string);
  }

  /**
   * Converts metadata name-value pair into object.
   * Property names that are not recognized are ignored.
   *
   * @param propName the metadata key (matched case-insensitively)
   * @param value the metadata value with its surrounding angle brackets already removed
   */
  private void parseMetadataProperty(@Nonnull VcfMetadata.Builder mdBuilder,
      @Nonnull String propName, @Nonnull String value) {
    Map<String, String> props = VcfUtils.extractPropertiesFromLine(value);
    // Locale.ROOT: the default-locale overload maps 'I' to a dotless i under e.g. a Turkish locale,
    // which would stop "INFO"/"FILTER"/"PEDIGREE" from matching
    switch (propName.toLowerCase(java.util.Locale.ROOT)) {
      case "alt":
        mdBuilder.addAlt(new IdDescriptionMetadata(props, true));
        break;
      case "filter":
        mdBuilder.addFilter(new IdDescriptionMetadata(props, true));
        break;
      case "info":
        mdBuilder.addInfo(new InfoMetadata(props));
        break;
      case "format":
        mdBuilder.addFormat(new FormatMetadata(props));
        break;
      case "contig":
        mdBuilder.addContig(new ContigMetadata(props));
        break;
      case "sample":
        mdBuilder.addSample(new IdDescriptionMetadata(props, true));
        break;
      case "pedigree":
        mdBuilder.addPedigree(new BaseMetadata(props));
        break;
    }
  }

  /**
   * Splits a string with the given pattern into a new mutable list.
   * Follows {@link Pattern#split(CharSequence)} semantics, so trailing empty strings are dropped.
   */
  private static @Nonnull List<String> toList(@Nonnull Pattern pattern, @Nonnull String string) {
    return new ArrayList<>(Arrays.asList(pattern.split(string)));
  }

  /**
   * Gets the number of lines read from the input so far.
   */
  public int getLineNumber() {
    return m_lineNumber;
  }

  /**
   * Parses the column header line (starts with a single #) by splitting it on tabs.
   */
  private void parseColumnInfo(@Nonnull VcfMetadata.Builder mdBuilder, @Nonnull String line) {
    mdBuilder.setColumns(Arrays.asList(sf_tabSplitter.split(line)));
  }


  /**
   * Builder for {@link VcfParser}.
   * Exactly one input source ({@link #fromFile(Path)} or {@link #fromReader(BufferedReader)}) and a
   * {@link VcfLineParser} ({@link #parseWith(VcfLineParser)}) must be provided before calling {@link #build()}.
   */
  public static class Builder {
    private BufferedReader m_reader;
    private Path m_vcfFile;
    private boolean m_rsidsOnly;
    private VcfLineParser m_vcfLineParser;


    /**
     * Provides the {@link Path} to the VCF file to parse.
     *
     * @throws IllegalStateException if a reader has already been provided
     * @throws IllegalArgumentException if the file name does not end with {@code .vcf}
     */
    public Builder fromFile(@Nonnull Path dataFile) {
      Preconditions.checkNotNull(dataFile);
      if (m_reader != null) {
        throw new IllegalStateException("Already loading from reader");
      }
      if (!dataFile.toString().endsWith(".vcf")) {
        throw new IllegalArgumentException("Not a VCF file (doesn't end with .vcf extension");
      }
      m_vcfFile = dataFile;
      return this;
    }

    /**
     * Provides a {@link BufferedReader} to the beginning of the VCF file to parse.
     *
     * @throws IllegalStateException if a file has already been provided
     */
    public Builder fromReader(@Nonnull BufferedReader reader) {
      Preconditions.checkNotNull(reader);
      if (m_vcfFile != null) {
        throw new IllegalStateException("Already loading from file");
      }
      m_reader = reader;
      return this;
    }

    /**
     * Tells parser to ignore data lines that are not associated with an RSID.
     */
    public Builder rsidsOnly() {
      m_rsidsOnly = true;
      return this;
    }

    /**
     * Provides the {@link VcfLineParser} that will receive every parsed data line.
     */
    public Builder parseWith(@Nonnull VcfLineParser lineParser) {
      Preconditions.checkNotNull(lineParser);
      m_vcfLineParser = lineParser;
      return this;
    }


    /**
     * Builds the parser, opening the VCF file if one was provided via {@link #fromFile(Path)}.
     *
     * @throws IllegalStateException if no {@link VcfLineParser} or no input source was provided
     * @throws IOException if the VCF file cannot be opened
     */
    public VcfParser build() throws IOException {
      if (m_vcfLineParser == null) {
        throw new IllegalStateException("Missing VcfLineParser");
      }
      if (m_vcfFile != null) {
        m_reader = Files.newBufferedReader(m_vcfFile);
      }
      if (m_reader == null) {
        throw new IllegalStateException("Must specify either file or reader to parse");
      }
      return new VcfParser(m_reader, m_rsidsOnly, m_vcfLineParser);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy