// org.dishevelled.bio.variant.vcf.VcfPedigreeParser Maven / Gradle / Ivy
// The newest version!
/*
dsh-bio-variant Variants.
Copyright (c) 2013-2024 held jointly by the individual authors.
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this library; if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
> http://www.fsf.org/licensing/licenses/lgpl.html
> http://www.opensource.org/licenses/lgpl-license.php
*/
package org.dishevelled.bio.variant.vcf;
import static com.google.common.base.Preconditions.checkNotNull;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ListMultimap;
/**
 * VCF pedigree parser.
 * <p>
 * Builds a {@link VcfPedigree} from the <code>##SAMPLE</code> and <code>##PEDIGREE</code>
 * meta header lines and the sample names reported while parsing a VCF header.
 *
 * @author Michael Heuer
 */
public final class VcfPedigreeParser {

    /**
     * Private no-arg constructor.
     */
    private VcfPedigreeParser() {
        // empty
    }


    /**
     * Read a VCF pedigree from the specified readable.
     *
     * @param readable readable to read from, must not be null
     * @return a VCF pedigree read from the specified readable
     * @throws NullPointerException if <code>readable</code> is null
     * @throws IOException if an I/O error occurs, or if a <code>##SAMPLE</code> or
     *    <code>##PEDIGREE</code> meta header line is invalid or refers to an unknown sample
     */
    public static VcfPedigree pedigree(final Readable readable) throws IOException {
        checkNotNull(readable);
        ParseListener parseListener = new ParseListener();
        VcfParser.parse(readable, parseListener);
        return parseListener.getPedigree();
    }

    /**
     * Parse listener.
     * <p>
     * Samples are collected from <code>##SAMPLE</code> meta header lines as they arrive;
     * <code>##PEDIGREE</code> meta header lines are only buffered by {@link #meta} and are
     * resolved into relationships when {@link #samples} is called.
     */
    static final class ParseListener extends VcfParseAdapter {
        /** Prefix of a ##SAMPLE meta header line, up to and including the opening angle bracket. */
        private static final String SAMPLE_PREFIX = "##SAMPLE=<";

        /** Prefix of a ##PEDIGREE meta header line, up to and including the opening angle bracket. */
        private static final String PEDIGREE_PREFIX = "##PEDIGREE=<";

        /** List of ##PEDIGREE meta header lines, in the order they were encountered. */
        private final List<String> pedigreeMetaLines = new ArrayList<>();

        /** VCF pedigree builder. */
        private final VcfPedigree.Builder builder = VcfPedigree.builder();

        /** VCF samples keyed by id. */
        private final Map<String, VcfSample> samplesById = new HashMap<>();


        @Override
        public void meta(final String meta) throws IOException {
            // copied from VcfSampleParser.ParseListener
            if (meta.startsWith("##SAMPLE=")) {
                addSample(meta);
            }
            else if (meta.startsWith("##PEDIGREE=")) {
                // need to process later, after all samples have been found
                pedigreeMetaLines.add(meta);
            }
        }

        @Override
        public void samples(final String... samples) throws IOException {
            // copied from VcfSampleParser.ParseListener
            for (String sample : samples) {
                // add if missing in meta lines
                if (!samplesById.containsKey(sample)) {
                    samplesById.put(sample, new VcfSample(sample));
                }
            }

            /*

              VCF 4.2 and earlier spec:

              ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,...,Name_N=GN-ID>
              ##PEDIGREE=<Derived=ID2,Original=ID1>
              ##PEDIGREE=<Child=CHILD-GENOME-ID,Mother=MOTHER-GENOME_ID,Father=FATHER-GENOME_ID>
              ##PEDIGREE=<Derived=PRIMARY-TUMOR-GENOME-ID,Original=GERMLINE-GENOME-ID>
              ##PEDIGREE=<Derived=SECONDARY1-TUMOR-GENOME-ID,Original=PRIMARY-TUMOR-GENOME-ID>
              ##PEDIGREE=<Derived=SECONDARY2-TUMOR-GENOME-ID,Original=PRIMARY-TUMOR-GENOME-ID>

              G0-ID is target genome, G1-ID..GN-ID are source genomes
              Name_0 is relationship target label, Name_1..Name_N are relationship source labels

              E.g.

              ID1 -- Original ---- Derived --> ID2

              MOTHER-GENOME_ID -- Mother ---- Child --> CHILD-GENOME-ID
              FATHER-GENOME_ID -- Father ---- Child --> CHILD-GENOME-ID

              GERMLINE-GENOME-ID -- Original ---- Derived --> PRIMARY-TUMOR-GENOME-ID
              PRIMARY-TUMOR-GENOME-ID -- Original ---- Derived --> SECONDARY1-TUMOR-GENOME-ID
              PRIMARY-TUMOR-GENOME-ID -- Original ---- Derived --> SECONDARY2-TUMOR-GENOME-ID

              VCF 4.3 spec:

              TBD.

             */
            for (String meta : pedigreeMetaLines) {
                addRelationships(meta);
            }
        }

        /**
         * Parse the specified ##SAMPLE meta header line and register the resulting
         * sample in <code>samplesById</code>, replacing any earlier sample with the same id.
         *
         * @param meta ##SAMPLE meta header line
         * @throws IOException if the line is too short, an entry has no <code>key=value</code>
         *    form, the <code>ID</code> entry is missing, there are fewer <code>Mixture</code>
         *    or <code>Description</code> values than <code>Genomes</code> values, or a
         *    mixture is not a number
         */
        private void addSample(final String meta) throws IOException {
            if (meta.length() < SAMPLE_PREFIX.length()) {
                throw new IOException("invalid ##SAMPLE meta header line: " + meta);
            }
            // entries are separated by commas, multiple values within an entry by semicolons
            ListMultimap<String, String> values = ArrayListMultimap.create();
            for (String token : meta.substring(SAMPLE_PREFIX.length()).split(",")) {
                String[] metaTokens = token.split("=");
                if (metaTokens.length < 2) {
                    throw new IOException("invalid ##SAMPLE meta header line: " + meta);
                }
                String key = metaTokens[0];
                for (String valueToken : metaTokens[1].split(";")) {
                    // strip quotes and the closing angle bracket of the meta line
                    values.put(key, valueToken.replace("\"", "").replace(">", ""));
                }
            }

            List<String> ids = values.get("ID");
            if (ids.isEmpty()) {
                throw new IOException("missing ID in ##SAMPLE meta header line: " + meta);
            }
            String id = ids.get(0);

            // Genomes, Mixture and Description are parallel lists, matched up by position
            List<String> genomeIds = values.get("Genomes");
            List<String> mixtures = values.get("Mixture");
            List<String> descriptions = values.get("Description");
            int size = genomeIds.size();
            if (mixtures.size() < size || descriptions.size() < size) {
                throw new IOException("invalid ##SAMPLE meta header line: " + meta);
            }

            List<VcfGenome> genomes = new ArrayList<>(size);
            for (int i = 0; i < size; i++) {
                double mixture;
                try {
                    mixture = Double.parseDouble(mixtures.get(i));
                }
                catch (NumberFormatException e) {
                    throw new IOException("invalid ##SAMPLE meta header line: " + meta, e);
                }
                genomes.add(new VcfGenome(genomeIds.get(i), mixture, descriptions.get(i)));
            }
            samplesById.put(id, new VcfSample(id, genomes.toArray(new VcfGenome[genomes.size()])));
        }

        /**
         * Add to the builder one relationship per source entry in the specified
         * ##PEDIGREE meta header line.  The first entry is the target; every following
         * entry is a source.  Lines with fewer than two entries add nothing.
         *
         * @param meta ##PEDIGREE meta header line
         * @throws IOException if the line is too short, an entry has no <code>label=id</code>
         *    form, or an id is not a known sample
         */
        private void addRelationships(final String meta) throws IOException {
            // must hold at least the prefix plus the closing angle bracket
            if (meta.length() < PEDIGREE_PREFIX.length() + 1) {
                throw new IOException("invalid ##PEDIGREE meta header line: " + meta);
            }
            // note: need to trim < and > characters
            String[] tokens = meta.substring(PEDIGREE_PREFIX.length(), meta.length() - 1).split(",");
            if (tokens.length < 2) {
                return;
            }

            String[] first = splitPedigreeEntry(tokens[0], meta);
            // in VCF 4.3, targetLabel will always be "ID"
            String targetLabel = first[0];
            VcfSample target = findSample(first[1]);

            for (int i = 1; i < tokens.length; i++) {
                String[] next = splitPedigreeEntry(tokens[i], meta);
                String sourceLabel = next[0];
                VcfSample source = findSample(next[1]);
                builder.withRelationship(source, sourceLabel, target, targetLabel);
            }
        }

        /**
         * Split a single <code>label=id</code> entry of a ##PEDIGREE meta header line.
         *
         * @param token entry to split
         * @param meta full meta header line, used in the error message only
         * @return array with the label at index 0 and the id at index 1
         * @throws IOException if the entry does not split into at least two parts
         */
        private static String[] splitPedigreeEntry(final String token, final String meta) throws IOException {
            String[] parts = token.split("=");
            if (parts.length < 2) {
                throw new IOException("invalid ##PEDIGREE meta header line: " + meta);
            }
            return parts;
        }

        /**
         * Look up a sample by id.
         *
         * @param id sample id
         * @return the sample registered under the specified id
         * @throws IOException if no sample is registered under the specified id
         */
        private VcfSample findSample(final String id) throws IOException {
            VcfSample sample = samplesById.get(id);
            if (sample == null) {
                throw new IOException("VCF sample id " + id + " not found in samples");
            }
            return sample;
        }

        /**
         * Return the VCF pedigree.
         *
         * @return the VCF pedigree built from the relationships added so far
         */
        VcfPedigree getPedigree() {
            return builder.build();
        }
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy