de.charite.compbio.jannovar.hgnc.HGNCParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jannovar-core Show documentation
Show all versions of jannovar-core Show documentation
jannovar-core is a library for the functional annotation of genomic variants
package de.charite.compbio.jannovar.hgnc;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;

import de.charite.compbio.jannovar.Immutable;
import de.charite.compbio.jannovar.JannovarException;
// TODO(holtgrewe): test me
/**
 * Parser for hgnc_complete_set.txt
 *
 * <p>
 * The file is read as tab-separated text. Blank lines are skipped, the first non-blank line is taken as the header
 * and every following non-blank line is converted into one {@link HGNCRecord}. Columns are looked up by their header
 * name, so the order of the columns in the file does not matter.
 * </p>
 *
 * @author Manuel Holtgrewe
 */
@Immutable
public class HGNCParser {

	/** Download URL for the HGNC complete set TSV file */
	public static final String DOWNLOAD_URL = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt";

	/** Path to the file to parse */
	private final String path;

	/**
	 * @param path
	 *            path to the HGNC TSV file that {@link #run()} will read
	 */
	public HGNCParser(String path) {
		super();
		this.path = path;
	}

	/**
	 * @param arr
	 *            fields of the current line
	 * @param idx
	 *            index of the requested column, <code>null</code> if the header does not contain the column
	 * @return entry in the given index or the empty string if the column is unknown or the line is too short
	 */
	private static String getField(String[] arr, Integer idx) {
		// String.split() drops trailing empty fields, so short rows are expected; a null index means that the
		// header did not declare the column at all.
		if (idx == null || idx >= arr.length)
			return "";
		else
			return arr[idx];
	}

	/**
	 * Read the file and convert every data line into a {@link HGNCRecord}.
	 *
	 * @return ImmutableList with {@link HGNCRecord}s
	 * @throws JannovarException
	 *             if there is a problem with opening or reading the file
	 */
	public ImmutableList<HGNCRecord> run() throws JannovarException {
		final ImmutableList.Builder<HGNCRecord> result = ImmutableList.builder();
		Map<String, Integer> header = null;

		// Decode explicitly as UTF-8 instead of relying on the platform default charset; FileInputStream still
		// raises FileNotFoundException so the error reporting below is unchanged.
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
			String line;
			while ((line = br.readLine()) != null) {
				if (line.trim().isEmpty())
					continue; // skip

				if (header == null) {
					// read header: map column name to its position
					header = new HashMap<String, Integer>();
					int i = 0;
					for (String s : line.split("\t"))
						header.put(s, i++);
				} else {
					result.add(parseRecord(header, line));
				}
			}
		} catch (FileNotFoundException e) {
			throw new JannovarException("Problem opening HGNC file", e);
		} catch (IOException e) {
			throw new JannovarException("Problem reading HGNC file", e);
		}

		return result.build();
	}

	/**
	 * Build one {@link HGNCRecord} from a data line, using the header to locate each column.
	 *
	 * @param header
	 *            mapping from column name to column index
	 * @param line
	 *            tab-separated data line
	 * @return record with the values of <code>line</code>; missing columns yield empty strings
	 */
	private static HGNCRecord parseRecord(Map<String, Integer> header, String line) {
		final String[] arr = line.split("\t");

		final String hgncID = getField(arr, header.get("hgnc_id"));
		final String symbol = getField(arr, header.get("symbol"));
		final String name = getField(arr, header.get("name"));
		final ImmutableList<String> aliasSymbols = splitField(getField(arr, header.get("alias_symbol")));
		final ImmutableList<String> prevSymbols = splitField(getField(arr, header.get("prev_symbol")));
		final String entrezID = getField(arr, header.get("entrez_id"));
		final String ensemblGeneID = getField(arr, header.get("ensembl_gene_id"));
		final String vegaID = getField(arr, header.get("vega_id"));
		final String ucscID = getField(arr, header.get("ucsc_id"));
		final String enaID = getField(arr, header.get("ena"));
		final String refseqAccession = getField(arr, header.get("refseq_accession"));
		final ImmutableList<String> ccdsIDs = splitField(getField(arr, header.get("ccds_id")));
		final ImmutableList<String> uniprotIDs = splitField(getField(arr, header.get("uniprot_ids")));
		final ImmutableList<String> pubmedIDs = splitField(getField(arr, header.get("pubmed_id")));
		final String mgdID = getField(arr, header.get("mgd_id"));
		final String rgdID = getField(arr, header.get("rgd_id"));
		final String cosmicID = getField(arr, header.get("cosmic"));
		final String omimID = getField(arr, header.get("omim_id"));

		return new HGNCRecord(hgncID, symbol, name, aliasSymbols, prevSymbols, entrezID, ensemblGeneID, vegaID,
				ucscID, enaID, refseqAccession, ccdsIDs, uniprotIDs, pubmedIDs, mgdID, rgdID, cosmicID, omimID);
	}

	/**
	 * Split a possibly multi-valued field.
	 *
	 * @param f
	 *            raw field value
	 * @return if <code>f</code> is enclosed in double quotes, the quoted content split at <code>'|'</code>;
	 *         otherwise a one-element list containing <code>f</code> itself
	 */
	private static ImmutableList<String> splitField(String f) {
		// The length check keeps a lone '"' (which both starts and ends with a quote) from reaching
		// substring(1, 0), which would throw StringIndexOutOfBoundsException.
		if (f.length() >= 2 && f.startsWith("\"") && f.endsWith("\"")) {
			f = f.substring(1, f.length() - 1);
			return ImmutableList.copyOf(Splitter.on('|').split(f));
		} else {
			return ImmutableList.of(f);
		}
	}

}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy