// org.biojava.nbio.genome.parsers.genename.GeneNamesParser (source listing; Maven / Gradle / Ivy)
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
* created at 28 Jan 2014
* Author: Andreas Prlic
*/
package org.biojava.nbio.genome.parsers.genename;
import org.biojava.nbio.core.util.InputStreamProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
 * Parses a file from the www.genenames.org website that contains a mapping of human gene names
 * to other databases, and returns a list of {@link GeneName} beans, one per parsed data line.
 *
 * <p>The expected input is tab-separated text whose first line is a legend (column header) and
 * whose remaining lines each hold exactly 13 columns, in the order requested by
 * {@link #DEFAULT_GENENAMES_URL}.
 *
 * @author Andreas Prlic
 *
 */
public class GeneNamesParser {

    private static final Logger logger = LoggerFactory.getLogger(GeneNamesParser.class);

    /** Number of tab-separated columns every data line must have. */
    private static final int EXPECTED_COLUMNS = 13;

    public static final String DEFAULT_GENENAMES_URL = "https://www.genenames.org/cgi-bin/download?title=HGNC+output+data&hgnc_dbtag=on&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=md_mim_id&col=gd_pub_refseq_ids&col=md_ensembl_id&col=md_prot_id&col=gd_hgnc_id" +
            "&status=Approved&status_opt=2&where=((gd_pub_chrom_map%20not%20like%20%27%patch%%27%20and%20gd_pub_chrom_map%20not%20like%20%27%ALT_REF%%27)%20or%20gd_pub_chrom_map%20IS%20NULL)%20and%20gd_locus_group%20%3d%20%27protein-coding%20gene%27&order_by=gd_app_sym_sort&format=text&limit=&submit=submit&.cgifields=&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag";

    /**
     * Parses a file from the genenames website, logs how many gene names were read and
     * logs every entry whose approved symbol is {@code FOLH1}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            List<GeneName> geneNames = getGeneNames();
            logger.info("got {} gene names", geneNames.size());
            for (GeneName g : geneNames) {
                // constant-first comparison stays safe even if a symbol were ever null
                if ("FOLH1".equals(g.getApprovedSymbol())) {
                    logger.info("Gene Name: {}", g);
                }
            }
        } catch (Exception e) {
            // top-level boundary of a command-line entry point: report and exit normally
            logger.error("Exception: ", e);
        }
    }

    /**
     * Get a list of GeneNames by reading from {@link #DEFAULT_GENENAMES_URL}.
     *
     * <p>The stream opened here is closed before this method returns.
     *
     * @return list of geneNames
     * @throws IOException if the URL cannot be opened or read
     */
    public static List<GeneName> getGeneNames() throws IOException {
        URL url = new URL(DEFAULT_GENENAMES_URL);
        InputStreamProvider prov = new InputStreamProvider();
        // this method opened the stream, so it is responsible for closing it
        try (InputStream inStream = prov.getInputStream(url)) {
            return getGeneNames(inStream);
        }
    }

    /**
     * Get a list of GeneNames from an input stream.
     *
     * <p>The first line is treated as the legend and skipped. Every following line is parsed;
     * lines that do not have exactly 13 tab-separated columns are logged and left out of the
     * result. The stream is read as UTF-8 and is <em>not</em> closed by this method; closing it
     * remains the caller's responsibility.
     *
     * @param inStream stream providing the tab-separated gene name table
     * @return list of geneNames
     * @throws IOException if reading from the stream fails
     */
    public static List<GeneName> getGeneNames(InputStream inStream) throws IOException {
        ArrayList<GeneName> geneNames = new ArrayList<>();

        // explicit charset: do not depend on the platform default encoding
        BufferedReader reader =
                new BufferedReader(new InputStreamReader(inStream, StandardCharsets.UTF_8));

        // skip reading first line (it is the legend)
        reader.readLine();

        String line;
        while ((line = reader.readLine()) != null) {
            GeneName geneName = getGeneName(line);
            if (geneName != null) {
                geneNames.add(geneName);
            }
        }

        // since this is a large list, let's free up unused space...
        geneNames.trimToSize();
        return geneNames;
    }

    /**
     * Converts one tab-separated data line into a {@link GeneName}.
     *
     * @param line a single data line, may be {@code null}
     * @return the parsed bean, or {@code null} if {@code line} is {@code null} or does not
     *         have exactly 13 columns
     */
    private static GeneName getGeneName(String line) {
        // data is in this order (matches the "col=" parameters of DEFAULT_GENENAMES_URL):
        // [Approved Symbol, Approved Name, Status, Previous Symbols, Previous Names, Synonyms,
        //  Chromosome, Accession Numbers, OMIM ID, RefSeq IDs, Ensembl Gene ID, UniProt ID,
        //  HGNC ID]
        if (line == null) {
            return null;
        }

        // limit -1 keeps trailing empty columns; the default split would drop them and make
        // a complete row with empty trailing fields look too short
        String[] s = line.split("\t", -1);

        if (s.length != EXPECTED_COLUMNS) {
            logger.warn("Line does not contain {} data items, but {}: {}",
                    EXPECTED_COLUMNS, s.length, line);
            // make the tab positions visible to help diagnose the malformed line
            logger.warn("{}", line.replace("\t", "|---|"));
            return null;
        }

        GeneName gn = new GeneName();
        gn.setApprovedSymbol(s[0]);
        gn.setApprovedName(s[1]);
        gn.setStatus(s[2]);
        gn.setPreviousSymbols(s[3]);
        gn.setPreviousNames(s[4]);
        gn.setSynonyms(s[5]);
        gn.setChromosome(s[6]);
        gn.setAccessionNr(s[7]);
        gn.setOmimId(s[8]);
        gn.setRefseqIds(s[9]);
        gn.setEnsemblGeneId(s[10]);
        gn.setUniprot(s[11]);
        gn.setHgncId(s[12]);
        return gn;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy