All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.genome.parsers.genename.GeneNamesParser Maven / Gradle / Ivy

There is a newer version: 7.1.3
Show newest version
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * created at 28 Jan 2014
 * Author: Andreas Prlic
 */

package org.biojava.nbio.genome.parsers.genename;

import org.biojava.nbio.core.util.InputStreamProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/** 
 * Parses a file from the www.genenames.org website that contains a mapping of human gene names to other databases
 *
 * @author Andreas Prlic
 *
 */
public class GeneNamesParser {

	private static final Logger logger = LoggerFactory.getLogger(GeneNamesParser.class);

	public static final String DEFAULT_GENENAMES_URL = "https://www.genenames.org/cgi-bin/download?title=HGNC+output+data&hgnc_dbtag=on&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=md_mim_id&col=gd_pub_refseq_ids&col=md_ensembl_id&col=md_prot_id&col=gd_hgnc_id" +
			 "&status=Approved&status_opt=2&where=((gd_pub_chrom_map%20not%20like%20%27%patch%%27%20and%20gd_pub_chrom_map%20not%20like%20%27%ALT_REF%%27)%20or%20gd_pub_chrom_map%20IS%20NULL)%20and%20gd_locus_group%20%3d%20%27protein-coding%20gene%27&order_by=gd_app_sym_sort&format=text&limit=&submit=submit&.cgifields=&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag";

	/** parses a file from the genenames website
	 *
	 * @param args
	 */
	public static void main(String[] args) {

		try {

			List geneNames = getGeneNames();

			logger.info("got {} gene names", geneNames.size());

			for ( GeneName g : geneNames){
				if ( g.getApprovedSymbol().equals("FOLH1"))
					logger.info("Gene Name: {}", g);
			}
			// and returns a list of beans that contains key-value pairs for each gene name

		} catch (Exception e) {
			// TODO Auto-generated catch block
			logger.error("Exception: ", e);
		}

	}


	public static List getGeneNames() throws IOException{
		URL url = new URL(DEFAULT_GENENAMES_URL);

		InputStreamProvider prov = new InputStreamProvider();

		InputStream inStream = prov.getInputStream(url);

		return getGeneNames(inStream);
	}

	/** Get a list of GeneNames from an input stream.
	 *
	 * @param inStream
	 * @return list of geneNames
	 * @throws IOException
	 */
	public static List getGeneNames(InputStream inStream) throws IOException{

		ArrayList geneNames = new ArrayList();
		BufferedReader reader = new BufferedReader(new InputStreamReader(inStream));

		// skip reading first line (it is the legend)
		String line = reader.readLine();

		while ((line = reader.readLine()) != null) {
			// process line...
			//System.out.println(Arrays.toString(line.split("\t")));

			GeneName  geneName = getGeneName(line);
			if ( geneName != null)
				geneNames.add(geneName);
				//System.out.println(geneName);

		}

		// since this is a large list, let's free up unused space...
		geneNames.trimToSize();
		return geneNames;
	}

	private static GeneName getGeneName(String line) {
		// data is in this order:
		//[HGNC ID, Approved Symbol, Approved Name, Status, Previous Symbols,
		// Previous Names, Synonyms, Chromosome, Accession Numbers, RefSeq IDs, UniProt ID(supplied by UniProt)]

		if (line == null)
			return null;

		String[] s = line.split("\t");

		if ( s.length != 13) {
			logger.warn("Line does not contain 13 data items, but {}: {}", s.length, line);
			logger.warn(line.replaceAll("\t", "|---|"));
			return null;
		}
		GeneName gn = new GeneName();


		gn.setApprovedSymbol(s[0]);
		gn.setApprovedName(s[1]);
		gn.setStatus(s[2]);
		gn.setPreviousSymbols(s[3]);
		gn.setPreviousNames(s[4]);
		gn.setSynonyms(s[5]);
		gn.setChromosome(s[6]);
		gn.setAccessionNr(s[7]);
		gn.setOmimId(s[8]);
		gn.setRefseqIds(s[9]);
		gn.setEnsemblGeneId(s[10]);
		gn.setUniprot(s[11]);
		gn.setHgncId(s[12]);

		return gn;

	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy