All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.genome.parsers.genename.GeneChromosomePositionParser Maven / Gradle / Ivy

/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * created at 28 Jan 2014
 * Author: ap3
 */

package org.biojava.nbio.genome.parsers.genename;

import org.biojava.nbio.genome.App;
import org.biojava.nbio.core.util.InputStreamProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

/**
 * A parser for the tab-separated gene annotation file published by the UCSC genome browser
 * (see {@link #DEFAULT_MAPPING_URL}) that maps gene names to chromosome positions.
 *
 * <p>Every line is expected to hold exactly 11 tab-separated columns, read in this order:
 * gene name, GenBank id, chromosome, orientation, transcription start, transcription end,
 * CDS start, CDS end, exon count, comma-separated exon starts, comma-separated exon ends.
 * Lines that do not match this layout are logged at WARN level and skipped.
 *
 * @author Andreas Prlic
 *
 */
public class GeneChromosomePositionParser {

	// Bound to this class (not to App) so log output is attributed to the parser.
	private static final Logger logger = LoggerFactory.getLogger(GeneChromosomePositionParser.class);

	/** Location of the gzip-compressed mapping file that is used when no input stream is supplied. */
	public static final String DEFAULT_MAPPING_URL="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refFlat.txt.gz";

	/**
	 * Small demo: downloads the default mapping file, logs how many positions were parsed and
	 * logs the first entry whose gene name is "FOLH1". Any failure is logged, not rethrown.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args){
		try {

			List<GeneChromosomePosition> genePositions = getChromosomeMappings();
			logger.info("got {} gene positions", genePositions.size());

			for (GeneChromosomePosition pos : genePositions){
				if ( "FOLH1".equals(pos.getGeneName())) {
					logger.info("Gene Position: {}", pos);
					break;
				}
			}

		} catch(Exception e){
			logger.error("Exception: ", e);
		}
	}

	/**
	 * Fetches {@link #DEFAULT_MAPPING_URL} and parses it.
	 *
	 * @return the parsed gene positions, one per well-formed line
	 * @throws IOException if the file cannot be opened or read
	 */
	public static List<GeneChromosomePosition> getChromosomeMappings() throws IOException {

		URL url = new URL(DEFAULT_MAPPING_URL);

		InputStreamProvider prov = new InputStreamProvider();

		InputStream inStream = prov.getInputStream(url);
		try {
			return getChromosomeMappings(inStream);
		} finally {
			// The stream was opened here, so it is released here as well.
			if (inStream != null) {
				inStream.close();
			}
		}
	}

	/**
	 * Parses gene positions from an already opened stream, one record per line.
	 *
	 * <p>The stream is read to its end but is <em>not</em> closed; closing it remains the
	 * responsibility of the caller.
	 *
	 * @param inStream uncompressed, tab-separated mapping data
	 * @return the parsed gene positions; malformed lines are skipped
	 * @throws IOException if reading from the stream fails
	 */
	public static List<GeneChromosomePosition> getChromosomeMappings(InputStream inStream) throws IOException {
		// Name the charset explicitly so parsing does not depend on the platform default.
		BufferedReader reader = new BufferedReader(new InputStreamReader(inStream, "UTF-8"));

		ArrayList<GeneChromosomePosition> gcps = new ArrayList<GeneChromosomePosition>();

		String line;
		while ((line = reader.readLine()) != null) {
			GeneChromosomePosition gcp = getGeneChromosomePosition(line);
			if (gcp != null) {
				gcps.add(gcp);
			}
		}

		// since this is a large list, remove empty content.
		gcps.trimToSize();
		return gcps;
	}

	/**
	 * Converts one line of the mapping file into a {@link GeneChromosomePosition}.
	 *
	 * @param line a single tab-separated line, may be null
	 * @return the parsed position, or null if the line is null, does not have 11 columns,
	 *         has an empty orientation column or contains a non-numeric coordinate
	 */
	private static GeneChromosomePosition getGeneChromosomePosition(String line) {
		if (line == null) {
			return null;
		}
		String[] spl = line.split("\t");

		if (spl.length != 11) {
			logger.warn("Line does not have 11 data items, but {}: {}", spl.length, line);
			return null;
		}

		if (spl[3].isEmpty()) {
			// charAt(0) below would throw on an empty orientation column
			logger.warn("Line has an empty orientation column: {}", line);
			return null;
		}

		try {
			GeneChromosomePosition g = new GeneChromosomePosition();

			g.setGeneName(spl[0]);
			g.setGenebankId(spl[1]);
			g.setChromosome(spl[2]);
			g.setOrientation(spl[3].charAt(0));
			g.setTranscriptionStart(Integer.parseInt(spl[4]));
			g.setTranscriptionEnd(Integer.parseInt(spl[5]));
			g.setCdsStart(Integer.parseInt(spl[6]));
			g.setCdsEnd(Integer.parseInt(spl[7]));
			g.setExonCount(Integer.parseInt(spl[8]));
			g.setExonStarts(getIntegerList(spl[9]));
			g.setExonEnds(getIntegerList(spl[10]));

			return g;
		} catch (NumberFormatException e) {
			// Treat an unparsable number (e.g. a header row) like any other malformed line:
			// report it and carry on, instead of aborting the whole file.
			logger.warn("Line contains a non-numeric value ({}): {}", e.getMessage(), line);
			return null;
		}
	}

	/**
	 * Parses a comma-separated list of integers such as "100,200,300,".
	 *
	 * <p>Empty tokens are ignored. {@link String#split(String)} already drops trailing empty
	 * tokens; leading or doubled commas and an entirely empty input are handled here.
	 *
	 * @param lst the comma-separated values
	 * @return the parsed integers in input order, possibly empty
	 * @throws NumberFormatException if a non-empty token is not a valid integer
	 */
	private static List<Integer> getIntegerList(String lst){
		String[] spl = lst.split(",");
		ArrayList<Integer> l = new ArrayList<Integer>(spl.length);
		for (String s : spl){
			if (s.isEmpty()) {
				continue;
			}
			l.add(Integer.parseInt(s));
		}
		l.trimToSize();
		return l;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy