All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.gtex.Gtex Maven / Gradle / Ivy

The newest version!
package org.snpeff.gtex;

import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;

import org.snpeff.fileIterator.LineFileIterator;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;

/**
 * Load data from GTEx files.
 * 
 * References: http://www.broadinstitute.org/gtex/
 * 
 * @author pcingola
 */
public class Gtex implements Iterable {

	public static final String NA_VALUE = "NA";

	boolean verbose = false;
	HashMap experiments;
	HashMap geneId2Index;
	String geneIds[];

	public Gtex() {
		experiments = new HashMap();
		geneId2Index = new HashMap();
	}

	public Collection getGeneIds() {
		return geneId2Index.keySet();
	}

	public int getIndex(String geneId) {
		return geneId2Index.get(geneId);
	}

	@Override
	public Iterator iterator() {
		return experiments.values().iterator();
	}

	public void load(String samples, String data) {
		loadSamples(samples); // Sample definition file
		loadData(data); // Data file
	}

	/**
	 * Load data: Experimental (sometimes normalized) values
	 * @param data
	 */
	protected void loadData(String data) {
		if (verbose) Timer.showStdErr("Loaded GTEx data from file " + data);

		if (!Gpr.canRead(data)) throw new RuntimeException("Cannot find data file '" + data + "'");

		geneId2Index = new HashMap();

		// Open file
		LineFileIterator lfi = new LineFileIterator(data);
		String exIds[] = null;
		int index = 0;
		for (String line : lfi) {
			String fields[] = line.split("\t", -1); // Parse fields

			if (lfi.getLineNum() <= 1) {
				// Parse title
				exIds = fields; // Sort experiment IDs

				// For some reason the IDs in one file are like 'GTEX-N7MS-0007-SM-2D7W1' and in the other file are like 'GTEX.N7MS.0007.SM.2D7W1'
				for (int i = 0; i < exIds.length; i++)
					exIds[i] = exIds[i].replace('.', '-');
			} else {
				// Parse gene ID
				String geneId = fields[0];
				geneId = geneId.split("\\.")[0]; // Remove trailing ".X" form geneID, e.g.:   "ENSG00000228463.3" --> "ENSG00000228463"

				// Add to map(ID => index)
				geneId2Index.put(geneId, index);

				// Add data for each experiment
				for (int i = 2; i < fields.length; i++) {
					// Get appropriate experiment 
					String exId = exIds[i];
					GtexExperiment gtexExperiment = experiments.get(exId);

					// Sanity check
					if (gtexExperiment == null) throw new RuntimeException("Cannot find experiment '" + exId + "'");
					if (gtexExperiment.size() != index) throw new RuntimeException("Index doesn't match: " + gtexExperiment.size() + "!=" + index);

					// Parse data value
					double value = Double.NaN;
					if (!fields[i].equals(NA_VALUE)) value = Gpr.parseDoubleSafe(fields[i]);

					// Add value to experiment
					gtexExperiment.add(value);
				}
				index++;
			}
		}

		if (verbose) Timer.showStdErr("Done. Loaded " + experiments.size() + " experiments.");
	}

	/**
	 * Load samples names
	 * @param samples
	 */
	protected void loadSamples(String samples) {
		if (verbose) Timer.showStdErr("Loaded GTEx experiments from file " + samples);

		if (!Gpr.canRead(samples)) throw new RuntimeException("Cannot find samples file '" + samples + "'");

		experiments = new HashMap();
		LineFileIterator lfi = new LineFileIterator(samples);
		for (String line : lfi) {
			if (lfi.getLineNum() <= 1) continue; // Ignore title

			GtexExperiment gtexExperiment = new GtexExperiment(this, line);
			experiments.put(gtexExperiment.getId(), gtexExperiment);
		}

		if (verbose) Timer.showStdErr("Done. Loaded " + experiments.size() + " experiments.");
	}

	public void setVerbose(boolean verbose) {
		this.verbose = verbose;
	}

	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		for (GtexExperiment gex : experiments.values())
			sb.append(gex + "\n");
		return sb.toString();
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy