All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.reactome.Reactome Maven / Gradle / Ivy

The newest version!
package org.snpeff.reactome;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.snpeff.collections.AutoHashMap;
import org.snpeff.fileIterator.LineFileIterator;
import org.snpeff.gtex.Gtex;
import org.snpeff.gtex.GtexExperiment;
import org.snpeff.reactome.events.BlackBoxEvent;
import org.snpeff.reactome.events.CatalystActivity;
import org.snpeff.reactome.events.Complex;
import org.snpeff.reactome.events.Depolymerisation;
import org.snpeff.reactome.events.Event;
import org.snpeff.reactome.events.Pathway;
import org.snpeff.reactome.events.Polymerisation;
import org.snpeff.reactome.events.Reaction;
import org.snpeff.stats.CountByType;
import org.snpeff.util.Gpr;
import org.snpeff.util.Timer;

/**
 * Load reactome data from TXT files
 * 
 * @author pcingola
 *
 */
public class Reactome implements Iterable {

	public static final int SHOW_EVERY = 10000;
	public static final double EPSILON = 1E-6;
	public static final double MAX_CONVERGENCE_DIFFERENCE = 1E-3;
	public static final int MAX_ITERATIONS = 1000;

	boolean verbose = false;
	String dirName;
	HashMap entityById;
	HashMap objectType;
	HashMap objectName;
	AutoHashMap> entitiesByGeneId;
	HashSet entitiesGeneId = new HashSet();
	Monitor monitor; // Monitor all nodes in the circuit
	Monitor monitorTrace; // Monitor a specific set of nodes (usually one node and all it's predecessors)

	/**
	 * Find and return first Regexp occurrence (null if nothing is found)
	 * @param pattern
	 * @param str
	 * @return
	 */
	static String findRegexp(Pattern pattern, String str) {
		Matcher m = pattern.matcher(str);
		if (!m.find()) return null;
		return m.group();
	}

	static boolean hasRegexp(Pattern pattern, String str) {
		Matcher m = pattern.matcher(str);
		return m.find();
	}

	/**
	 * Main
	 * @param args
	 */
	public static void main(String[] args) {
		String reactomeDir = Gpr.HOME + "/snpEff/db/reactome/txt/";
		String geneIdsFile = Gpr.HOME + "/snpEff/db/reactome/gene_ids/biomart_query_uniq.txt";
		String gtexDir = Gpr.HOME + "/snpEff/db/GTEx";
		String gtexSamples = gtexDir + "/GTEx_Analysis_Annotations_Sample_DS__Pilot_2013_01_31.txt";
		String gtexData = gtexDir + "/gtex_norm.10.txt";

		// Load reactome data
		Timer.showStdErr("Loading reactome data");
		Reactome reactome = new Reactome();
		reactome.setVerbose(true);
		reactome.load(reactomeDir, geneIdsFile);

		// Load GTEX data
		Timer.showStdErr("Loading GTEx data");
		Gtex gtex = new Gtex();
		gtex.setVerbose(true);
		gtex.load(gtexSamples, gtexData);

		reactome.simplifyEntities();

		// Simulate
		Timer.showStdErr("Running");
		reactome.run(gtex, null);

		// Save results
		String file = Gpr.HOME + "/circuit.txt";
		Timer.showStdErr("Saving results to '" + file + "'");
		if (reactome.monitor != null) reactome.monitor.save(file);
		if (reactome.monitorTrace != null) reactome.monitorTrace.save(file);
	}

	public Reactome() {
		entityById = new HashMap();
		entitiesByGeneId = new AutoHashMap>(new ArrayList());
	}

	/**
	 * Add an entity <-> geneId
	 * @param entity
	 * @param geneId
	 */
	public void add(Entity entity, String geneId) {
		String key = geneId + "\t" + entity.getId();
		if (entitiesGeneId.contains(key)) return; // Don't add more then once

		entity.addGeneId(geneId);
		entitiesByGeneId.getOrCreate(geneId).add(entity);
		entitiesGeneId.add(key);
	}

	/**
	 * Iterate network until convergence
	 * 
	 * @param gtexExperiment
	 */
	boolean calc(GtexExperiment gtexExperiment) {
		boolean changed = true;
		int iteration;

		if (verbose) System.err.print(gtexExperiment.getTissueTypeDetail() + "\t");
		for (iteration = 0; changed && iteration < MAX_ITERATIONS; iteration++) {
			changed = false;
			HashSet done = new HashSet();

			for (Entity e : this) {
				double outPrev = e.getOutput();
				double out = e.calc(done);

				// Output changed?
				if (Math.abs(outPrev - out) > MAX_CONVERGENCE_DIFFERENCE) changed = true;
			}
			if (verbose) System.err.print(".");
		}
		if (verbose) System.err.println(" " + iteration);

		return changed;
	}

	/** 
	 * Create a monitor for all nodes in the circuit
	 */
	Monitor createMonitor() {
		Monitor monitor = new Monitor();
		for (Entity e : this) {
			if (!e.isFixed() && e.isReaction()) monitor.add(e);
		}

		monitor.sort();
		return monitor;
	}

	/** 
	 * Create a monitor for a subset of nodes that explain "target's" output
	 */
	Monitor createMonitor(String targetNodeId) {
		// Perform 1 iteration to get a set of all nodes required for target's output
		reset();
		Entity target = entityById.get(targetNodeId);
		HashSet done = new HashSet();
		target.calc(done);

		Monitor monitor = new Monitor();
		for (Entity e : done)
			monitor.add(e);
		monitor.sort();

		return monitor;
	}

	Entity getEntity(int id) {
		return entityById.get(Integer.toString(id));
	}

	public Monitor getMonitor() {
		return monitor;
	}

	public Monitor getMonitorTrace() {
		return monitorTrace;
	}

	/**
	 * Get or create a new entity
	 * @param id
	 * @return
	 */
	Entity getOrCreateEntity(String id) {
		// Get from hash
		Entity e = entityById.get(id);
		if (e != null) return e;

		// Not available? Create entity
		String type = objectType.get(id);
		if (type == null) throw new RuntimeException("Cannot find entity type for ID '" + id + "'");
		String name = objectName.get(id);
		int idNum = Gpr.parseIntSafe(id);

		// Create according to object type
		if (type.equals("Complex")) e = new Complex(idNum, name);
		else if (type.equals("EntityCompartment") || type.equals("Compartment") || type.equals("GO_CellularComponent")) e = new Compartment(idNum, name);
		else if (type.equals("Reaction")) e = new Reaction(idNum, name);
		else if (type.equals("BlackBoxEvent")) e = new BlackBoxEvent(idNum, name);
		else if (type.equals("Pathway")) e = new Pathway(idNum, name);
		else if (type.equals("Depolymerisation")) e = new Depolymerisation(idNum, name);
		else if (type.equals("Polymerisation")) e = new Polymerisation(idNum, name);
		else if (type.equals("CatalystActivity")) e = new CatalystActivity(idNum, name);
		else e = new Entity(idNum, name);

		// Add to maps
		entityById.put(id, e);

		return e;
	}

	@Override
	public Iterator iterator() {
		return entityById.values().iterator();
	}

	public void load(String dirName, String geneIdsFile) {
		this.dirName = dirName;
		if (verbose) Timer.showStdErr("Loading Reactome data from directory '" + dirName + "'");

		loadDatabaseObjects(); // Load a map of all object names and types
		loadComplex2HasComponent(); // Load Complex_2_hasComponent
		loadPhysicalEntity2Compartment(); // Load compartment information
		loadPathway2HasEvent(); // Load pathway data
		loadReactionlikeEvent2Input(); // Load reaction inputs
		loadReactionlikeEvent2Output(); // Load reaction outputs
		loadReactionlikeEvent2CatalystActivity(); // Load reaction catalysts
		loadRegulation(); // Load reaction regulators
		loadCatalystActivity(); // Load catalyst

		// Remove cached data, we don't need it any more
		objectType = null;
		objectName = null;

		// Load Gene IDs data
		loadGeneIds(geneIdsFile);

		if (verbose) Timer.showStdErr("Loading finished");
	}

	/**
	 * Load catalyst activity to molecule mapping
	 */
	protected void loadCatalystActivity() {
		String name = "CatalystActivity";
		String fileName = dirName + name + ".txt";
		if (verbose) Timer.showStdErr("Loading " + name + " from '" + fileName + "'");

		int i = 1;
		for (String line : Gpr.readFile(fileName).split("\n")) {
			// Parse line
			String rec[] = line.split("\t");
			String id = rec[0];
			String entityId = rec[1];

			if (id.equals("DB_ID")) continue; // Skip title

			// Add event to pathway
			CatalystActivity reaction = (CatalystActivity) entityById.get(id);
			if (reaction == null) continue; // Reaction not found? Skip

			Entity e = getOrCreateEntity(entityId);
			reaction.addInput(e);

			if (verbose) Gpr.showMark(i++, SHOW_EVERY);
		}

		if (verbose) System.err.println("");
		if (verbose) Timer.showStdErr("Total catalyst entities assigned: " + (i - 1));
	}

	/**
	 * Load complexes
	 * @param name
	 * @param fileName
	 * @param map
	 */
	protected void loadComplex2HasComponent() {
		String fileName = dirName + "Complex_2_hasComponent.txt";
		if (verbose) Timer.showStdErr("Loading Complex_2_hasComponent from '" + fileName + "'");

		int i = 1;
		for (String line : Gpr.readFile(fileName).split("\n")) {
			// Parse line
			String rec[] = line.split("\t");
			String id = rec[0];
			int idNum = Gpr.parseIntSafe(id);
			String componentId = rec[1];

			if (idNum == 0) continue; // Skip title

			// Get complex and add entity
			Complex c = (Complex) getOrCreateEntity(id);
			c.add(getOrCreateEntity(componentId));

			if (verbose) Gpr.showMark(i++, SHOW_EVERY);
		}

		if (verbose) System.err.println("");
		if (verbose) Timer.showStdErr("Total entities added: " + entityById.size());
	}

	/**
	 * Load objects table (populate objectType and objectName maps)
	 */
	protected void loadDatabaseObjects() {
		String fileName = dirName + "DatabaseObject.txt";
		if (verbose) Timer.showStdErr("Loading objects from '" + fileName + "'");

		// Ensure capacity
		int numObjects = Gpr.countLines(fileName);
		if (verbose) Timer.showStdErr("Counting lines from '" + fileName + "'. Total lines: " + numObjects);
		objectType = new HashMap(numObjects);
		objectName = new HashMap(numObjects);

		// Load objects
		int i = 1;
		LineFileIterator lfi = new LineFileIterator(fileName);
		for (String line : lfi) {
			// Parse line
			String recs[] = line.split("\t");
			if (recs.length < 3) continue;
			String id = recs[0];
			String objType = recs[1];
			String objName = recs[2];

			// Add info
			objectType.put(id, objType);
			objectName.put(id, objName);

			if (verbose) Gpr.showMark(i++, SHOW_EVERY);
		}

		if (verbose) System.err.println("");
		if (verbose) Timer.showStdErr("Total objects loaded: " + objectName.size());
	}

	/**
	 * Load Gene IDs data, then map geneIDs <-> Entities
	 * 
	 * @param geneIdsFile
	 */
	public void loadGeneIds(String geneIdsFile) {
		//---
		// Load Gene IDs data
		//---
		if (verbose) Timer.showStdErr("Loading Gene IDs from " + geneIdsFile);
		GeneIds geneIds = new GeneIds(geneIdsFile);

		// Assign to geneIDs
		Pattern patternEnsg = Pattern.compile("ENSG[0-9]*");
		Pattern patternEnst = Pattern.compile("ENST[0-9]*");
		Pattern patternRefSeqNm = Pattern.compile("NM_[0-9]*");
		Pattern patternRefSeqNp = Pattern.compile("NP_[0-9]*");

		// Find matching gene IDs for all entities
		int countMatched = 0, countUnMatched = 0;
		for (Entity e : this) {
			String name = e.getName();

			List gids = null;
			String gname = null;

			// Try to match: From most specific to least specific
			if (hasRegexp(patternEnst, name)) {
				// Found ENSEMBL transcript
				gname = findRegexp(patternEnst, name);
				gids = geneIds.getId2tr().getIds(gname);
			} else if (hasRegexp(patternEnsg, name)) {
				// Found ENSEMBL gene ID
				gname = findRegexp(patternEnsg, name);
				gids = new LinkedList();
				gids.add(gname);
			} else if (hasRegexp(patternRefSeqNm, name)) {
				// Found RefeSeq mRNA ID
				gname = findRegexp(patternRefSeqNm, name);
				gids = geneIds.getId2refseqId().getIds(gname);
			} else if (hasRegexp(patternRefSeqNp, name)) {
				// Found RefeSeq protein ID
				gname = findRegexp(patternRefSeqNp, name);
				gids = geneIds.getId2refseqProtId().getIds(gname);
			} else {
				// May be it's gene name followed by some other stuff
				gname = name.split(" ")[0];
				gids = geneIds.getId2geneName().getIds(gname);
			}

			// Found any GeneIDs
			if (gids != null) {
				countMatched++;

				// Add all gene IDs
				for (String gid : gids)
					add(e, gid);

			} else countUnMatched++;

		}

		if (verbose) Timer.showStdErr("Done. Entities matched to geneIDs:" + countMatched + " / " + countUnMatched);
	}

	/**
	 * Load a two-column file into a Hash
	 * @param name
	 * @param fileName
	 * @param map
	 */
	protected void loadMap(String name, String fileName, HashMap map) {
		if (verbose) Timer.showStdErr("Loading " + name + " from '" + fileName + "'");

		int i = 1;
		for (String line : Gpr.readFile(fileName).split("\n")) {
			String rec[] = line.split("\t");
			String id = rec[0];
			String componentId = rec[1];
			map.put(id, componentId);
			if (verbose) Gpr.showMark(i++, SHOW_EVERY);
		}

		if (verbose) System.err.println("");
		if (verbose) Timer.showStdErr("Total objects loaded: " + map.size());
	}

	/**
	 * Load pathway events
	 * @param name
	 * @param fileName
	 * @param map
	 */
	protected void loadPathway2HasEvent() {
		String name = "Pathway_2_hasEvent";
		String fileName = dirName + name + ".txt";
		if (verbose) Timer.showStdErr("Loading " + name + " from '" + fileName + "'");

		int i = 1;
		for (String line : Gpr.readFile(fileName).split("\n")) {
			// Parse line
			String rec[] = line.split("\t");
			String id = rec[0];
			String eventId = rec[1];

			if (id.equals("DB_ID")) continue; // Skip title

			// Add event to pathway
			Pathway pathway = (Pathway) getOrCreateEntity(id);
			Event event = (Event) getOrCreateEntity(eventId);
			pathway.add(event);

			if (verbose) Gpr.showMark(i++, SHOW_EVERY);
		}

		if (verbose) System.err.println("");
		if (verbose) Timer.showStdErr("Total events assigned: " + (i - 1));
	}

	/**
	 * Load compartment information
	 * @param name
	 * @param fileName
	 * @param map
	 */
	protected void loadPhysicalEntity2Compartment() {
		String name = "PhysicalEntity_2_compartment";
		String fileName = dirName + name + ".txt";
		if (verbose) Timer.showStdErr("Loading " + name + " from '" + fileName + "'");

		int i = 1;
		for (String line : Gpr.readFile(fileName).split("\n")) {
			// Parse line
			String rec[] = line.split("\t");
			String id = rec[0];
			String compartmentId = rec[1];

			if (id.equals("DB_ID")) continue; // Skip title

			// Get entity & compartment
			Entity e = getOrCreateEntity(id);
			Compartment compartment = (Compartment) getOrCreateEntity(compartmentId);

			// Assign compartment (if not already assigned)
			if (e.getCompartment() != null) throw new RuntimeException("Compartment already assigned for entity: " + e);
			e.setCompartment(compartment);

			if (verbose) Gpr.showMark(i++, SHOW_EVERY);
		}

		if (verbose) System.err.println("");
		if (verbose) Timer.showStdErr("Total compartments assigned: " + (i - 1));
	}

	/**
	 * Load reaction catalyst
	 * @param name
	 * @param fileName
	 * @param map
	 */
	protected void loadReactionlikeEvent2CatalystActivity() {
		String name = "ReactionlikeEvent_2_catalystActivity";
		String fileName = dirName + name + ".txt";
		if (verbose) Timer.showStdErr("Loading " + name + " from '" + fileName + "'");

		int i = 1;
		for (String line : Gpr.readFile(fileName).split("\n")) {
			// Parse line
			String rec[] = line.split("\t");
			String id = rec[0];
			String catalystId = rec[1];

			if (id.equals("DB_ID")) continue; // Skip title

			// Add event to pathway
			Reaction reaction = (Reaction) entityById.get(id);
			if (reaction == null) continue; // Reaction not found? Skip

			Entity e = getOrCreateEntity(catalystId);
			reaction.addCatalyst(e);

			if (verbose) Gpr.showMark(i++, SHOW_EVERY);
		}

		if (verbose) System.err.println("");
		if (verbose) Timer.showStdErr("Total outputs assigned: " + (i - 1));
	}

	/**
	 * Load reaction inputs
	 * @param name
	 * @param fileName
	 * @param map
	 */
	protected void loadReactionlikeEvent2Input() {
		String name = "ReactionlikeEvent_2_input";
		String fileName = dirName + name + ".txt";
		if (verbose) Timer.showStdErr("Loading " + name + " from '" + fileName + "'");

		int i = 1;
		for (String line : Gpr.readFile(fileName).split("\n")) {
			// Parse line
			String rec[] = line.split("\t");
			String id = rec[0];
			String inputId = rec[1];

			if (id.equals("DB_ID")) continue; // Skip title

			// Add event to pathway
			Reaction reaction = (Reaction) entityById.get(id);
			if (reaction == null) continue; // Reaction not found? Skip

			Entity e = getOrCreateEntity(inputId);
			reaction.addInput(e);

			if (verbose) Gpr.showMark(i++, SHOW_EVERY);
		}

		if (verbose) System.err.println("");
		if (verbose) Timer.showStdErr("Total inputs assigned: " + (i - 1));
	}

	/**
	 * Load reaction outputs
	 * @param name
	 * @param fileName
	 * @param map
	 */
	protected void loadReactionlikeEvent2Output() {
		String name = "ReactionlikeEvent_2_output";
		String fileName = dirName + name + ".txt";
		if (verbose) Timer.showStdErr("Loading " + name + " from '" + fileName + "'");

		int i = 1;
		for (String line : Gpr.readFile(fileName).split("\n")) {
			// Parse line
			String rec[] = line.split("\t");
			String id = rec[0];
			String outputId = rec[1];

			if (id.equals("DB_ID")) continue; // Skip title

			// Add event to pathway
			Reaction reaction = (Reaction) entityById.get(id);
			if (reaction == null) continue; // Reaction not found? Skip

			Entity e = getOrCreateEntity(outputId);
			reaction.addOutput(e);

			if (verbose) Gpr.showMark(i++, SHOW_EVERY);
		}

		if (verbose) System.err.println("");
		if (verbose) Timer.showStdErr("Total outputs assigned: " + (i - 1));
	}

	/**
	 * Load reaction regulation
	 * @param name
	 * @param fileName
	 * @param map
	 */
	protected void loadRegulation() {
		String name = "Regulation";
		String fileName = dirName + name + ".txt";
		if (verbose) Timer.showStdErr("Loading " + name + " from '" + fileName + "'");

		int i = 1;
		for (String line : Gpr.readFile(fileName).split("\n")) {
			// Parse line
			String rec[] = line.split("\t");
			String id = rec[0];
			String regulatedEntityId = rec[1];
			String regulatorId = rec[2];

			if (id.equals("DB_ID")) continue; // Skip title

			// Add event to pathway
			//			Gpr.debug("Adding:\tregulatedEntityId: " + regulatedEntityId + "\t" + objectType.get(regulatedEntityId));
			Reaction reaction = (Reaction) entityById.get(regulatedEntityId);
			if (reaction == null) continue; // Reaction not found? Skip

			Entity e = getOrCreateEntity(regulatorId);
			reaction.addRegulator(e, objectType.get(id));

			if (verbose) Gpr.showMark(i++, SHOW_EVERY);
		}

		if (verbose) System.err.println("");
		if (verbose) Timer.showStdErr("Total regulations assigned: " + (i - 1));
	}

	/**
	 * Reset all nodes in the circuit
	 */
	public void reset() {
		for (Entity e : this)
			e.reset();
	}

	/**
	 * Run all experiments on gtex
	 * @param gtex
	 * @return
	 */
	public boolean run(Gtex gtex, String nameMatch) {
		boolean ok = true;

		for (GtexExperiment gtexExperiment : gtex) {
			if ((gtexExperiment.size() > 0) // Do we have data for this experiment?
					&& ((nameMatch == null) || gtexExperiment.getTissueTypeDetail().toLowerCase().indexOf(nameMatch.toLowerCase()) >= 0) // Does the name match (if any)
			) run(gtexExperiment);
		}

		return ok;
	}

	/**
	 * Run some simulations
	 * @param gtex
	 * @param gtexExperiment
	 */
	public boolean run(GtexExperiment gtexExperiment) {
		// Initialize 
		if (monitor == null) monitor = createMonitor(); // Create monitor if needed
		reset(); // Reset previous values
		setInputs(gtexExperiment); // Set input nodes (fixed outputs from GTEx values)
		scaleWeights(); // Scale weights

		// Calculate circuit
		calc(gtexExperiment);

		// Add results to monitors
		String experimentLabel = gtexExperiment.getTissueTypeDetail();
		if (monitor != null) monitor.addResults(experimentLabel);
		if (monitorTrace != null) monitorTrace.addResults(experimentLabel);

		return true;
	}

	/**
	 * Scale weights
	 */
	void scaleWeights() {
		for (Entity e : this)
			if (e.isReaction()) ((Reaction) e).scaleWeights();
	}

	/**
	 * Set input nodes (fixed outputs from GTEx values)
	 * @param gtex
	 */
	void setInputs(GtexExperiment gtexExperiment) {
		Gtex gtex = gtexExperiment.getGtex();

		for (String gid : gtex.getGeneIds()) {
			List entities = entitiesByGeneId.get(gid);

			if (entities != null) {
				double value = gtexExperiment.getValue(gid);
				if (!Double.isNaN(value)) {
					for (Entity e : entities)
						e.setFixedOutput(value);
				}
			}
		}
	}

	public void setMonitorTrace(Monitor monitorTrace) {
		this.monitorTrace = monitorTrace;
	}

	public void setVerbose(boolean verbose) {
		this.verbose = verbose;
	}

	/**
	 * Simplify: Removes entities that are not reachable from any 'gene' entity
	 */
	void simplifyEntities() {
		if (verbose) Timer.showStdErr("Simplify: Removing unnecesary nodes.");

		//---
		// Select entities to keep or delete
		//---

		// Entities to keep (all other entities will be deleted)
		HashSet keep = new HashSet();

		// Create a set of all genes
		HashSet genes = new HashSet();
		for (List entities : entitiesByGeneId.values())
			genes.addAll(entities);

		// Analyze each entity
		for (Entity e : entityById.values()) {
			// Calculate
			HashSet done = new HashSet();
			e.calc(done);

			// Is any of the calculated entities a gene?
			boolean ok = genes.contains(e);
			for (Entity ee : done)
				ok |= genes.contains(ee);

			// OK? Keep all these entities
			if (ok) keep.addAll(done);
		}

		// Entities to delete
		HashSet toDelete = new HashSet();
		for (Entity e : entityById.values())
			if (!keep.contains(e)) toDelete.add(e);

		//---
		// Delete entities
		//---
		int deleted = 0;
		for (Entity e : toDelete) {
			String id = "" + e.getId();
			if (entityById.remove(id) != null) deleted++;
		}

		// Done
		if (verbose) Timer.showStdErr("Simplify: done." //
				+ "\n\tGenes              : " + genes.size() //
				+ "\n\tEntities deleted   : " + deleted //
				+ "\n\tEntities remaining : " + entityById.size() //
		);
	}

	@Override
	public String toString() {
		CountByType countByType = new CountByType();

		for (Entity e : entityById.values())
			countByType.inc(e.getClass().getSimpleName());

		return countByType.toString();
	}

	/**
	 * Show details
	 * @return
	 */
	public String toStringDetails() {
		StringBuilder sb = new StringBuilder();

		for (Entity e : entityById.values())
			sb.append(e + "\n");

		return sb.toString();
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy