All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.snpeff.geneOntology.GoTerms Maven / Gradle / Ivy

The newest version!
package org.snpeff.geneOntology;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.snpeff.util.Gpr;

/**
 * A collection of GO terms
 * 
 * @author Pablo Cingolani
 *
 */
@SuppressWarnings("serial")
public class GoTerms implements Iterable, Serializable {

	public static boolean debug = false; // Debug mode for this class?
	public static boolean verbose = false; // Verbose mode for this class?
	static int warnCount = 0;

	String label; // Label for this set of nodes 
	int maxRank;
	String nameSpace;
	HashMap goTermsByGoTermAcc; // Go terms indexed by GoTermName (all of them for a given ontology)
	HashMap> goTermsBySymbolId; // Go terms indexed by geneId
	HashSet interestingSymbolIdsSet; // Interesting symbols for this experiment
	HashMap rankSymbolId;

	/**
	 * Default constructor
	 */
	public GoTerms() {
		goTermsByGoTermAcc = new HashMap(); // All go terms are hashed here (by GoTermName)
		goTermsBySymbolId = new HashMap>(); // All go terms are hashed here (by symbolId)
		interestingSymbolIdsSet = new HashSet();
		rankSymbolId = new HashMap();
		maxRank = 0;
	}

	/**
	 * Constructor
	 * @param oboFile : Path to OBO description file
	 * @param nameSpace: Can be 'null' for "all namespaces"
	 * @param interestingGenesFile : Path to a file containing a list of 'interesting' genes (one geneName per line)
	 * @param geneAssocFile : A file containing lines like: "GOterm \t gene_product_id \t gene_name \n"
	 */
	public GoTerms(String oboFile, String nameSpace, String interestingGenesFile, String geneAssocFile, boolean removeObsolete, boolean useGeneId) {
		goTermsByGoTermAcc = new HashMap(); // All go terms are hashed here (by GoTermName)
		goTermsBySymbolId = new HashMap>(); // All go terms are hashed here (by GeneId)
		interestingSymbolIdsSet = new HashSet();
		this.nameSpace = nameSpace;
		maxRank = 0;

		// Read go <-> genes file
		if( oboFile != null ) readOboFile(oboFile, removeObsolete);
		if( geneAssocFile != null ) readGeneAssocFile(geneAssocFile, useGeneId);
		if( interestingGenesFile != null ) readInterestingSymbolIdsFile(interestingGenesFile);
	}

	/**
	 * Add a GOTerm (if not already in this GOTerms)
	 * WARNING: Creates 'fake' symbolNames based on symbolIds. This method is used mostly for testing / debugging
	 */
	public GoTerm add(GoTerm goTerm) {
		goTermsByGoTermAcc.put(goTerm.getAcc(), goTerm);

		for( String symbolId : goTerm.getSymbolIdSet() )
			addSymbolId(goTerm, symbolId);

		return goTerm;
	}

	/**
	 * Add a symbol as 'interesting' symbol (to every corresponding GOTerm in this set)
	 * @param symbolId : Symbol's ID number
	 */
	private void addInterestingSymbol(String symbolId, HashSet noGoTermFound) {
		// Add to list of 'interesting symbol'
		interestingSymbolIdsSet.add(symbolId);

		// Find GOTerms associated with this 'interesting symbol' (and add this symbols as 'interesting')
		Set goTerms = getGoTermsBySymbolId(symbolId);
		if( goTerms != null ) {
			for( GoTerm gt : goTerms )
				gt.addInterestingSymbolId(symbolId);
		} else if( noGoTermFound != null ) noGoTermFound.add(symbolId);
		else Gpr.debug("No GOTerms related to SymbolId:" + symbolId + " were found in this DAG (" + label + ")");
	}

	/**
	 * Add a symbol as 'interesting' symbol (to every corresponding GOTerm in this set)
	 * @param symbolName : Symbol's name
	 * @param rank : symbol's rank
	 * @param noGoTermFound : Add symbol here if there are no GOTerms associated with this symbol
	 * @return 0 if 'ok', 1 if symbol's ID is not found
	 */
	public void addInterestingSymbol(String symbolId, int rank, HashSet noGoTermFound) {
		rankSymbolId.put(symbolId, rank); // Add symbol -> rank pair 
		addInterestingSymbol(symbolId, noGoTermFound); // Add symbol
		if( maxRank < rank ) maxRank = rank;
	}

	/**
	 * Add a symbolId (as well as all needed mappings)
	 * 
	 * @param goTermAcc
	 * @param symbolId
	 * @param symbolName
	 * @param goTermType
	 * @param description
	 * 
	 * @return true if OK, false on error (GOTerm 'goTermAcc' not found)
	 */
	public boolean addSymbolId(GoTerm goTerm, String symbolId) {
		// Add symbolId to GOTerm
		goTerm.addSymbolId(symbolId);

		// Add symbolId -> Set mapping (a symbolId identifies a set of goTerms)
		Set termsSet = goTermsBySymbolId.get(symbolId);
		if( termsSet == null ) { // Need to create a new list?
			termsSet = new HashSet();
			goTermsBySymbolId.put(symbolId, termsSet);
		}
		termsSet.add(goTerm);

		// OK
		return true;
	}

	/**
	 * Use symbols for chids in DAG
	 * For every GOTerm, each child's symbols are added to the term
	 * so that root term contains every symbol and every interestingSymbol
	 */
	public void addSymbolsFromChilds() {
		for( GoTerm gt : this )
			gt.addSymbolsFromChilds(gt);
	}

	/**
	 * Create a set with all the symbols
	 */
	public Set allSymbols() {
		HashSet syms = new HashSet();
		for( GoTerm gt : this )
			syms.addAll(gt.getSymbolIdSet());
		return syms;
	}

	/**
	 * Checks that every symboolID is in the set (as 'interesting' symbols)
	 * @param interestingSymbolIds : A set of interesting symbols
	 * Throws an exception on error
	 */
	public void checkInterestingSymbolIds(Set interestingSymbolIds) {
		HashSet issc = new HashSet(); // Interesting symbols (sanity check)

		if( debug ) Gpr.debug("Checking symbols (" + interestingSymbolIds.size() + ") : " + interestingSymbolIds);

		// Add every interesting symbol in the DAG
		for( GoTerm gt : this )
			issc.addAll(gt.getInterestingSymbolIdSet());

		// Check that symbolIds contains issc
		if( !interestingSymbolIds.containsAll(issc) ) throw new RuntimeException("Not every term in symbolIds is marked in DAG:" + label + " as an interesting symbol");

		// Check that every interesting symbol in DAG is from symbolIds
		if( !issc.containsAll(interestingSymbolIds) ) throw new RuntimeException("Not every term marked as interesting in DAG " + label + " is from symbolIds\n\tInteresting symbols(" + issc.size() + "): " + issc + "\n\tsymbolIds(" + interestingSymbolIds.size() + "): " + interestingSymbolIds);

		// Are ranks being used? => Check them
		if( (rankSymbolId != null) && (rankSymbolId.keySet().size() > 0) ) {
			int maxRank = interestingSymbolIds.size();

			// Check that every rank is being used
			int ranksUsed[] = new int[maxRank + 1];
			for( int i = 0; i < maxRank; i++ )
				ranksUsed[i] = 0;

			// Check that every interestingSymbolId is ranked (if ranks are being used)
			for( String symbolId : interestingSymbolIds ) {
				Integer rank = rankSymbolId.get(symbolId);
				if( (rank == null) || (rank <= 0) || (rank > maxRank) ) throw new RuntimeException("Invalid rank for symbolId:" + symbolId + ", rank:" + rank + "(should be [1," + maxRank + "]");
				ranksUsed[rank]++;
			}

			for( int rank = 1; rank < maxRank; rank++ )
				if( ranksUsed[rank] != 1 ) throw new RuntimeException("Rank number " + rank + " is used " + ranksUsed[rank] + " times (should be used exactly 1 time)");
		}
	}

	/**
	 * Produce a GOTerm based on a list of GOTerms and a 'mask'
	 * 
	 * @param goTermList : A list of GOTerms
	 * @param activeSets : An integer (binary mask) that specifies weather a set in the list should be taken into account or not. The operation performed is: 
	 * 
	 * 		Intersection{ GOTerms where mask_bit == 1 } - Union{ GOTerms where mask_bit == 0 } )
	 * 
	 * where the minus sign '-' is actually a 'set minus' operation. This operation is done for both sets
	 * in GOTerm (i.e. symbolIds and interestingSymbolIds)
	 * 
	 * @return A GOTerm
	 */
	public GoTerm disjointSet(List goTermList, int activeSets) {
		//---
		// Produce intersections (for each term in the list)
		//---
		GoTerm gtUnion = new GoTerm("UNION", null, null, null);
		GoTerm gtIntersect = new GoTerm("INTERSECTION", null, null, null);

		int i = 0;
		boolean firstIntersection = true;
		for( GoTerm goTerm : goTermList ) {
			// Extract the i_th bit from 'activeSets'
			boolean biti = (activeSets & (1L << i)) > 0;

			if( biti ) { // Is this bit is 1? => perform an intersection
				if( firstIntersection ) { // Initialize intersection set (otherwise all intersections are empty)
					gtIntersect.union(goTerm);
					firstIntersection = false;
				} else {
					gtIntersect.intersection(goTerm);
					// Are we done? (if the intersection set is empty, it doesn't make any sense to continue
					if( gtIntersect.getTotalCount() <= 0 ) return gtIntersect;
				}
			} else gtUnion.union(goTerm);
			i++;
		}

		// Now extract the 'union' set from the intersection set (i.e. perform a 'set minus' operation)
		gtIntersect.setMinus(gtUnion);

		return gtIntersect;
	}

	/**
	 * Find a GoTerm or create it (if not found)
	 * @param symbolId
	 * @return
	 */
	GoTerm findOrCreate(String symbolId) {
		GoTerm gt = getGoTerm(symbolId);
		if( gt == null ) gt = new GoTerm(symbolId, this, nameSpace, "");
		goTermsByGoTermAcc.put(gt.getAcc(), gt);
		return gt;
	}

	public GoTerm getGoTerm(String goTermAcc) {
		return goTermsByGoTermAcc.get(goTermAcc);
	}

	public HashMap getGoTermsByGoTermAcc() {
		return goTermsByGoTermAcc;
	}

	public HashMap> getGoTermsBySymbolId() {
		return goTermsBySymbolId;
	}

	public Set getGoTermsBySymbolId(String symbolId) {
		return goTermsBySymbolId.get(symbolId);
	}

	public HashSet getInterestingSymbolIdsSet() {
		return interestingSymbolIdsSet;
	}

	public int getInterestingSymbolIdsSize() {
		return interestingSymbolIdsSet.size();
	}

	public String getLabel() {
		return label;
	}

	public int getMaxRank() {
		return maxRank;
	}

	public String getNameSpace() {
		return nameSpace;
	}

	/**
	 * Get symbol's rank
	 * @param symbolId
	 * @return
	 */
	public int getRank(String symbolId) {
		Integer rank = rankSymbolId.get(symbolId);
		if( rank == null ) return 0;
		return rank;
	}

	public HashMap getRankSymbolId() {
		return rankSymbolId;
	}

	/**
	 * Iterate through each GOterm in this GOTerms
	 */
	@Override
	public Iterator iterator() {
		return goTermsByGoTermAcc.values().iterator();
	}

	public Set keySet() {
		return goTermsByGoTermAcc.keySet();
	}

	/**
	 * Calculate each node's level (in DAG)
	 * @return maximum level
	 */
	public int levels() {
		int maxLevel = 0;
		for( GoTerm gt : this )
			maxLevel = Math.max(maxLevel, gt.getLevel());
		return maxLevel;
	}

	/**
	 * Select a number of GOTerms
	 * @param numberToSelect
	 * @return
	 */
	public List listTopTerms(int numberToSelect) {
		LinkedList list = new LinkedList();

		// Create a list of terms (to be ordered)
		LinkedList ll = new LinkedList();
		for( String go : goTermsByGoTermAcc.keySet() )
			ll.add(goTermsByGoTermAcc.get(go));
		Collections.sort(ll);

		int i = 0;
		for( GoTerm goTerm : ll )
			if( i++ < numberToSelect ) list.add(goTerm);
			else break;

		return list;
	}

	/**
	 * Calculate how many interesting symbol-IDs in are there in all these GOTerms
	 * @return Number of interesting symbols
	 */
	public int numberOfInterestingSymbols() {
		HashSet intSym = new HashSet();
		for( GoTerm gt : this )
			intSym.addAll(gt.getInterestingSymbolIdSet());
		return intSym.size();
	}

	/**
	 * Number of nodes in this DAG
	 * @return
	 */
	public int numberOfNodes() {
		return goTermsByGoTermAcc.keySet().size();
	}

	/**
	 * Calculate the number of nodes in that have at least one interesting symbol
	 * @return
	 */
	public int numberOfNodesWithOneInterestingSymbol() {
		int num = 0;
		for( GoTerm gt : this )
			if( gt.getInterestingSymbolIdsSize() >= 1 ) num++;
		return num;
	}

	/**
	 * Calculate the number of nodes in that have at least one annotated symbol
	 * @return
	 */
	public int numberOfNodesWithOneSymbol() {
		int num = 0;
		for( GoTerm gt : this )
			if( gt.getTotalCount() >= 1 ) num++;
		return num;
	}

	/**
	 * Calculate how many symbol-IDs in are there in all these GOTerms
	 * @return Number of interesting symbols
	 */
	public int numberOfSymbols() {
		HashSet syms = new HashSet();
		for( GoTerm gt : this )
			syms.addAll(gt.getSymbolIdSet());
		return syms.size();
	}

	/**
	 * Reads a file containing every gene (names and ids) associated GO terms
	 * @param goGenesFile : A file containing gene associations to GO terms
	 */
	public void readGeneAssocFile(String goGenesFile, boolean useGeneId) {
		try {
			System.err.println("Reading gene association file: '" + goGenesFile + "'");

			HashSet notFound = new HashSet();

			// Open file and initialize buffers
			BufferedReader inFile = Gpr.reader(goGenesFile);
			String line;
			int lineNum;

			// Read each line
			for( lineNum = 1; (line = inFile.readLine()) != null; lineNum++ ) {
				if( !line.startsWith("!") ) {
					String items[] = line.split("\t");
					if( items.length > 4 ) {
						String geneName = useGeneId ? items[1] : items[2]; // Use geneID or geneName?
						String goTermAcc = items[4];

						// Add mappings
						GoTerm goTerm = getGoTerm(goTermAcc);
						if( goTerm == null ) notFound.add(goTermAcc);
						else addSymbolId(goTerm, geneName);
					} else System.err.println("Ignoring line " + lineNum + ": '" + line + "'");
				}
			}

			// OK, finished
			inFile.close();

			// Show errors if any
			if( notFound.size() > 0 ) {
				LinkedList ll = new LinkedList(notFound);
				Collections.sort(ll);
				System.err.println("WARNING: Couldn't find some GOTerms while reading file '" + goGenesFile + "'\n\tNot found (" + notFound.size() + ") : " + ll);
			}

			if( verbose ) System.err.println("Finished reding GoGenes file '" + goGenesFile + "' : " + lineNum + " lines.");
		} catch(IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Reads a file with a list of 'interesting' genes (one per line)
	 * @param fileName : Can be "-" for no-file
	 * @return
	 */
	public void readInterestingSymbolIdsFile(String fileName) {
		System.err.println("Reading 'interesting' genes from file: '" + fileName + "'");

		// First: Reset 'interesting' terms
		resetInterestingSymbolIds();
		HashSet noGoTermFound = new HashSet();
		HashSet symbolNotFound = new HashSet();

		if( fileName.equals("-") ) return;

		// Read file
		int lineNum;
		try {
			// Open file and initialize buffers
			BufferedReader inFile = new BufferedReader(new FileReader(fileName));
			String line;

			// Read each line
			for( lineNum = 1; (line = inFile.readLine()) != null; lineNum++ ) {
				String symbolName = line.trim();
				addInterestingSymbol(symbolName, lineNum, noGoTermFound);
			}

			// Ok, finished
			inFile.close();
		} catch(IOException e) {
			throw new RuntimeException(e);
		}

		// Any missing value?
		if( (symbolNotFound.size() > 0) || (noGoTermFound.size() > 0) ) {
			LinkedList ngtfs = new LinkedList(noGoTermFound);
			Collections.sort(ngtfs);
			System.err.println("WARNING: There were some missing values while reading interesting symbolIds file: '" + fileName + "'" + "\n\tGenes's list size: " + lineNum + "\n\tFound: " + getInterestingSymbolIdsSize() + "\n\tNo GOTerm found for symbolIds (" + ngtfs.size() + "): " + ngtfs);
		}
	}

	/**
	 * Read an OBO file
	 * 
	 * @param oboFile
	 * @param nameSpace
	 */
	public void readOboFile(String oboFile, boolean removeObsolete) {
		try {
			// Open file and initialize buffers
			BufferedReader inFile = Gpr.reader(oboFile);
			String line;
			int lineNum;
			int found = 0, removed = 0;
			String goTermAcc = null, isa = null;
			GoTerm goTerm = null;
			HashSet goNotFound = new HashSet();

			// Read each line
			for( lineNum = 0; (line = inFile.readLine()) != null; lineNum++ ) {
				// Get term's accesion
				if( line.startsWith("id: ") ) {
					goTermAcc = line.substring(4);
					if( goTermAcc.startsWith("GO:") ) goTerm = findOrCreate(goTermAcc);
					else goTerm = null;
				} else if( goTerm != null ) {
					if( line.startsWith("namespace: ") ) { // Read namespace
						String termNameSpace = line.substring(11);
						goTerm.setNameSpace(termNameSpace);
						// Different nameSpaces? => Ignore GOTerm
						if( (nameSpace != null) && (!nameSpace.equals(termNameSpace)) ) goTermAcc = null;
					} else if( line.startsWith("name: ") ) { // GO name
						goTerm.setSescription(line.substring(6).trim());
					} else if( line.startsWith("is_obsolete: ") && removeObsolete ) { // Is this an obsolete term? => ignore it
						removeGOTerm(goTermAcc);
						removed++;
					} else if( (goTermAcc != null) && (line.startsWith("is_a: ")) ) { // Add childs & parents as needed
						isa = line.substring(6, 16);
						GoTerm parent = findOrCreate(isa); // Add this as child
						parent.addChild(goTerm);
					}
				}
			}

			// OK, finished
			inFile.close();

			// Show errors (if any)
			if( goNotFound.size() > 0 ) {
				LinkedList ll = new LinkedList(goNotFound);
				Collections.sort(ll);
				System.err.println("WARNING: Some GO-Terms were not found while loading OBO file \'" + oboFile + "\':\n\tNot found: " + goNotFound.size() + "\n\tFound:" + found + "\n\tNot found GOTerms: " + ll);
			}
		} catch(IOException e) {
			throw new RuntimeException(e);
		}

	}

	/**
	 * Remove a GOTerm
	 */
	public void removeGOTerm(String goTermAcc) {
		goTermsByGoTermAcc.remove(goTermAcc);
	}

	/**
	 * Reset every 'interesting' symbolId (on every single GOTerm in this GOTerms)
	 */
	public void resetInterestingSymbolIds() {
		maxRank = 0;
		for( GoTerm gt : this )
			gt.resetInterestingSymbolIdSet();
	}

	public Set rootNodes() {
		HashSet roots = new HashSet();

		for( GoTerm gt : this )
			roots.add(gt.rootNode());

		return roots;
	}

	/**
	 * Save gene sets file for GSEA analysis
	 * Format specification: http://www.broad.mit.edu/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29
	 * 
	 * @param fileName
	 */
	public void saveGseaGeneSets(String fileName) {
		// Create a string with all the data
		StringBuffer out = new StringBuffer();
		for( GoTerm gt : this ) {
			// Save GOTerm that have at least 1 symbolId
			if( gt.getSymbolIdSet().size() > 0 ) {
				out.append(gt.getAcc() + "\t" + gt.getDescription() + "\t");

				// Add all symbols for this GOTerm
				for( String symbolId : gt.getSymbolIdSet() )
					out.append(symbolId + "\t");
				out.append("\n");
			}
		}

		// Save it 
		Gpr.toFile(fileName, out);
	}

	public void setLabel(String label) {
		this.label = label;
	}

	@Override
	public String toString() {
		StringBuffer stb = new StringBuffer();

		// Create a list of terms (to be ordered)
		LinkedList ll = new LinkedList();
		for( String go : goTermsByGoTermAcc.keySet() )
			ll.add(goTermsByGoTermAcc.get(go));
		Collections.sort(ll);

		// Append each term
		for( GoTerm goTerm : ll )
			stb.append(goTerm.toStringAll() + "\n");

		return stb.toString();
	}

	public Collection values() {
		return goTermsByGoTermAcc.values();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy