All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biopax.paxtools.io.gsea.GMTConverter Maven / Gradle / Ivy

The newest version!
package org.biopax.paxtools.io.gsea;

import org.apache.commons.lang3.StringUtils;
import org.biopax.paxtools.controller.*;
import org.biopax.paxtools.converter.LevelUpgrader;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.BioPAXLevel;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.normalizer.ConfigurableIDFetcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.*;

/**
 * An experimental/advanced BioPAX to GMT converter
 * that can output the desired type IDs for both genetic elements and chemicals
 * (the output file can be then loaded with GSEA software if gene/protein IDs were used).
 *
 * @see GSEAConverter
 *
 * TODO: finish, test, make it public...
 */
final class GMTConverter {
	private final static Logger LOG = LoggerFactory.getLogger(GMTConverter.class);

	private final IDFetcher idFetcher;
	private boolean skipSubPathways;
	private boolean skipOutsidePathways;
	private int minNumIdsPerEntry;

	public GMTConverter()
	{
		idFetcher = new ConfigurableIDFetcher().chemDbStartsWithOrEquals("chebi")
			.seqDbStartsWithOrEquals("hgnc.symbol") //the order in the list does matter
			.seqDbStartsWithOrEquals("hgnc symbol")
			.seqDbStartsWithOrEquals("hgnc");
		skipSubPathways = true;
		minNumIdsPerEntry = 1;
		skipOutsidePathways = false;
	}

	public boolean isSkipSubPathways() {
		return skipSubPathways;
	}

	public void setSkipSubPathways(boolean skipSubPathways) {
		this.skipSubPathways = skipSubPathways;
	}

	/**
	 * If true, then only GMT entries that (genes) correspond to a Pathway
	 * are printed to the output.
	 * @return true/false
     */
	public boolean isSkipOutsidePathways() {
		return skipOutsidePathways;
	}

	public void setSkipOutsidePathways(boolean skipOutsidePathways) {
		this.skipOutsidePathways = skipOutsidePathways;
	}

	/**
	 * If this value is greater than 0, and the number of proteins/genes
	 * in a gene set is less than that value, then this gene set is to skip
	 * (no GMT entry is written).
	 * @return the min. value
     */
	public int getMinNumIdsPerEntry() {
		return minNumIdsPerEntry;
	}

	public void setMinNumIdsPerEntry(int minNumIdsPerEntry) {
		this.minNumIdsPerEntry = minNumIdsPerEntry;
	}

	/**
	 * Converts model to GMT and writes to out.
	 * See class declaration for more information.
	 *
	 * @param model Model
	 * @param out output stream to write the result to
	 * @throws IOException when there's an output stream error
	 */
	public void writeToGSEA(final Model model, OutputStream out) throws IOException
	{
		Collection entries = convert(model);
		if (entries.size() > 0)
		{
			Writer writer = new OutputStreamWriter(out);
			for (GMTEntry entry : entries) {
				if ((minNumIdsPerEntry <= 1 && !entry.identifiers().isEmpty())
						|| entry.identifiers().size() >= minNumIdsPerEntry)
				{
					writer.write(entry + "\n");
				}
			}
			writer.flush();
		}
	}

	/**
	 * Creates GMT entries from the pathways contained in the model.
	 * @param model Model
	 * @return a set of GMT entries
	 */
	public Collection convert(final Model model)
	{
		final Collection toReturn = new TreeSet<>(
			Comparator.comparing(GMTEntry::toString)
		);

		Model l3Model;
		// convert to level 3 in necessary
		if (model.getLevel() == BioPAXLevel.L2)
			l3Model = (new LevelUpgrader()).filter(model);
		else
			l3Model = model;

		//a modifiable copy of the set of all PRs in the model -
		//after all, it has all the ERs that do not belong to any pathway
		final Set entityReferences =
				new HashSet<>(l3Model.getObjects(EntityReference.class));

		final Collection pathways = l3Model.getObjects(Pathway.class);
		for (Pathway pathway : pathways)
		{
			String name = (pathway.getDisplayName() == null) ? pathway.getStandardName() : pathway.getDisplayName();
			if(name == null || name.isEmpty())
				name = pathway.getUri();

			final Pathway currentPathway = pathway;
			final String currentPathwayName = name;

			LOG.debug("Begin converting " + currentPathwayName + " pathway, uri=" + currentPathway.getUri());
			final Set ers = new HashSet<>();
			final Traverser traverser = new AbstractTraverser(SimpleEditorMap.L3,
					Fetcher.nextStepFilter, Fetcher.objectPropertiesOnlyFilter) {
				@Override
				protected void visit(Object range, BioPAXElement domain, Model model, PropertyEditor editor)
				{
					BioPAXElement bpe = (BioPAXElement) range; //cast is safe (due to objectPropertiesOnlyFilter)
					if(bpe instanceof EntityReference) {
						ers.add((EntityReference) bpe);
					}
					if(bpe instanceof Pathway) {
						if(skipSubPathways)
						{	//do not traverse into the sub-pathway; log
							LOG.debug("Skipping sub-pathway: " + bpe.getUri());
						} else {
							traverse(bpe, model);
						}
					} else {
						traverse(bpe, model);
					}
				}
			};
			//run it - collect all PRs from the pathway
			traverser.traverse(currentPathway, null);

			if(!ers.isEmpty()) {
				LOG.debug("For pathway: " + currentPathwayName + " (" + currentPathway.getUri()
						+ "), got " + ers.size() + " ERs");
				// create GMT entries
				Collection entries = createGseaEntries(currentPathway.getUri(),
						currentPathwayName, getDataSource(currentPathway.getDataSource()), ers);
				if(!entries.isEmpty())
					toReturn.addAll(entries);
				entityReferences.removeAll(ers);//keep not processed PRs (a PR can be processed multiple times)
				LOG.debug("- collected " + entries.size() + " entries.");
			}
		}

		//when there are no pathways, only empty, or pathways without any PRs, then use the rest of PRs -
		//organize PRs by species (GSEA s/w can handle only same species identifiers in a data row)
		if(!entityReferences.isEmpty() && !skipOutsidePathways) {
			LOG.info("Creating entries for the rest of PRs (outside any pathway)...");
			toReturn.addAll(createGseaEntries("other","other",
				getDataSource(l3Model.getObjects(Provenance.class)), entityReferences));
		}

		return toReturn;
	}

	private Collection createGseaEntries(String uri, String name,
																								 String dataSource,
																								 Collection ers)
	{
		Collection toReturn = new ArrayList<>();
		GMTEntry entry = new GMTEntry(uri, "", "",
			String.format("name: %s; datasource: %s", name, dataSource));
		for (EntityReference er : ers) {
			entry.identifiers().addAll(idFetcher.fetchID(er));
		}
		toReturn.add(entry);
		return toReturn;
	}

	/*
	 * Gets datasource names, if any, in a consistent way/order, excl. duplicates
	 */
	private String getDataSource(Collection provenances)
	{
		if(provenances.isEmpty()) return "N/A";
		
		Set dsNames = new TreeSet<>();
		for (Provenance provenance : provenances)
		{
			String name = provenance.getDisplayName();
			if(name == null) 
				name = provenance.getStandardName();
			if(name == null && !provenance.getName().isEmpty()) 
				name = provenance.getName().iterator().next();
			if (name != null && name.length() > 0)
				dsNames.add(name.toLowerCase());
		}
		
		return StringUtils.join(dsNames, ";");
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy