
org.biopax.paxtools.io.gsea.GSEAConverter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gsea-converter Show documentation
Show all versions of gsea-converter Show documentation
Converts data to the GESA (.gmt) format.
The newest version!
package org.biopax.paxtools.io.gsea;
import org.apache.commons.collections15.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.biopax.paxtools.controller.*;
import org.biopax.paxtools.converter.LevelUpgrader;
import org.biopax.paxtools.model.BioPAXLevel;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.normalizer.Resolver;
import org.biopax.paxtools.util.ClassFilterSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.*;
/**
* Converts a BioPAX model to the GMT format (used by GSEA software).
*
* It creates GSEA entries from sequence entity reference xrefs
* in the BioPAX model as follows:
*
* Each entry (row) consists of three columns (tab separated):
* name (we use pathway URI),
* description (e.g. "name: Apoptosis; datasource: reactome; organism: 9606 idtype: uniprot"),
* and the list of identifiers (of the same type). For participants not associated with any pathway,
* "other" is used for the pathway name and uri.
*
* The list may have one or more IDs of the same type per entity reference,
* e.g., UniProt IDs or HGNC Symbols; entity references that do not have any xref of
* given db/id type are ignored. Optionally, if there are less than three protein
* references per entry, it will not be printed.
*
* Notes:
* - to effectively enforce the cross-species checks, 'organism' property of pathways and participants must be set
* to a BioSource element with valid ncbitaxon (taxonomy) unification xref;
* - this assumes that the BioPAX Level3 model was validated and normalized (with BioPAX Validator and Normalizer);
*/
public class GSEAConverter
{
private final static Logger LOG = LoggerFactory.getLogger(GSEAConverter.class);
private String idType;
private final boolean crossSpeciesCheckEnabled;
private Collection allowedOrganisms;
private final boolean skipSubPathways;
private Collection skipSubPathwaysOf;
private boolean skipOutsidePathways = false;
private int minNumOfGenesPerEntry = 1;
/**
* Constructor.
*/
public GSEAConverter()
{
this("", true);
}
/**
* Constructor.
*
* See class declaration for more information.
* @param idType - identifier collection/type name, prefix - either the string value
* of the most of EntityReference's xref.db property values in the BioPAX data,
* e.g., "hgnc.symbol", or "HGNC Symbol", "NCBI Gene", "RefSeq", "UniProt", -
* or the namespace/prefix part of the normalized EntityReference URIs,
* such as 'chebi' in identifiers.org/chebi:ID or bioregistry.io/chebi:ID
* (it depends on actual data; so check/verify before using here).
* @param crossSpeciesCheckEnabled - if true, enforces no cross species participants in output
*/
public GSEAConverter(String idType, boolean crossSpeciesCheckEnabled)
{
this(idType, crossSpeciesCheckEnabled, false);
}
/**
* Constructor.
*
* See class declaration for more information.
* @param idType - identifier collection/type name, prefix - either the string value
* of the most of EntityReference's xref.db property values in the BioPAX data,
* e.g., "hgnc.symbol", or "HGNC Symbol", "NCBI Gene", "RefSeq", "UniProt", -
* or the namespace/prefix part of the normalized EntityReference URIs,
* such as 'chebi' in identifiers.org/chebi:ID or bioregistry.io/chebi:ID
* (it depends on actual data; so check/verify before using here).
* @param crossSpeciesCheckEnabled - if true, enforces no cross species participants in output
* @param skipSubPathways - if true, do not traverse into any sub-pathways to collect entity references
* (useful when a model, such as converted to BioPAX KEGG data, has lots of sub-pathways, loops.)
*/
public GSEAConverter(String idType, boolean crossSpeciesCheckEnabled, boolean skipSubPathways)
{
if(Resolver.isKnownNameOrVariant(idType)) {
this.idType = Resolver.getNamespace(idType).getPrefix().toLowerCase(); //(it must be already lowecase though)
} else {
this.idType = (StringUtils.isNotBlank(idType)) ? idType.toLowerCase() : idType;
}
this.crossSpeciesCheckEnabled = crossSpeciesCheckEnabled;
this.skipSubPathways = skipSubPathways;
this.skipSubPathwaysOf = Collections.emptySet();
}
/**
* Constructor.
*
* See class declaration for more information.
* @param idType identifier collection/type name, prefix - either the string value
* of the most of EntityReference's xref.db property values in the BioPAX data,
* e.g., "hgnc.symbol", or "HGNC Symbol", "NCBI Gene", "RefSeq", "UniProt", -
* or the namespace/prefix part of the normalized EntityReference URIs,
* such as 'chebi' in identifiers.org/chebi:ID or bioregistry.io/chebi:ID
* (it depends on actual data; so check/verify before using here).
* @param crossSpeciesCheckEnabled - if true, enforces no cross species participants in output
* @param skipSubPathwaysOf - do not look inside sub-pathways of pathways of given data sources to collect entity references
* (useful when a model, such as converted to BioPAX KEGG data, has lots of sub-pathways, loops.)
*/
public GSEAConverter(String idType, boolean crossSpeciesCheckEnabled, Collection skipSubPathwaysOf)
{
this(idType, crossSpeciesCheckEnabled, false);
this.skipSubPathwaysOf = (skipSubPathwaysOf == null) ? Collections.emptySet() : skipSubPathwaysOf;
}
public String getIdType() {
return idType;
}
public void setIdType(String idType) {
this.idType = idType;
}
/**
* If true, then only GSEA entries that (genes) correspond to a Pathway
* are printed to the output.
* @return true/false
*/
public boolean isSkipOutsidePathways() {
return skipOutsidePathways;
}
public void setSkipOutsidePathways(boolean skipOutsidePathways) {
this.skipOutsidePathways = skipOutsidePathways;
}
public Collection getAllowedOrganisms() {
return allowedOrganisms;
}
public void setAllowedOrganisms(Set allowedOrganisms) {
this.allowedOrganisms = allowedOrganisms;
}
/**
* If this value is greater than 0, and the number of proteins/genes
* in a gene set is less than that value, then this gene set is to skip
* (no GSEA entry is written).
* @return the min. value
*/
public int getMinNumOfGenesPerEntry() {
return minNumOfGenesPerEntry;
}
public void setMinNumOfGenesPerEntry(int minNumOfGenesPerEntry) {
this.minNumOfGenesPerEntry = minNumOfGenesPerEntry;
}
/**
* Converts model to GSEA (GMT) and writes to out.
* See class declaration for more information.
*
* @param model Model
* @param out output stream to write the result to
* @throws IOException when there's an output stream error
*/
public void writeToGSEA(final Model model, OutputStream out) throws IOException
{
Collection entries = convert(model);
if (entries.size() > 0)
{
Writer writer = new OutputStreamWriter(out);
for (GMTEntry entry : entries) {
if ((minNumOfGenesPerEntry <= 1 && !entry.identifiers().isEmpty())
|| entry.identifiers().size() >= minNumOfGenesPerEntry)
{
writer.write(entry + "\n");
}
}
writer.flush();
}
}
/**
* Creates GSEA entries from the pathways contained in the model.
* @param model Model
* @return a set of GSEA entries
*/
public Collection convert(final Model model)
{
final Collection toReturn = new TreeSet<>(Comparator.comparing(GMTEntry::toString));
Model l3Model;
// convert to level 3 in necessary
if (model.getLevel() == BioPAXLevel.L2)
l3Model = (new LevelUpgrader()).filter(model);
else
l3Model = model;
//a modifiable copy of the set of all PRs in the model -
//after all, it has all the PRs that do not belong to any pathway
final Collection sequenceEntityReferences =
new ArrayList<>(l3Model.getObjects(SequenceEntityReference.class));
final Collection pathways = l3Model.getObjects(Pathway.class);
for (Pathway pathway : pathways)
{
String name = (pathway.getDisplayName() == null) ? pathway.getStandardName() : pathway.getDisplayName();
if(StringUtils.isBlank(name)) {
name = pathway.getUri();
}
final Pathway currentPathway = pathway;
final String currentPathwayName = name;
final boolean ignoreSubPathways =
(!skipSubPathwaysOf.isEmpty() && shareSomeObjects(currentPathway.getDataSource(), skipSubPathwaysOf))
|| skipSubPathways;
LOG.debug("Begin converting " + currentPathwayName + " pathway, uri=" + currentPathway.getUri());
// collect sequence entity references from current pathway
Fetcher fetcher = new Fetcher(SimpleEditorMap.L3, Fetcher.nextStepFilter);
fetcher.setSkipSubPathways(ignoreSubPathways);
Set pathwaySers = fetcher.fetch(currentPathway, SequenceEntityReference.class);
if(!pathwaySers.isEmpty()) {
LOG.debug("For pathway: " + currentPathwayName + " (" + currentPathway.getUri()
+ "), got " + pathwaySers.size() + " sERs; now - grouping by organism...");
Map> orgToPrsMap = organismToProteinRefsMap(pathwaySers);
// create GSEA/GMT entries - one entry per organism (null organism also makes one)
String dataSource = getDataSource(currentPathway.getDataSource());
Collection entries = createGseaEntries(currentPathway.getUri(),
currentPathwayName, dataSource, orgToPrsMap);
if(!entries.isEmpty()) {
toReturn.addAll(entries);
}
sequenceEntityReferences.removeAll(pathwaySers);//keep not processed PRs (can be processed multiple times)
LOG.debug("- collected " + entries.size() + " entries.");
}
}
//when there are no pathways, only empty pathays, pathways w/o PRs, then use all/rest of PRs -
//organize PRs by species (GSEA s/w can handle only same species identifiers in a data row)
if(!sequenceEntityReferences.isEmpty() && !skipOutsidePathways) {
LOG.info("Creating entries for the rest of PRs (outside any pathway)...");
Map> orgToPrsMap =
organismToProteinRefsMap(sequenceEntityReferences);
if(!orgToPrsMap.isEmpty()) {
// create GSEA/GMT entries - one entry per organism (null organism also makes one)
if(model.getUri()==null) {
toReturn.addAll(createGseaEntries(
"other", "other", getDataSource(l3Model.getObjects(Provenance.class)), orgToPrsMap));
} else {
toReturn.addAll(createGseaEntries(model.getUri(), model.getName(),
getDataSource(l3Model.getObjects(Provenance.class)), orgToPrsMap));
}
}
}
return toReturn;
}
private Collection createGseaEntries(
String uri, String name, String dataSource,
Map> orgToPrsMap)
{
// generate GSEA entries for each species
final Collection toReturn = new ArrayList<>();
for (final String org : orgToPrsMap.keySet()) {
if(orgToPrsMap.get(org).size() > 0) {
GMTEntry GMTEntry = new GMTEntry(uri, org, idType,
String.format("name: %s; datasource: %s", name, dataSource));
processEntityReferences(orgToPrsMap.get(org), GMTEntry);
toReturn.add(GMTEntry);
}
}
return toReturn;
}
//warn: there can be many equivalent BioSource objects (same taxonomy id, different URIs)
private Map> organismToProteinRefsMap(
Collection seqErs)
{
Map> map = new HashMap<>();
if(seqErs.isEmpty())
throw new IllegalArgumentException("Empty set");
if (crossSpeciesCheckEnabled) {
for (SequenceEntityReference r : seqErs) {
String key = getOrganismKey(r.getOrganism()); // null org. is ok (key == "")
//collect PRs only from allowed organisms
if(allowedOrganisms==null || allowedOrganisms.isEmpty() || allowedOrganisms.contains(key))
{
Collection sers = map.get(key);
if (sers == null) {
sers = new HashSet<>();
map.put(key, sers);
}
sers.add(r);
}
}
} else {
final Set sers = new HashSet<>();
for (SequenceEntityReference r : seqErs) {
String key = getOrganismKey(r.getOrganism());
//collect PRs only from allowed organisms
if(allowedOrganisms==null || allowedOrganisms.isEmpty() || allowedOrganisms.contains(key))
{
sers.add(r);
}
}
map.put("", sers);
}
return map;
}
void processEntityReferences(Collection sers, GMTEntry targetEntry)
{
for (SequenceEntityReference ser : sers)
{
// process PRs that belong to the same species (as targetEntry's) if crossSpeciesCheckEnabled==true
if (crossSpeciesCheckEnabled && !targetEntry.taxID().equals(getOrganismKey(ser.getOrganism()))) {
continue;
}
if (StringUtils.isNotBlank(idType))
{
int added = 0;
for (Xref aXref : ser.getXref())
{
if(aXref instanceof UnificationXref
&& aXref.getId() != null && aXref.getDb() != null
&& Resolver.isKnownNameOrVariant(aXref.getDb())
&& StringUtils.equalsIgnoreCase(idType, Resolver.getNamespace(aXref.getDb()).getPrefix()))
{
targetEntry.identifiers().add(aXref.getId());
++added;
}
}
if(added == 0) {
for (Xref aXref : ser.getXref())
{
if(aXref instanceof RelationshipXref
&& aXref.getId() != null && aXref.getDb() != null
&& Resolver.isKnownNameOrVariant(aXref.getDb())
&& StringUtils.equalsIgnoreCase(idType, Resolver.getNamespace(aXref.getDb()).getPrefix()))
{
targetEntry.identifiers().add(aXref.getId());
++added;
}
}
}
if(added > 12) {
LOG.info("In GSEA entry: " + targetEntry.taxID() + " " + targetEntry.name() +
", sER " + ser.getUri() + " got " + added + " '" + idType + "' identifiers...");
}
} else {
// fallback - use URI (not really useful for the GSEA software)
targetEntry.identifiers().add(ser.getUri());
}
}
}
/*
* Gets datasource names, if any, in a consistent way/order, excl. duplicates
*/
private String getDataSource(Collection provenances)
{
if(provenances.isEmpty()) return "N/A";
Set dsNames = new TreeSet<>();
for (Provenance provenance : provenances)
{
String name = provenance.getDisplayName();
if(name == null)
name = provenance.getStandardName();
if(name == null && !provenance.getName().isEmpty())
name = provenance.getName().iterator().next();
if (name != null && name.length() > 0)
dsNames.add(name.toLowerCase());
}
return StringUtils.join(dsNames, ";");
}
private String getOrganismKey(BioSource org) {
if (org == null) {
return "";//default: unspecified/all species
}
//find a first taxonomy ux (just ignore other/bad data as we cannot do much here then...):
String key = (new ClassFilterSet<>(org.getXref(), UnificationXref.class))
.stream()
.filter(x -> Resolver.isKnownNameOrVariant(x.getDb()) && "ncbitaxon".equals(Resolver.getNamespace(x.getDb()).getPrefix()))
.map(Xref::getId)
.findFirst().orElse("");
//when there are no Taxonomy xrefs - use a name
if(key.isEmpty()) {
if (org.getStandardName()!=null) {
key = org.getStandardName();
} else if(org.getDisplayName()!=null) {
key = org.getDisplayName();
} else if(!org.getName().isEmpty()) {
key = org.getName().iterator().next();
}
}
return key.toLowerCase();
}
private boolean shareSomeObjects(Collection> setA, Collection> setB) {
return (!setA.isEmpty() && !setB.isEmpty()) ? !CollectionUtils.intersection(setA, setB).isEmpty() : false;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy