
org.biopax.paxtools.io.gsea.GMTConverter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gsea-converter Show documentation
Show all versions of gsea-converter Show documentation
Converts data to the GESA (.gmt) format.
The newest version!
package org.biopax.paxtools.io.gsea;
import org.apache.commons.lang3.StringUtils;
import org.biopax.paxtools.controller.*;
import org.biopax.paxtools.converter.LevelUpgrader;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.BioPAXLevel;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.normalizer.ConfigurableIDFetcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.*;
/**
* An experimental/advanced BioPAX to GMT converter
* that can output the desired type IDs for both genetic elements and chemicals
* (the output file can be then loaded with GSEA software if gene/protein IDs were used).
*
* @see GSEAConverter
*
* TODO: finish, test, make it public...
*/
final class GMTConverter {
private final static Logger LOG = LoggerFactory.getLogger(GMTConverter.class);
private final IDFetcher idFetcher;
private boolean skipSubPathways;
private boolean skipOutsidePathways;
private int minNumIdsPerEntry;
public GMTConverter()
{
idFetcher = new ConfigurableIDFetcher().chemDbStartsWithOrEquals("chebi")
.seqDbStartsWithOrEquals("hgnc.symbol") //the order in the list does matter
.seqDbStartsWithOrEquals("hgnc symbol")
.seqDbStartsWithOrEquals("hgnc");
skipSubPathways = true;
minNumIdsPerEntry = 1;
skipOutsidePathways = false;
}
public boolean isSkipSubPathways() {
return skipSubPathways;
}
public void setSkipSubPathways(boolean skipSubPathways) {
this.skipSubPathways = skipSubPathways;
}
/**
* If true, then only GMT entries that (genes) correspond to a Pathway
* are printed to the output.
* @return true/false
*/
public boolean isSkipOutsidePathways() {
return skipOutsidePathways;
}
public void setSkipOutsidePathways(boolean skipOutsidePathways) {
this.skipOutsidePathways = skipOutsidePathways;
}
/**
* If this value is greater than 0, and the number of proteins/genes
* in a gene set is less than that value, then this gene set is to skip
* (no GMT entry is written).
* @return the min. value
*/
public int getMinNumIdsPerEntry() {
return minNumIdsPerEntry;
}
public void setMinNumIdsPerEntry(int minNumIdsPerEntry) {
this.minNumIdsPerEntry = minNumIdsPerEntry;
}
/**
* Converts model to GMT and writes to out.
* See class declaration for more information.
*
* @param model Model
* @param out output stream to write the result to
* @throws IOException when there's an output stream error
*/
public void writeToGSEA(final Model model, OutputStream out) throws IOException
{
Collection entries = convert(model);
if (entries.size() > 0)
{
Writer writer = new OutputStreamWriter(out);
for (GMTEntry entry : entries) {
if ((minNumIdsPerEntry <= 1 && !entry.identifiers().isEmpty())
|| entry.identifiers().size() >= minNumIdsPerEntry)
{
writer.write(entry + "\n");
}
}
writer.flush();
}
}
/**
* Creates GMT entries from the pathways contained in the model.
* @param model Model
* @return a set of GMT entries
*/
public Collection convert(final Model model)
{
final Collection toReturn = new TreeSet<>(
Comparator.comparing(GMTEntry::toString)
);
Model l3Model;
// convert to level 3 in necessary
if (model.getLevel() == BioPAXLevel.L2)
l3Model = (new LevelUpgrader()).filter(model);
else
l3Model = model;
//a modifiable copy of the set of all PRs in the model -
//after all, it has all the ERs that do not belong to any pathway
final Set entityReferences =
new HashSet<>(l3Model.getObjects(EntityReference.class));
final Collection pathways = l3Model.getObjects(Pathway.class);
for (Pathway pathway : pathways)
{
String name = (pathway.getDisplayName() == null) ? pathway.getStandardName() : pathway.getDisplayName();
if(name == null || name.isEmpty())
name = pathway.getUri();
final Pathway currentPathway = pathway;
final String currentPathwayName = name;
LOG.debug("Begin converting " + currentPathwayName + " pathway, uri=" + currentPathway.getUri());
final Set ers = new HashSet<>();
final Traverser traverser = new AbstractTraverser(SimpleEditorMap.L3,
Fetcher.nextStepFilter, Fetcher.objectPropertiesOnlyFilter) {
@Override
protected void visit(Object range, BioPAXElement domain, Model model, PropertyEditor editor)
{
BioPAXElement bpe = (BioPAXElement) range; //cast is safe (due to objectPropertiesOnlyFilter)
if(bpe instanceof EntityReference) {
ers.add((EntityReference) bpe);
}
if(bpe instanceof Pathway) {
if(skipSubPathways)
{ //do not traverse into the sub-pathway; log
LOG.debug("Skipping sub-pathway: " + bpe.getUri());
} else {
traverse(bpe, model);
}
} else {
traverse(bpe, model);
}
}
};
//run it - collect all PRs from the pathway
traverser.traverse(currentPathway, null);
if(!ers.isEmpty()) {
LOG.debug("For pathway: " + currentPathwayName + " (" + currentPathway.getUri()
+ "), got " + ers.size() + " ERs");
// create GMT entries
Collection entries = createGseaEntries(currentPathway.getUri(),
currentPathwayName, getDataSource(currentPathway.getDataSource()), ers);
if(!entries.isEmpty())
toReturn.addAll(entries);
entityReferences.removeAll(ers);//keep not processed PRs (a PR can be processed multiple times)
LOG.debug("- collected " + entries.size() + " entries.");
}
}
//when there are no pathways, only empty, or pathways without any PRs, then use the rest of PRs -
//organize PRs by species (GSEA s/w can handle only same species identifiers in a data row)
if(!entityReferences.isEmpty() && !skipOutsidePathways) {
LOG.info("Creating entries for the rest of PRs (outside any pathway)...");
toReturn.addAll(createGseaEntries("other","other",
getDataSource(l3Model.getObjects(Provenance.class)), entityReferences));
}
return toReturn;
}
private Collection createGseaEntries(String uri, String name,
String dataSource,
Collection ers)
{
Collection toReturn = new ArrayList<>();
GMTEntry entry = new GMTEntry(uri, "", "",
String.format("name: %s; datasource: %s", name, dataSource));
for (EntityReference er : ers) {
entry.identifiers().addAll(idFetcher.fetchID(er));
}
toReturn.add(entry);
return toReturn;
}
/*
* Gets datasource names, if any, in a consistent way/order, excl. duplicates
*/
private String getDataSource(Collection provenances)
{
if(provenances.isEmpty()) return "N/A";
Set dsNames = new TreeSet<>();
for (Provenance provenance : provenances)
{
String name = provenance.getDisplayName();
if(name == null)
name = provenance.getStandardName();
if(name == null && !provenance.getName().isEmpty())
name = provenance.getName().iterator().next();
if (name != null && name.length() > 0)
dsNames.add(name.toLowerCase());
}
return StringUtils.join(dsNames, ";");
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy