
slib.graph.io.loader.bio.gaf2.GraphLoader_GAF_2 Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of slib-graph-io Show documentation
Show all versions of slib-graph-io Show documentation
Graph I/O library which can be used to load and store graphs
The newest version!
/*
* Copyright or © or Copr. Ecole des Mines d'Alès (2012-2014)
*
* This software is a computer program whose purpose is to provide
* several functionalities for the processing of semantic data
* sources such as ontologies or text corpora.
*
* This software is governed by the CeCILL license under French law and
* abiding by the rules of distribution of free software. You can use,
* modify and/ or redistribute the software under the terms of the CeCILL
* license as circulated by CEA, CNRS and INRIA at the following URL
* "http://www.cecill.info".
*
* As a counterpart to the access to the source code and rights to copy,
* modify and redistribute granted by the license, users are provided only
* with a limited warranty and the software's author, the holder of the
* economic rights, and the successive licensors have only limited
* liability.
* In this respect, the user's attention is drawn to the risks associated
* with loading, using, modifying and/or developing or reproducing the
* software by the user in light of its specific status of free software,
* that may mean that it is complicated to manipulate, and that also
* therefore means that it is reserved for developers and experienced
* professionals having in-depth computer knowledge. Users are therefore
* encouraged to load and test the software's suitability as regards their
* requirements in conditions enabling the security of their systems and/or
* data to be ensured and, more generally, to use and operate it in the
* same conditions as regards security.
*
* The fact that you are presently reading this means that you have had
* knowledge of the CeCILL license and that you accept its terms.
*/
package slib.graph.io.loader.bio.gaf2;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.openrdf.model.URI;
import org.openrdf.model.vocabulary.RDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import slib.graph.io.conf.GDataConf;
import slib.graph.io.loader.GraphLoader;
import slib.graph.io.loader.utils.filter.graph.Filter;
import slib.graph.io.loader.utils.filter.graph.gaf2.FilterGraph_GAF2;
import slib.graph.io.loader.utils.filter.graph.gaf2.FilterGraph_GAF2_cst;
import slib.graph.io.loader.utils.filter.graph.repo.FilterRepository;
import slib.graph.model.graph.G;
import slib.graph.model.impl.repo.URIFactoryMemory;
import slib.graph.model.repo.URIFactory;
import slib.utils.ex.SLIB_Ex_Critic;
import slib.utils.impl.BigFileReader;
/**
* GAF 2 parser used to construct a graph from a GAF 2.0 annotation file
*
* The parser follows the GAF 2.0 specification provided at
* http://wiki.geneontology.org/index.php/GO_Consortium_web_GAF2.0_documentation
* http://www.geneontology.org/GO.format.gaf-2_0.shtml
*
* TODO - Manage multiple organism specification : taxon:1|taxon:1000
*
* @author Sébastien Harispe
*/
public class GraphLoader_GAF_2 implements GraphLoader {

    // Zero-based indexes of the tab-separated columns of a GAF 2 annotation line.
    // They are used below to address the array obtained by splitting a line on tabs.
    public final static int DB = 0;
    public final static int DB_OBJECT_ID = 1;
    public final static int DB_OBJECT_SYMBOL = 2;
    public final static int QUALIFIER = 3;
    public final static int GOID = 4;
    public final static int REFERENCE = 5;
    public final static int EVIDENCE_CODE = 6;
    public final static int WITH = 7;
    public final static int ASPECT = 8;
    public final static int DB_OBJECT_NAME = 9;
    public final static int DB_OBJECT_SYNONYM = 10;
    public final static int DB_OBJECT_TYPE = 11;
    public final static int TAXON = 12;
    public final static int DATE = 13;
    public final static int ASSIGNED_BY = 14;
    public final static int ANNOTATION_XP = 15;
    public final static int GENE_PRODUCT_ISOFORM = 16;

    /** Column separator of an annotation line. */
    private static final Pattern TAB = Pattern.compile("\t");
    /** Extracts the numeric identifier (group 1) of each "taxon:N" reference of the taxon column. */
    private static final Pattern TAXON_ID = Pattern.compile(".?taxon:(\\d+).?");

    /** Graph populated by the loader, set by {@link #populate(GDataConf, G)}. */
    private G graph;
    Logger logger = LoggerFactory.getLogger(this.getClass());
    /** Factory used to resolve namespace prefixes (e.g. the prefix of a "PREFIX:id" term). */
    URIFactoryMemory factory = URIFactoryMemory.getSingleton();
    /** Prefix prepended to the DB object id in order to build the URI of an annotated entity. */
    String prefixUriInstance;
    /** Prefix prepended to term identifiers which are not of the form "PREFIX:id". */
    String defaultURIprefix;
    Pattern colon = Pattern.compile(":");

    /**
     * Method used to load an annotation repository into the given graph
     * considering a specific configuration. A {@link FilterGraph_GAF2} can be
     * associated to the configuration in order to define restrictions to
     * consider during the parsing (taxons, excluded Evidence Codes).
     *
     * Only annotations referring to a term which is already a vertex of the
     * given graph are loaded.
     *
     * @param conf object defining a configuration. If the configuration
     * defines a {@link Filter} of type {@link FilterGraph_GAF2}, it will be
     * evaluated during the parsing.
     * @param graph the graph to populate, must not be null.
     * @throws SLIB_Ex_Critic if the graph is null, if the configuration is
     * invalid or if the file cannot be processed.
     */
    @Override
    public void populate(GDataConf conf, G graph) throws SLIB_Ex_Critic {

        logger.info("-------------------------------------");
        logger.info("Loading data using GAF2 loader.");
        logger.info("-------------------------------------");

        if (graph == null) {
            throw new SLIB_Ex_Critic("Cannot process Null Graph");
        }

        logger.info("GAF 2 loader populates graph {}", graph.getURI());
        this.graph = graph;

        process(conf);

        logger.info("-------------------------------------");
    }

    /**
     * Parses the file referenced by the configuration and adds to the graph an
     * rdf:type edge from each retained annotated entity to its term.
     *
     * An annotation is skipped if (in this order) its evidence code is
     * rejected by the filter, it carries a qualifier, its term is not a vertex
     * of the graph, or none of its taxons is accepted by the filter.
     *
     * @param conf the loader configuration (parameters "prefix" and "filters",
     * and the file location)
     * @throws SLIB_Ex_Critic if a filter cannot be resolved, if the file has no
     * valid GAF 2 header, if an entry is malformed or on any reading error.
     */
    private void process(GDataConf conf) throws SLIB_Ex_Critic {

        prefixUriInstance = (String) conf.getParameter("prefix");
        if (prefixUriInstance == null) {
            prefixUriInstance = graph.getURI().getNamespace();
        }
        logger.info("Instance URIs will be prefixed by: {}", prefixUriInstance);

        defaultURIprefix = prefixUriInstance;
        logger.info("Default URI prefix is set to: {}", prefixUriInstance);

        // At most one GAF 2 filter is admitted, other kinds of filters are ignored
        Set<String> taxons = null;
        Set<String> excludedEC = null;
        FilterGraph_GAF2 filter = null;

        for (Filter f : loadFilters(conf)) {
            if (f instanceof FilterGraph_GAF2) {
                if (filter != null) {
                    throw new SLIB_Ex_Critic("Two filters " + FilterGraph_GAF2_cst.TYPE + " have been specified. Only one admitted");
                }
                filter = (FilterGraph_GAF2) f;
                logger.info("Filtering according to filter {}\ttype{}", filter.getId(), filter.getType());
                taxons = filter.getTaxons();
                excludedEC = filter.getExcludedEC();
            }
        }

        String fileLocation = conf.getLoc();
        logger.info("file location : {}", fileLocation);

        int countEntities = 0;
        int countAnnotsLoaded = 0;
        int existsQualifier = 0;   // a qualifier exists for the annotation
        int not_found = 0;         // the term of the annotation is not a vertex of the graph
        int eC_restriction = 0;    // excluded due to evidence code restriction
        int taxonsRestriction = 0; // excluded due to taxon restriction

        logger.info("Loading...");

        URIFactory uriManager = URIFactoryMemory.getSingleton();
        boolean validHeader = false;
        int c = 0; // number of lines read so far

        try {
            BigFileReader file = new BigFileReader(fileLocation);
            try {
                while (file.hasNext()) {

                    String line = file.nextTrimmed();

                    if (line.startsWith("!")) {
                        // Comment line, only the version flag is of interest: "!gaf-version: 2.0"
                        String[] header = colon.split(line);
                        if (header.length == 2) {
                            String flag = header[0].trim().substring(1);
                            String version = header[1].trim();
                            if (flag.equals("gaf-version") && (version.equals("2") || version.equals("2.0"))) {
                                validHeader = true;
                            }
                        }
                    } else if (validHeader && !line.isEmpty()) { // blank lines carry no annotation

                        String[] data = TAB.split(line);

                        if (data.length <= TAXON) {
                            throw new SLIB_Ex_Critic("Malformed GAF 2 entry #" + (c + 1) + " in " + fileLocation
                                    + ": expecting at least " + (TAXON + 1) + " tab-separated columns, found " + data.length);
                        }

                        // URIs are built before any restriction is evaluated, a term
                        // associated to an unknown namespace prefix is therefore always reported
                        URI entityID = uriManager.getURI(prefixUriInstance + data[DB_OBJECT_ID]);
                        String gotermURIstring = buildURI(data[GOID]);
                        String qualifier = data[QUALIFIER];
                        String evidenceCode = data[EVIDENCE_CODE];
                        String taxon_ids = data[TAXON];

                        if (excludedEC != null && !EvidenceCodeRules.areValid(excludedEC, evidenceCode)) {
                            eC_restriction++;
                        } else if (!qualifier.isEmpty()) {
                            // We do not consider go term associated with a qualifier
                            // e.g. NOT, contributes_to ...
                            // TODO take into consideration this information !
                            existsQualifier++;
                        } else {
                            URI uriGOterm = uriManager.getURI(gotermURIstring);

                            if (!graph.containsVertex(uriGOterm)) {
                                not_found++;
                                logger.debug("Cannot found GO term {}", uriGOterm);
                            } else if (taxons != null && !matchesTaxon(taxons, taxon_ids)) {
                                taxonsRestriction++;
                            } else {
                                if (!graph.containsVertex(entityID)) {
                                    graph.addV(entityID);
                                    countEntities++;
                                }
                                graph.addE(entityID, RDF.TYPE, uriGOterm);
                                countAnnotsLoaded++;
                            }
                        }
                    }

                    c++;
                    if (c % 1000000 == 0) {
                        logger.info("{} GAF entries processed", c);
                    }
                }
            } finally {
                // The reader is released even if the parsing fails
                file.close();
            }
        } catch (SLIB_Ex_Critic e) {
            // Already of the expected type, no need to wrap it
            throw e;
        } catch (Exception e) {
            throw new SLIB_Ex_Critic(e);
        }

        if (!validHeader) {
            throw new SLIB_Ex_Critic("Invalid header for GAF-2 file " + fileLocation + "\nExpecting \"!gaf-version: 2.0\" as first line");
        }

        logger.info("\tExcluded - Taxons restriction : {}", taxonsRestriction);
        logger.info("\tExcluded - Evidence Code restriction : {}", eC_restriction);
        logger.info("\tExcluded - Contains qualifier : {}", existsQualifier);
        logger.info("\tNot found unexisting term in the graph : {}", not_found);
        logger.info("Number of Instance loaded : {}", countEntities);
        logger.info("Number of Annotation loaded : {}", countAnnotsLoaded);
        logger.info("GAF2 Loader done.");
    }

    /**
     * Resolves the filters listed (comma separated ids) in the "filters"
     * parameter of the configuration.
     *
     * @param conf the loader configuration
     * @return the filters found in the {@link FilterRepository}, empty if the
     * parameter is not set
     * @throws SLIB_Ex_Critic if an id does not refer to a known filter
     */
    private Set<Filter> loadFilters(GDataConf conf) throws SLIB_Ex_Critic {

        Set<Filter> filters = new HashSet<Filter>();
        String filtersAsStrings = (String) conf.getParameter("filters");

        if (filtersAsStrings != null) {
            FilterRepository filtersRepo = FilterRepository.getInstance();
            for (String fname : filtersAsStrings.split(",")) {
                Filter f = filtersRepo.getFilter(fname);
                if (f == null) {
                    throw new SLIB_Ex_Critic("Cannot locate filter associated to id " + fname);
                }
                filters.add(f);
            }
        }
        return filters;
    }

    /**
     * Checks if at least one of the taxons referenced by the given taxon
     * column (e.g. "taxon:1|taxon:1000") is accepted.
     *
     * @param taxons the accepted taxon identifiers (numeric part only)
     * @param taxonIds the content of the taxon column
     * @return true if one of the referenced taxons is contained in the set
     */
    private static boolean matchesTaxon(Set<String> taxons, String taxonIds) {
        Matcher m = TAXON_ID.matcher(taxonIds);
        while (m.find()) {
            if (taxons.contains(m.group(1))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the URI (as a string) associated to a term identifier. An
     * identifier of the form "PREFIX:id" is expanded using the namespace
     * registered for the prefix, any other value is appended to the default
     * URI prefix.
     *
     * @param value the term identifier
     * @return the string representation of the URI
     * @throws SLIB_Ex_Critic if no namespace is registered for the prefix
     */
    private String buildURI(String value) throws SLIB_Ex_Critic {

        String[] info = getDataColonSplit(value);

        if (info == null || info.length != 2) {
            return defaultURIprefix + value;
        }

        String ns = factory.getNamespace(info[0]);
        if (ns == null) {
            throw new SLIB_Ex_Critic("No namespace associated to prefix " + info[0] + ". Cannot load " + value + ", please load required namespace prefix");
        }
        return ns + info[1];
    }

    /**
     * Splits the given value on colons and trims the first two elements.
     *
     * @param line the value to split
     * @return the elements, null if the given value is empty
     */
    private String[] getDataColonSplit(String line) {

        if (line.isEmpty()) {
            return null;
        }

        String[] data = colon.split(line);

        // The split returns an empty array if the value only contains colons
        if (data.length > 0) {
            data[0] = data[0].trim();
        }
        if (data.length > 1) {
            data[1] = data[1].trim();
        }
        return data;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy