Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.biopax.paxtools.normalizer.Resolver Maven / Gradle / Ivy
Go to download
A BioPAX Normalizer and modified off-line MiriamLink (MIRIAM, EBI, OLS)
package org.biopax.paxtools.normalizer;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.*;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.regex.Pattern;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Singleton bio identifier resolver utility.
*/
public class Resolver {
private static final Logger LOG = LoggerFactory.getLogger(Resolver.class);
private static Map namespaces;
private static Map spellmap; // compressed name -> prefix (combines custom spellmap.json and registry)
private static Map synonymap;// synonym -> prefix (build from custom synonymap.json and registry)
public static final String BIOREGISTRY_IO = "http://bioregistry.io/";
public static final String BIOREGISTRY_JSON_URL =
"https://raw.githubusercontent.com/biopragmatics/bioregistry/main/exports/registry/registry.json";
protected Resolver() {
}
static {
InputStream is = null;
try {
ObjectMapper om = new ObjectMapper();
om.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES); //to quietly ignore the data we don't need
// om.setVisibility(VisibilityChecker.Std.defaultInstance().withFieldVisibility(JsonAutoDetect.Visibility.ANY));
// load a map of "compressed" (non letter/digit chars were removed) spelling variants to combine with namespaces map
is = Resolver.class.getResourceAsStream("spellmap.json");
spellmap = new ConcurrentSkipListMap<>(om.readValue(is, new TypeReference>(){}));
// load a custom map of synonyms
is = Resolver.class.getResourceAsStream("synonymap.json");
synonymap = new ConcurrentSkipListMap<>(om.readValue(is, new TypeReference>(){}));
//load registry.json from URL if the system prop. is true
boolean useLatestBioregistry = Boolean.getBoolean("paxtools.normalizer.use-latest-registry");
if(useLatestBioregistry) {
LOG.info("Getting registry.json from bioregistry.io (property: paxtools.normalizer.use-latest-registry=true)");
try {
is = new URL(BIOREGISTRY_JSON_URL).openStream();
namespaces = new ConcurrentSkipListMap<>(om.readValue(is, new TypeReference>() {
}));
} catch (Exception e) {
LOG.warn("Failed loading registry.json from URL (fallback to the built-in, v2023-08-26)", e);
}
}
if(namespaces == null || namespaces.isEmpty()) {
is = Resolver.class.getResourceAsStream("registry.json");
namespaces = new ConcurrentSkipListMap<>(om.readValue(is, new TypeReference>() {
}));
}
LOG.info("read registry.json to the namespaces map; size: {}", namespaces.size());
//Post-process registry entries to add synonyms and spellings
namespaces.forEach((prefix, ns) -> {
ns.setPrefix(prefix);
//todo: skip for deprecated?
String uname = ns.getName().toUpperCase();
String upref = prefix.toUpperCase();
//add name, prefix, synonyms to the synonymap (uppercase)
//also - to spellmap (removing non-alphanumeric chars)
synonymap.put(uname, prefix);
synonymap.put(upref, prefix);
spellmap.put(uname.replaceAll("[^A-Z0-9]",""), prefix);
spellmap.put(upref.replaceAll("[^A-Z0-9]",""), prefix);
//add name, prefix, synonyms to synonymap
ns.getSynonyms().forEach(syn -> {
String s = syn.toUpperCase();
synonymap.putIfAbsent(s, prefix);
spellmap.putIfAbsent(s.replaceAll("[^A-Z0-9]",""), prefix);
});
});
LOG.info("initialized");
} catch (Exception e) {
throw new RuntimeException(e);
} finally {
if(is != null) {
try {
is.close();
} catch (IOException ignored) {}
}
}
}
/**
* Checks if the identifier given follows the regular expression
* of its data type (also provided).
*
* @param identifier internal identifier used by the data type
* @param datatype name, synonym or URI of a data type
* @return "true" when datatype is recognized, and identifier either matches the pattern or there's no pattern
*/
public static boolean checkRegExp(String identifier, String datatype) {
Namespace dt = getNamespace(datatype);
if(dt == null) {
LOG.debug("Cannot check id: {} for unknown collection: {}; return: false", identifier, datatype);
return false;
}
//remove "banana" and "peel" prefix from the identifier if any defined and present
String banana = dt.getBanana();
if(StringUtils.isNotBlank(banana)) {
identifier = identifier.replaceFirst(banana+dt.getBanana_peel(), "");
}
//return true when there is no regex pattern defined or the identifier matches
return (StringUtils.isNotBlank(dt.getPattern())) ? Pattern.compile(dt.getPattern()).matcher(identifier).find() : true;
}
/**
* Gets Namespace by its Name, Prefix, URI (URN/URL), known synonym or misspelling.
*
* @param key a datatype name, prefix or URN/URL (case insensitive)
* @return Namespace bean or null
*/
public static Namespace getNamespace(String key) {
return getNamespace(key, true);
}
/**
* Gets Namespace by its Name, Prefix, URN/URL, or (optionally) spelling variant or synonym
*
* @param key a name (case insensitive)
* @param allowVariants to allow or not searching with some known synonyms/misspellings
* @return Namespace bean or null
*/
public static Namespace getNamespace(String key, boolean allowVariants) {
if(StringUtils.isBlank(key)) {
return null;
}
//uppercase!
key = key.toUpperCase();
//quick-fix some obsolete prefixes
key = StringUtils.removeEnd(key, "/");
key = key.replaceFirst("OBO\\.", "");
key = key.replaceFirst("PSI-", "");
key = key.replaceFirst("URN:MIRIAM:", "");
key = key.replaceFirst(".*IDENTIFIERS\\.ORG/(.*/)?", ""); //e.g. match both ".ORG/CHEBI/CHEBI:1234" and ".ORG/CHEBI:1234" variants
key = key.replaceFirst(".*BIOREGISTRY\\.IO/", "");
String prefix = key.toLowerCase();
Namespace ns = namespaces.get(prefix);
//also try synonyms
if(ns == null) {
prefix = synonymap.get(key); //uppercase search
if(prefix != null) {
ns = namespaces.get(prefix.toLowerCase());
}
}
if(ns == null && allowVariants) {
//keep only alphanumeric chars in the key (it's uppercase) and search the spellmap
prefix = spellmap.get(key.replaceAll("[^A-Z0-9]",""));
if(prefix != null) {
ns = namespaces.get(prefix.toLowerCase());
}
}
return ns;
}
/**
* Builds a URI of the bioentity (e.g., "http://bioregistry.io/go:0045202")
* from the collection name/synonym and bio id.
*
* @param name - name, URI, or ID of a data collection (examples: "ChEBI", "go")
* @param id identifier of an entity within the data type (examples: "GO:0045202" or "0045202", "P62158")
* @return URI (without the protocol prefix)
*/
public static String getURI(String name, String id) {
String cid = getCURIE(name, id);
return cid != null ? BIOREGISTRY_IO + cid : null;
}
/**
* Builds a normalized CURIE (Compact URI/ID) of the biochemical entity
* from the collection name/synonym and bio id.
* @param name bio namespace/db/collection name
* @param id bio identifier
* @return CURIE
*/
public static String getCURIE(String name, String id) {
Namespace ns = getNamespace(name, true);
if (ns != null && StringUtils.isNotBlank(ns.getPattern())) {
if (checkRegExp(id, name)) {
String prefix = ns.getPrefix();
//remove "banana" and "peel" prefix from the identifier if any defined and present
String banana = ns.getBanana();
if(StringUtils.isNotBlank(banana)) {
id = id.replaceFirst(banana + ns.getBanana_peel(), "");
}
return prefix + ":" + id;
}
}
return null;
}
/**
* Gets the unmodifiable map of the bio identifier types registry records.
* @return prefix to Namespace object map
*/
public static Map getNamespaces() {
return Collections.unmodifiableMap(namespaces);
}
/**
* Gets the unmodifiable map - mapping a sanitized id-type name/variant/synonym
* (non-alphanumeric chars are removed) to the corresponding namespace prefix.
* All the key/values are upper case.
* @return "sanitized" variant to prefix map
*/
public static Map getSpellmap() {
return Collections.unmodifiableMap(spellmap);
}
/**
* Customize the mapping from a bio identifier type name/variant/synonym
* (non-alphanumeric chars should be removed) to the corresponding record/namespace prefix.
* All the key/values should be stored in upper case.
*/
public static void setSpellmap(Map map) {
spellmap = map;
}
/**
* Gets the unmodifiable map - mapping an identifier type synonym to the corresponding namespace prefix.
* All the key/values are upper case.
* @return synonym to prefix map
*/
public static Map getSynonymap() {
return Collections.unmodifiableMap(synonymap);
}
/**
* Customize the mapping from a bio identifier type synonym to the corresponding namespace prefix.
* All the key/values should be stored in upper case.
*/
public static void setSynonymap(Map map) {
synonymap = map;
}
/**
*
* @param name bio identifiers type/collection name
* @return true when a namespace can be found by this name or after all non-alphanumeric chars get removed
*/
public static boolean isKnownNameOrVariant(String name) {
return getNamespace(name,true) != null;
}
}