
// org.biopax.paxtools.pattern.util.ChemicalNameNormalizer (Maven / Gradle / Ivy)
package org.biopax.paxtools.pattern.util;
import org.biopax.paxtools.controller.PathAccessor;
import org.biopax.paxtools.io.SimpleIOHandler;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.SimplePhysicalEntity;
import org.biopax.paxtools.model.level3.SmallMoleculeReference;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
/**
* This class is used for finding a standard name for a small molecule. During detection of
* ubiquitous small molecules, we map the duplicated small molecules to one standard name, otherwise
* their degree would be divided and this would spoil the detection method.
*
* @author Ozgun Babur
*/
public class ChemicalNameNormalizer
{
/**
* Mapping from the a small molecule to the one that contains the standard name.
*/
Map map;
public static void main(String[] args) throws FileNotFoundException
{
SimpleIOHandler reader = new SimpleIOHandler();
Model model = reader.convertFromOWL(new FileInputStream(
"/home/ozgun/Projects/biopax-pattern/All-Data.owl"));
new ChemicalNameNormalizer(model);
}
/**
* Gets the standard name of the small molecule.
* @param smr the molecule to check standard name
* @return standard name
*/
public String getName(SmallMoleculeReference smr)
{
if (map.containsKey(smr)) return map.get(smr).getDisplayName();
else return smr.getDisplayName();
}
/**
* Constructor that also infers all the mapping.
* @param model the big picture
*/
public ChemicalNameNormalizer(Model model)
{
map = new HashMap();
Set standard = new HashSet();
Set other = new HashSet();
for (SmallMoleculeReference smr : model.getObjects(SmallMoleculeReference.class))
{
if (smr.getRDFId().startsWith("http://identifiers")) standard.add(smr);
else other.add(smr);
}
System.out.println("Standard smr = " + standard.size());
System.out.println("Other smr = " + other.size());
Map> smrNames = collectNames(false, standard, other);
Map> smNames = collectNames(true, standard, other);
// Unify names of standards
Map> standardSelfMatch =
getSelfMatching(standard, smrNames, smNames, true);
for (SmallMoleculeReference smr : standardSelfMatch.keySet())
{
Set matches = standardSelfMatch.get(smr);
if (matches.size() == 1)
{
SmallMoleculeReference m = matches.iterator().next();
if (smr.getDisplayName().length() <= m.getDisplayName().length())
{
map.put(smr, m);
standard.remove(smr);
}
}
else
{
System.out.print(smr.getDisplayName() + " matched more than one");
for (SmallMoleculeReference match : matches)
{
System.out.print("\t" + match.getDisplayName());
}
System.out.println();
}
}
Map> selfMatch =
getSelfMatching(other, smrNames, smNames, false);
enrichNamesWithMatchings(selfMatch, smrNames);
enrichNamesWithMatchings(selfMatch, smNames);
Set missed = new HashSet();
Map> multiMap =
new HashMap>();
for (SmallMoleculeReference smr : other)
{
Set matching = getMatching(smr, standard, smrNames, smNames);
if (matching.size() == 1)
{
map.put(smr, matching.iterator().next());
}
else if (matching.size() > 1)
{
multiMap.put(smr, matching);
}
else
{
missed.add(smr);
}
}
for (SmallMoleculeReference smr : multiMap.keySet())
{
if (isGeneric(smr)) continue;
Set matches = multiMap.get(smr);
SmallMoleculeReference rep = selectRepresentative(matches, map);
map.put(smr, rep);
for (SmallMoleculeReference match : matches)
{
if (match == rep) continue;
if (map.containsKey(match))
{
if (map.get(match) == rep) continue;
System.out.println("Already matched " + match.getDisplayName() + " to " +
map.get(match).getDisplayName() + ". This one is " + rep.getDisplayName());
}
else if (map.values().contains(match))
{
System.out.println(match.getDisplayName() + " was mapped from another chem");
}
else map.put(match, rep);
}
}
Iterator iter = multiMap.keySet().iterator();
while (iter.hasNext())
{
SmallMoleculeReference smr = iter.next();
if (map.containsKey(smr)) iter.remove();
}
System.out.println("matchCnt = " + map.size());
System.out.println("multiCnt = " + multiMap.size());
System.out.println("missCnt = " + missed.size());
System.out.println();
// printTopPart("Multi match", multiMap, 50);
// printTopPart("Miss-match", missed, 50);
}
private Map> collectNames(boolean peLevel,
Set... sets)
{
Map> map =
new HashMap>();
for (Set set : sets)
{
for (SmallMoleculeReference smr : set)
{
map.put(smr, new HashSet());
if (!peLevel)
{
for (String name : smr.getName())
{
map.get(smr).add(name.toLowerCase());
}
}
else
{
for (SimplePhysicalEntity sm : smr.getEntityReferenceOf())
{
for (String name : sm.getName())
{
map.get(smr).add(name.toLowerCase());
}
}
}
}
}
return map;
}
private Set getNameNormalizedMatching(SmallMoleculeReference smr,
Set smrs)
{
String name = null;
String dispName = smr.getDisplayName().toLowerCase();
if (dispName.endsWith("-)") || dispName.endsWith("+)"))
{
name = dispName.substring(0, dispName.lastIndexOf("(")).trim();
}
else if (dispName.endsWith(" zwitterion"))
{
name = dispName.substring(0, dispName.lastIndexOf(" ")).trim();
}
if (name == null) return Collections.emptySet();
Set matching = new HashSet();
for (SmallMoleculeReference ref : smrs)
{
if (ref.getDisplayName().toLowerCase().equals(name)) matching.add(ref);
}
return matching;
}
private Set getMatching(SmallMoleculeReference smr,
Set standard, Map> smrNames,
Map> smNames)
{
Set matching = new HashSet();
for (SmallMoleculeReference std : standard)
{
if (std.getDisplayName() != null && smr.getDisplayName() != null &&
std.getDisplayName().toLowerCase().equals(smr.getDisplayName().toLowerCase()))
matching.add(std);
}
if (!matching.isEmpty()) return matching;
for (SmallMoleculeReference std : standard)
{
for (String name : smrNames.get(smr))
{
if(smrNames.get(std).contains(name)) matching.add(std);
}
}
if (!matching.isEmpty()) return matching;
for (SmallMoleculeReference std : standard)
{
for (String name : smrNames.get(smr))
{
if(smNames.get(std).contains(name)) matching.add(std);
}
}
if (!matching.isEmpty()) return matching;
for (SmallMoleculeReference std : standard)
{
for (String name : smNames.get(smr))
{
if(smrNames.get(std).contains(name)) matching.add(std);
}
}
if (!matching.isEmpty()) return matching;
for (SmallMoleculeReference std : standard)
{
for (String name : smNames.get(smr))
{
if(smNames.get(std).contains(name)) matching.add(std);
}
}
return matching;
}
private Map> getSelfMatching(
Set smrs, Map> smrNames,
Map> smNames, boolean normalizeName)
{
Map> map =
new HashMap>();
for (SmallMoleculeReference smr : smrs)
{
Set matching = normalizeName ?
getNameNormalizedMatching(smr, smrs) :
getMatching(smr, smrs, smrNames, smNames);
assert normalizeName || !matching.isEmpty(); // it should at least detect itself
matching.remove(smr);
if (!matching.isEmpty()) map.put(smr, matching);
}
return map;
}
private static final PathAccessor INTER_ACC =
new PathAccessor("SmallMoleculeReference/entityReferenceOf/participantOf");
private Map getInteractionCounts(
Set... smrSets)
{
Map cnt = new HashMap();
for (Set smrSet : smrSets)
{
for (SmallMoleculeReference smr : smrSet)
{
if (cnt.containsKey(smr)) continue;
cnt.put(smr, INTER_ACC.getValueFromBean(smr).size());
}
}
return cnt;
}
private List getSortedList(Collection smrs,
final Map cnt)
{
List list = new ArrayList(smrs);
Collections.sort(list, new Comparator()
{
@Override
public int compare(SmallMoleculeReference o1, SmallMoleculeReference o2)
{
return cnt.get(o2).compareTo(cnt.get(o1));
}
});
return list;
}
private void printTopPart(String listName, Set smrs, int upTo)
{
Map cnt = getInteractionCounts(smrs);
List list = getSortedList(smrs, cnt);
int i = 0;
System.out.println(listName + "\n--------------");
for (SmallMoleculeReference smr : list)
{
System.out.println(cnt.get(smr) + "\t" + smr.getDisplayName());
if (++i == upTo) break;
}
System.out.println();
}
private void printTopPart(String listName,
Map> smrMap, int upTo)
{
Map cnt = getInteractionCounts(smrMap.keySet());
List list = getSortedList(smrMap.keySet(), cnt);
int i = 0;
System.out.println(listName + "\n--------------");
for (SmallMoleculeReference smr : list)
{
System.out.print(cnt.get(smr) + "\t" + smr.getDisplayName() + "\t");
for (SmallMoleculeReference match : smrMap.get(smr))
{
System.out.print("\t" + match.getDisplayName());
}
System.out.println();
if (++i == upTo) break;
}
System.out.println();
}
private void enrichNamesWithMatchings(
Map> matchMap,
Map> names)
{
for (SmallMoleculeReference smr : matchMap.keySet())
{
for (SmallMoleculeReference match : matchMap.get(smr))
{
names.get(smr).addAll(names.get(match));
}
}
}
private boolean isGeneric(SmallMoleculeReference smr)
{
if (!smr.getMemberEntityReference().isEmpty()) return true;
for (SimplePhysicalEntity sm : smr.getEntityReferenceOf())
{
if (!sm.getMemberPhysicalEntity().isEmpty()) return true;
}
return false;
}
private SmallMoleculeReference selectRepresentative(Set smrs,
final Map map)
{
List list = new ArrayList(smrs);
final Map cnt = getInteractionCounts(smrs);
Collections.sort(list, new Comparator()
{
@Override
public int compare(SmallMoleculeReference o1, SmallMoleculeReference o2)
{
if (map.containsValue(o1))
{
if (!map.containsValue(o2)) return -1;
}
else
{
if (map.containsValue(o2)) return 1;
}
if (!cnt.get(o1).equals(cnt.get(o2))) return cnt.get(o2).compareTo(cnt.get(o1));
if (o1.getDisplayName().endsWith(")"))
{
if (!o2.getDisplayName().endsWith(")")) return -1;
}
else if (o2.getDisplayName().endsWith(")")) return 1;
return o1.getDisplayName().compareTo(o2.getDisplayName());
}
});
return list.get(0);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy