// org.xmlcml.cml.tools.AtomTreeMatcher Maven / Gradle / Ivy
/**
* Copyright 2011 Peter Murray-Rust et. al.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.xmlcml.cml.tools;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.log4j.Logger;
import org.xmlcml.cml.base.CMLConstants;
import org.xmlcml.cml.base.CMLElements;
import org.xmlcml.cml.element.CMLAtom;
import org.xmlcml.cml.element.CMLAtomSet;
import org.xmlcml.cml.element.CMLLink;
import org.xmlcml.cml.element.CMLMap;
import org.xmlcml.cml.element.CMLMap.Direction;
import org.xmlcml.euclid.IntMatrix;
import org.xmlcml.molutil.ChemicalElement;
public class AtomTreeMatcher extends AtomMatcher {
/** Class logger. */
private static final Logger LOG = Logger.getLogger(AtomTreeMatcher.class);

// Fragments used to build (and later recognise) link titles.
// They are left non-final because they are public and external code could,
// in principle, assign to them.
/** Title prefix for a link whose from- and to-sets have the same size. */
public static String BALANCED = "balanced";
/** Title prefix for a link whose from- and to-sets differ in size. */
public static String UNBALANCED = "unbalanced";
/** Title prefix for a link grouping atoms that no other link references. */
public static String ORPHAN = "orphan";
/** Title fragment for links created by matching ligand sets. */
public static String COMMON_LIGANDS = "commonLigands";
/** Title fragment for links created from the atom-tree similarity matrix. */
public static String COMMON_ATOMTREE = "commonAtomTree";
/** Title prefix for links created from identical atom-tree strings. */
public static String UNIQUE_TREE = "unique treeString";

/** Iteration count used by the refinement loops in this class. */
private static final int SAFETY = 3;

/** Atom-tree data for the first ("from") atom set of the current match. */
private AtomTreeData atomTreeData0;
/** Atom-tree data for the second ("to") atom set of the current match. */
private AtomTreeData atomTreeData1;
/** The map under construction; null until a match produces one. */
private CMLMap cmlMap = null;
/** For each "from" atom, the "to" atoms it is linked to; rebuilt by makeAtomAtomMaps(). */
private Map<CMLAtom, CMLAtomSet> from2ToAtomAtomMap;
/** For each "to" atom, the "from" atoms it is linked to; rebuilt by makeAtomAtomMaps(). */
private Map<CMLAtom, CMLAtomSet> to2FromAtomAtomMap;
/**
 * Builds a map between the atoms of two atom sets using atom-tree strings.
 *
 * @param atomSet0 the "from" atoms
 * @param atomSet1 the "to" atoms
 * @param title passed on to the mapping step (which currently does not use it)
 * @return the resulting map, or {@code null} when neither atom set yields
 *         any atom-tree strings
 */
public CMLMap match(CMLAtomSet atomSet0, CMLAtomSet atomSet1, String title) {
    // Forget the result of any earlier call: without this, a call in which
    // both inputs are empty would return the map built by the previous call.
    cmlMap = null;
    atomTreeData0 = new AtomTreeData(atomSet0);
    atomTreeData1 = new AtomTreeData(atomSet1);
    // Only the sizes are inspected here, so wildcard types are sufficient.
    Map<?, ?> atomSetByAtomTreeString0 = atomTreeData0.createAtomSetByAtomTreeString(atomMatchObject);
    Map<?, ?> atomSetByAtomTreeString1 = atomTreeData1.createAtomSetByAtomTreeString(atomMatchObject);
    if (atomSetByAtomTreeString0.size() != 0 || atomSetByAtomTreeString1.size() != 0) {
        mapSingleMolecules(title);
    }
    return cmlMap;
}
/**
 * Runs the full mapping pipeline and stores the result in {@code cmlMap}.
 * The steps run in a fixed order; later steps read the links added by
 * earlier ones.
 *
 * @param title not used by this method
 */
private void mapSingleMolecules(String title) {
// start from a fresh map (makeMap() is not defined in this class; presumably inherited)
cmlMap = makeMap();
// the Set returned by mapUniqueAtomsByTreeString is deliberately ignored
mapUniqueAtomsByTreeString(UNIQUE_TREE);
mapByUniqueLargestCommonAtomTreeStrings();
addUniqueLigandsToUniqueAtoms();
resolveAmbiguousLinks();
// atoms still unreferenced by any link are grouped into "orphan" links ...
addMissingIds();
// ... which this final step then tries to break down into specific links
tidyOrphansAndMismatches();
}
/**
 * Repeatedly tries to turn orphan links and links with unequal from/to
 * sets into specific atom-to-atom links.
 *
 * Each pass re-reads the links from {@code cmlMap}, promotes 1:1 orphan
 * links, then attempts connectivity-based resolution of the remaining
 * orphan links and of the unequal links.
 */
private void tidyOrphansAndMismatches() {
    boolean change = true;
    int tries = SAFETY;
    // NOTE(review): the conditions are OR-ed, so SAFETY is a minimum number
    // of passes rather than a cap, and the loop runs until a pass (after the
    // first SAFETY ones) makes no change. The sibling loops in this class
    // use &&. Left as-is because capping could stop before convergence -
    // confirm the intent.
    while (tries-- > 0 || change) {
        CMLElements<CMLLink> links = cmlMap.getLinkElements();
        List<CMLLink> unequalLinkList = makeUnequalToFromList(links);
        List<CMLLink> orphanList = makeOrphanList(links);
        // Non-short-circuit |= : every step runs on every pass.
        change = deOrphanizeSingleToFrom(orphanList);
        change |= tryToResolveConnectivity(orphanList);
        change |= tryToResolveConnectivity(unequalLinkList);
    }
}
/**
 * Attempts connectivity-based resolution on every link in the list.
 *
 * The atom-to-atom lookup maps are rebuilt once at the start of the call;
 * links added while this call runs are not reflected in them until the
 * next call.
 *
 * @param list links to process
 * @return true if at least one atom pair was split off into its own link
 */
private boolean tryToResolveConnectivity(List<CMLLink> list) {
    makeAtomAtomMaps();
    boolean overallChange = false;
    for (CMLLink link : list) {
        // At most SAFETY pairs are split off from any one link per call.
        boolean change = true;
        for (int attempt = SAFETY; attempt > 0 && change; attempt--) {
            change = tryToResolveConnectivity(link);
            overallChange |= change;
        }
    }
    return overallChange;
}
/**
 * Rebuilds {@code from2ToAtomAtomMap} and {@code to2FromAtomAtomMap} from
 * the links currently in {@code cmlMap}.
 *
 * Links whose title starts with ORPHAN or contains UNBALANCED are skipped.
 * A link without a title is treated as an ordinary link (previously it
 * caused a NullPointerException).
 */
private void makeAtomAtomMaps() {
    from2ToAtomAtomMap = new HashMap<CMLAtom, CMLAtomSet>();
    to2FromAtomAtomMap = new HashMap<CMLAtom, CMLAtomSet>();
    CMLElements<CMLLink> links = cmlMap.getLinkElements();
    for (CMLLink link : links) {
        String linkTitle = link.getTitle();
        boolean unreliable = linkTitle != null
                && (linkTitle.startsWith(ORPHAN) || linkTitle.contains(UNBALANCED));
        if (unreliable) {
            continue;
        }
        LinkTool linkTool = LinkTool.getOrCreateTool(link);
        List<CMLAtom> fromAtoms = linkTool.getSet(Direction.FROM, atomTreeData0.atomSet).getAtoms();
        List<CMLAtom> toAtoms = linkTool.getSet(Direction.TO, atomTreeData1.atomSet).getAtoms();
        // Every from-atom of the link is associated with every to-atom, and vice versa.
        addToAtomsToAtomSetIndexedByFrom(from2ToAtomAtomMap, fromAtoms, toAtoms);
        addToAtomsToAtomSetIndexedByFrom(to2FromAtomAtomMap, toAtoms, fromAtoms);
    }
}
/**
 * Debug aid: prints the title, then one line per key atom giving its id
 * and the size of its mapped atom set (0 when the value is null).
 *
 * @param title heading printed first
 * @param atomSetX atom-to-atom-set map to dump
 */
private void debugAtomAtomMap(String title, Map<CMLAtom, CMLAtomSet> atomSetX) {
    System.out.println(title);
    for (Map.Entry<CMLAtom, CMLAtomSet> entry : atomSetX.entrySet()) {
        CMLAtomSet atomSet = entry.getValue();
        int count = (atomSet == null) ? 0 : atomSet.size();
        System.out.println(entry.getKey().getId()+" "+count);
    }
}
/**
 * For every atom in {@code fromAtoms}, adds all of {@code toAtoms} to the
 * atom set stored under that atom, creating the set on first use.
 *
 * @param from2ToAtomAtomMap map to update (not necessarily the field of the same name)
 * @param fromAtoms key atoms
 * @param toAtoms atoms to add to each key's set
 */
private void addToAtomsToAtomSetIndexedByFrom(
        Map<CMLAtom, CMLAtomSet> from2ToAtomAtomMap,
        List<CMLAtom> fromAtoms, List<CMLAtom> toAtoms) {
    for (CMLAtom from : fromAtoms) {
        CMLAtomSet toAtomSet = from2ToAtomAtomMap.get(from);
        if (toAtomSet == null) {
            toAtomSet = new CMLAtomSet();
            from2ToAtomAtomMap.put(from, toAtomSet);
        }
        for (CMLAtom toAtom : toAtoms) {
            toAtomSet.addAtom(toAtom);
        }
    }
}
/**
 * Looks for the first (from, to) atom pair in the link whose ligands match
 * (see doLigandsMatch) and, if found, moves that pair out of the link into
 * a new link titled "de-orphan".
 *
 * At most one pair is split off per call.
 *
 * @param orphanLink link to reduce; its from- and to-sets are rewritten on success
 * @return true if a pair was split off
 */
private boolean tryToResolveConnectivity(CMLLink orphanLink) {
    LinkTool orphanLinkTool = LinkTool.getOrCreateTool(orphanLink);
    CMLAtomSet fromAtomSet = orphanLinkTool.getSet(Direction.FROM, atomTreeData0.atomSet);
    List<CMLAtom> fromAtoms = fromAtomSet.getAtoms();
    CMLAtomSet toAtomSet = orphanLinkTool.getSet(Direction.TO, atomTreeData1.atomSet);
    List<CMLAtom> toAtoms = toAtomSet.getAtoms();
    LOG.trace("checkingFromTo");
    for (CMLAtom fromAtom : fromAtoms) {
        for (CMLAtom toAtom : toAtoms) {
            if (!doLigandsMatch(fromAtom, toAtom)) {
                continue;
            }
            LOG.trace("MATCH!!!!!!!!!!!!!!"+fromAtom.getId()+" .. "+toAtom.getId());
            // Shrink the original link by the matched pair ...
            fromAtomSet.removeAtom(fromAtom);
            toAtomSet.removeAtom(toAtom);
            orphanLink.setFromSet(fromAtomSet.getAtomIDs());
            orphanLink.setToSet(toAtomSet.getAtomIDs());
            // ... and record the pair as its own link.
            CMLLink link = new CMLLink();
            LinkTool linkTool = LinkTool.getOrCreateTool(link);
            linkTool.addSingleAtomsToSets(fromAtom, toAtom);
            cmlMap.addLink(link);
            link.setTitle("de-orphan");
            // Return immediately: the atom sets were just modified, so the
            // iteration over their atoms must not continue.
            return true;
        }
    }
    return false;
}
/**
 * Tests whether two atoms have compatible ligand environments according to
 * {@code from2ToAtomAtomMap}.
 *
 * The atoms match when they have the same non-zero number of ligands and
 * every ligand of {@code fromAtom} is mapped to at least one ligand of
 * {@code toAtom}. The check is not one-to-one: several from-ligands may be
 * satisfied by the same to-ligand.
 *
 * @param fromAtom atom from the first atom set
 * @param toAtom atom from the second atom set
 * @return true if the ligands match; false otherwise, including when
 *         neither atom has ligands
 */
private boolean doLigandsMatch(CMLAtom fromAtom, CMLAtom toAtom) {
    List<CMLAtom> fromLigands = fromAtom.getLigandAtoms();
    List<CMLAtom> toLigands = toAtom.getLigandAtoms();
    if (fromLigands.size() != toLigands.size() || fromLigands.isEmpty()) {
        return false;
    }
    for (CMLAtom fromLigand : fromLigands) {
        CMLAtomSet mappedTargets = from2ToAtomAtomMap.get(fromLigand);
        if (mappedTargets == null) {
            return false;
        }
        boolean found = false;
        for (CMLAtom toLigand : toLigands) {
            if (mappedTargets.contains(toLigand)) {
                found = true;
                break;
            }
        }
        if (!found) {
            return false;
        }
    }
    return true;
}
/**
 * Debug aid: prints the title and calls {@code debug()} on each link.
 * Prints nothing for an empty list.
 *
 * @param title heading printed before the links
 * @param linkList links to dump
 */
private void debugLinks(String title, List<CMLLink> linkList) {
    if (linkList.isEmpty()) {
        return;
    }
    System.out.println(title);
    for (CMLLink link : linkList) {
        link.debug();
    }
}
/**
 * Promotes orphan links that hold exactly one "to" and one "from" atom:
 * their title gets a "de-" prefix and they are removed from the list.
 *
 * @param orphanList orphan links; promoted links are removed from it
 * @return true if at least one link was promoted
 */
private boolean deOrphanizeSingleToFrom(List<CMLLink> orphanList) {
    List<CMLLink> deOrphanList = new ArrayList<CMLLink>();
    for (CMLLink link : orphanList) {
        boolean oneToOne = LinkTool.getLinkSetLength(link, Direction.TO) == 1
                && LinkTool.getLinkSetLength(link, Direction.FROM) == 1;
        if (oneToOne) {
            LOG.trace("de-orphanising");
            link.setTitle("de-"+link.getTitle());
            deOrphanList.add(link);
        }
    }
    // Removed after the loop so the list is not modified while iterating.
    orphanList.removeAll(deOrphanList);
    return !deOrphanList.isEmpty();
}
/**
 * Collects the non-orphan links whose to-set and from-set differ in length.
 * A missing title counts as "not orphan" and a missing id array counts as
 * length 0, so incomplete links are reported rather than causing a
 * NullPointerException.
 *
 * @param links links to scan
 * @return a new list of the unequal links, in scan order
 */
private List<CMLLink> makeUnequalToFromList(CMLElements<CMLLink> links) {
    List<CMLLink> unequalList = new ArrayList<CMLLink>();
    for (CMLLink link : links) {
        String linkTitle = link.getTitle();
        if (linkTitle != null && linkTitle.startsWith(ORPHAN)) {
            continue;
        }
        String[] toIds = link.getToSet();
        String[] fromIds = link.getFromSet();
        int toCount = (toIds == null) ? 0 : toIds.length;
        int fromCount = (fromIds == null) ? 0 : fromIds.length;
        if (toCount != fromCount) {
            unequalList.add(link);
        }
    }
    return unequalList;
}
/**
 * Collects the links whose title starts with ORPHAN. Links without a title
 * are not orphans.
 *
 * @param links links to scan
 * @return a new list of the orphan links, in scan order
 */
private List<CMLLink> makeOrphanList(CMLElements<CMLLink> links) {
    List<CMLLink> orphanList = new ArrayList<CMLLink>();
    for (CMLLink link : links) {
        String linkTitle = link.getTitle();
        if (linkTitle != null && linkTitle.startsWith(ORPHAN)) {
            orphanList.add(link);
        }
    }
    return orphanList;
}
/**
 * Adds "orphan" links for atoms that no existing link references, grouped
 * by chemical element.
 *
 * Elements with unlinked atoms in the first atom set get one link each,
 * holding the unlinked atoms of that element from both sides. Elements
 * that only have unlinked atoms in the second set then get a link with a
 * to-set only.
 */
void addMissingIds() {
    CMLElements<CMLLink> links = cmlMap.getLinkElements();
    Map<ChemicalElement, List<String>> idListByElementMap0 =
            getIdListByChemicalElement(links, Direction.FROM, atomTreeData0.atomSet);
    Map<ChemicalElement, List<String>> idListByElementMap1 =
            getIdListByChemicalElement(links, Direction.TO, atomTreeData1.atomSet);
    Set<ChemicalElement> elementSet0 = idListByElementMap0.keySet();
    LOG.trace("set0 "+elementSet0.size());
    Set<ChemicalElement> elementSet1 = idListByElementMap1.keySet();
    LOG.trace("set1 "+elementSet1.size());
    addOrphanLinks(idListByElementMap0, idListByElementMap1, elementSet0, Direction.FROM);
    // elementSet1 is a live keySet view: this also removes the entries from
    // idListByElementMap1. That is harmless because those elements were
    // handled by the call above and only the remaining keys are read below.
    elementSet1.removeAll(elementSet0);
    addOrphanLinks(idListByElementMap1, idListByElementMap0, elementSet1, Direction.TO);
}
private void addOrphanLinks(
Map> idListByElementMap0,
Map> idListByElementMap1,
Set elementSet0, Direction direction) {
for (ChemicalElement elem : elementSet0) {
List list0 = idListByElementMap0.get(elem);
List list1 = idListByElementMap1.get(elem);
CMLLink link = new CMLLink();
List fromList = (direction.equals(Direction.FROM)) ? list0 : list1;
List toList = (direction.equals(Direction.FROM)) ? list1 : list0;
if (fromList != null) {
link.setFromSet(fromList.toArray(new String[0]));
}
if (toList != null) {
link.setToSet(toList.toArray(new String[0]));
}
link.setTitle(ORPHAN+" "+elem.getSymbol());
cmlMap.addLink(link);
}
}
private Map> getIdListByChemicalElement(
CMLElements links, Direction direction, CMLAtomSet atomSet) {
Map> idListByElementMap = new HashMap>();
List orphanList = getAtomsWithoutLinks(links, direction, atomSet);
for (String id : orphanList) {
CMLAtom atom = atomSet.getAtomById(id);
ChemicalElement elem = atom.getChemicalElement();
List idList = idListByElementMap.get(elem);
if (idList == null) {
idList = new ArrayList();
idListByElementMap.put(elem, idList);
}
idList.add(id);
}
return idListByElementMap;
}
/**
 * Returns the ids of the atoms in {@code atomSet} that do not appear on
 * the given side of any link.
 *
 * @param links links to inspect
 * @param direction which side of each link (FROM or TO) to inspect
 * @param atomSet atoms to check
 * @return a new list of unreferenced atom ids, in atom-set order
 */
private List<String> getAtomsWithoutLinks(CMLElements<CMLLink> links, Direction direction, CMLAtomSet atomSet) {
    Map<String, List<CMLLink>> linkListMap = getLinkListById(links, direction);
    List<String> missing = new ArrayList<String>();
    for (CMLAtom atom : atomSet.getAtoms()) {
        String id = atom.getId();
        if (!linkListMap.containsKey(id)) {
            missing.add(id);
        }
    }
    return missing;
}
/**
 * Computes the ambiguous links on each side of the map.
 *
 * NOTE(review): the two results are computed and then discarded, so this
 * method currently changes nothing. It looks like an unfinished step;
 * confirm before relying on it.
 */
private void resolveAmbiguousLinks() {
    CMLElements<CMLLink> links = cmlMap.getLinkElements();
    Map<String, List<CMLLink>> fromLinks = getAmbiguousLinksById(links, Direction.FROM);
    Map<String, List<CMLLink>> toLinks = getAmbiguousLinksById(links, Direction.TO);
}
private Map> getAmbiguousLinksById(CMLElements links, Direction direction) {
Map> linkListMap = getLinkListById(links, direction);
List uniqueIds = getUniqueIdsAsInOneLinkAndEqualFromToSets(
direction, linkListMap);
for (String id : uniqueIds) {
linkListMap.remove(id);
}
// ToolUtils.debugMap("Ambig "+direction, linkListMap);
return linkListMap;
}
private Map> getLinkListById(
CMLElements links, Direction direction) {
Map> linkListMap = new HashMap>();
for (CMLLink link : links) {
String[] ids = (direction.equals(Direction.FROM)) ? link.getFromSet() : link.getToSet();
for (String id : ids) {
List linkList = linkListMap.get(id);
if (linkList == null) {
linkList = new ArrayList();
linkListMap.put(id, linkList);
}
linkList.add(link);
}
}
return linkListMap;
}
private List getUniqueIdsAsInOneLinkAndEqualFromToSets(
Direction direction, Map> linkListMap) {
List uniqueIds = new ArrayList();
for (String id : linkListMap.keySet()) {
List linkList = linkListMap.get(id);
if (linkList.size() == 1) {
CMLLink link = linkList.get(0);
String[] sourceIds = (direction.equals(Direction.FROM)) ? link.getFromSet() : link.getToSet();
String[] targetIds = (direction.equals(Direction.FROM)) ? link.getToSet() : link.getFromSet();
if (sourceIds.length == targetIds.length) {
uniqueIds.add(id);
}
}
}
return uniqueIds;
}
/**
 * Links atoms whose atom-tree string occurs in both atom sets with the
 * same, non-zero number of atoms, then removes those tree strings from
 * both AtomTreeData objects.
 *
 * Tree strings that are missing from the second set, or whose atom counts
 * differ between the sets, are left for later steps.
 *
 * @param title prefix for the titles of the links created
 * @return the tree strings for which a link was added
 */
private Set<String> mapUniqueAtomsByTreeString(String title) {
    Set<String> uniqueAtomTreeStringSet = new HashSet<String>();
    for (String atomTreeString : atomTreeData0.atomSetByAtomTreeString.keySet()) {
        CMLAtomSet atomSetx0 = atomTreeData0.atomSetByAtomTreeString.get(atomTreeString);
        CMLAtomSet atomSetx1 = atomTreeData1.atomSetByAtomTreeString.get(atomTreeString);
        if (atomSetx1 == null || atomSetx1.size() == 0) {
            continue;
        }
        if (atomSetx1.size() != atomSetx0.size()) {
            continue;
        }
        String elementType = getElementTypeFrom(atomSetx0, atomSetx1);
        CMLLink cmlLink = LinkTool.makeLink(title+" "+elementType, atomSetx0, atomSetx1);
        if (cmlLink != null) {
            cmlMap.addUniqueLink(cmlLink, CMLMap.Direction.NEITHER);
            uniqueAtomTreeStringSet.add(atomTreeString);
        }
    }
    // Done after the loop so the map being iterated is not modified mid-iteration.
    atomTreeData0.removeUniqueAtoms(uniqueAtomTreeStringSet);
    atomTreeData1.removeUniqueAtoms(uniqueAtomTreeStringSet);
    return uniqueAtomTreeStringSet;
}
/**
 * Determines the element type of the atoms in two atom sets by scanning
 * the first set and then the second.
 *
 * @param atomSetx0 first atom set
 * @param atomSetx1 second atom set
 * @return the first element type encountered, or null if no atom supplied one
 */
private String getElementTypeFrom(CMLAtomSet atomSetx0, CMLAtomSet atomSetx1) {
    String typeAfterFirstSet = getElementTypeFromAtomSet(atomSetx0, null);
    return getElementTypeFromAtomSet(atomSetx1, typeAfterFirstSet);
}
/**
 * Scans an atom set for its element type, starting from an already known
 * type if there is one. A disagreement is logged as an error but does not
 * change the result.
 *
 * @param atomSetx0 atoms to scan
 * @param elementType type established so far, or null if none yet
 * @return {@code elementType} if it was non-null, otherwise the first
 *         non-null element type found in the set (null if there is none)
 */
private String getElementTypeFromAtomSet(CMLAtomSet atomSetx0,
        String elementType) {
    String agreedType = elementType;
    for (CMLAtom atom : atomSetx0.getAtoms()) {
        String atomType = atom.getElementType();
        if (agreedType == null) {
            agreedType = atomType;
            continue;
        }
        if (!agreedType.equals(atomType)) {
            LOG.error("atomSets : "+agreedType+" != "+atomType);
        }
    }
    return agreedType;
}
/**
 * Builds a similarity matrix between the sorted atom-tree strings of the
 * two atom sets, adds a link for each pairing chosen by
 * IntMatrix.findLargestUniqueElementsInRowColumn, and then removes the
 * paired entries from both AtomTreeData objects.
 */
private void mapByUniqueLargestCommonAtomTreeStrings() {
// NOTE(review): this is called on atomTreeData0 only; there is no matching
// call on atomTreeData1 - confirm whether that is intentional.
atomTreeData0.makeSortedListAndAtomSetValues();
List sortedAtomTreeString0 = atomTreeData0.makeSortedList();
List sortedAtomTreeString1 = atomTreeData1.makeSortedList();
List sortedAtomSetValues0 = atomTreeData0.makeSortedAtomSetValues();
List sortedAtomSetValues1 = atomTreeData1.makeSortedAtomSetValues();
// LOG.debug("sortedAtomTreeString0 "+sortedAtomTreeString0.size());
// LOG.debug("sortedAtomTreeString1 "+sortedAtomTreeString1.size());
// LOG.debug("sortedAtomSetValues0 "+sortedAtomSetValues0.size());
// LOG.debug("sortedAtomSetValues1 "+sortedAtomSetValues1.size());
// the matrix is built from the tree strings ...
IntMatrix intMatrix = AtomTree.createSimilarityMatrix(sortedAtomTreeString0, sortedAtomTreeString1);
LOG.trace("IM "+intMatrix);
List largestIndexList = IntMatrix.findLargestUniqueElementsInRowColumn(intMatrix);
// ... but the links are created from the atom-set value lists, indexed by
// the same positions (presumably the two kinds of list are parallel - verify)
addMatrixElementsToMap(
largestIndexList, sortedAtomSetValues0, sortedAtomSetValues1);
atomTreeData0.removeUniqueElementsFromMap(0, largestIndexList);
atomTreeData1.removeUniqueElementsFromMap(1, largestIndexList);
}
/**
 * Alternates between recomputing the linked/unlinked atom sets on both
 * sides and expanding the map through ligand matching. Stops as soon as
 * an expansion adds nothing, or after SAFETY rounds.
 */
private void addUniqueLigandsToUniqueAtoms() {
    final int maxRounds = SAFETY;
    for (int round = 0; round < maxRounds; round++) {
        atomTreeData0.makeLinkedAndUnlinkedAtomSets(CMLMap.Direction.FROM, cmlMap);
        atomTreeData1.makeLinkedAndUnlinkedAtomSets(CMLMap.Direction.TO, cmlMap);
        boolean expanded = expandLigandsFromAtomSets();
        if (!expanded) {
            return;
        }
    }
}
/**
 * Uses the from-set to to-set correspondences already in the map to
 * translate the ligands of still-unmatched "from" atom sets into target
 * ids, then links unmatched sets whose ligands coincide.
 *
 * @return true if at least one new link was added
 */
private boolean expandLigandsFromAtomSets() {
    MapTool mapTool = MapTool.getOrCreateTool(cmlMap);
    Map<String, String> fromSetValue2ToSetValueMap = mapTool.getFromSetToSetMap(Direction.FROM);
    List<CMLAtomSet> fromSets = atomTreeData0.getSetsFromKeys(fromSetValue2ToSetValueMap);
    List<CMLAtomSet> ligandSetList0 = atomTreeData0.getNonUniqueLigandSetList();
    List<CMLAtomSet> ligandSetList1 = atomTreeData1.getNonUniqueLigandSetList();
    List<Set<String>> matchedTargetSetList = createSetOfTargetIdsForEachUnmatchedAtomSet(
            fromSetValue2ToSetValueMap, fromSets, ligandSetList0);
    return mapTargetLigandsOntoFromLigandsAndUpdateMap(
            ligandSetList1, matchedTargetSetList);
}
private boolean mapTargetLigandsOntoFromLigandsAndUpdateMap(
List ligandSetList1,
List> matchedTargetSetList) {
boolean change = false;
for (int i = 0; i < matchedTargetSetList.size(); i++) {
String[] matchedIds = matchedTargetSetList.get(i).toArray(new String[0]);
CMLAtomSet matchedSet = AtomSetTool.createAtomSet(atomTreeData1.atomSet, matchedIds);
for (int j = 0; j < ligandSetList1.size(); j++) {
CMLAtomSet ligandSet1 = ligandSetList1.get(j);
if (ligandSet1 != null) {
if (matchedSet.size() == ligandSet1.size()) {
if (ligandSet1.size() > 0 && ligandSet1.complement(matchedSet).size() == 0) {
String fromSetS = atomTreeData0.currentAtomSetValueList.get(i);
String toSetS = atomTreeData1.currentAtomSetValueList.get(j);
addFromSetToSetLink(fromSetS, toSetS, COMMON_LIGANDS);
atomTreeData0.removeAtoms(fromSetS);
atomTreeData1.removeAtoms(toSetS);
change = true;
break;
}
}
}
}
}
LOG.trace("....... change "+change);
return change;
}
private List> createSetOfTargetIdsForEachUnmatchedAtomSet(
Map fromSetValue2ToSetValueMap,
List fromSets, List ligandSetList0) {
List> matchedTargetSetList = new ArrayList>();
for (int i = 0, max = ligandSetList0.size(); i < max; i++) {
CMLAtomSet ligandSet0 = ligandSetList0.get(i);
if (ligandSet0 != null) {
List ligands0 = ligandSet0.getAtoms();
Set targetAtomSetValueSet = new HashSet();
String toSetValue = null;
for (CMLAtom ligand0 : ligands0) {
for (CMLAtomSet fromSet : fromSets) {
if (fromSet.contains(ligand0)) {
String fromSetValue = fromSet.getValue();
toSetValue = fromSetValue2ToSetValueMap.get(fromSetValue);
break;
}
}
if (toSetValue == null) {
LOG.error("**********Cannot find match for "+ligand0.getId());
targetAtomSetValueSet = null;
break;
} else {
// currently flatten atomSets
String[] ids = toSetValue.split(CMLConstants.S_WHITEREGEX);
for (String id : ids) {
targetAtomSetValueSet.add(id);
}
}
}
matchedTargetSetList.add(targetAtomSetValueSet);
}
}
return matchedTargetSetList;
}
/**
 * Adds one COMMON_ATOMTREE link per pairing in {@code largestIndexList}.
 *
 * The list position is used as the index into the second value list and
 * the stored value as the index into the first; negative values mean
 * "no pairing" and are skipped.
 *
 * @param largestIndexList pairing indices, one entry per position of the second list
 * @param sortedAtomTreeStringi values for the first atom set (link from-sets)
 * @param sortedAtomTreeStringj values for the second atom set (link to-sets)
 */
private void addMatrixElementsToMap(
        List<Integer> largestIndexList,
        List<String> sortedAtomTreeStringi,
        List<String> sortedAtomTreeStringj) {
    for (int jcol = 0; jcol < largestIndexList.size(); jcol++) {
        int irow = largestIndexList.get(jcol);
        if (irow < 0) {
            continue;
        }
        String si = sortedAtomTreeStringi.get(irow);
        String sj = sortedAtomTreeStringj.get(jcol);
        addFromSetToSetLink(si, sj, COMMON_ATOMTREE);
    }
}
/**
 * Creates a link between two whitespace-separated id lists, adds it to
 * {@code cmlMap} and titles it "(balanced|unbalanced) title elementType".
 *
 * The link is added to the map before the element type is checked.
 *
 * @param si ids for the from-set
 * @param sj ids for the to-set
 * @param title middle part of the link title
 * @return the link that was added
 * @throws RuntimeException if no element type can be determined for the atoms
 */
private CMLLink addFromSetToSetLink(String si, String sj, String title) {
    CMLLink newLink = new CMLLink();
    newLink.setFromSet(si);
    newLink.setToSet(sj);
    cmlMap.addLink(newLink);

    String[] fromIds = si.split(CMLConstants.S_WHITEREGEX);
    CMLAtomSet fromSet = AtomSetTool.createAtomSet(atomTreeData0.atomSet, fromIds);
    String[] toIds = sj.split(CMLConstants.S_WHITEREGEX);
    CMLAtomSet toSet = AtomSetTool.createAtomSet(atomTreeData1.atomSet, toIds);

    String balancedS;
    if (fromSet.getAtoms().size() == toSet.getAtoms().size()) {
        balancedS = BALANCED;
    } else {
        balancedS = UNBALANCED;
    }

    String elementType = getElementTypeFrom(fromSet, toSet);
    if (elementType == null) {
        fromSet.debug("FROM");
        toSet.debug("TO");
        throw new RuntimeException("inconsistent atom types");
    }
    newLink.setTitle(balancedS+" "+title+" "+elementType);
    return newLink;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy