All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.xmlcml.cml.tools.AtomTreeMatcher Maven / Gradle / Ivy

/**
 *    Copyright 2011 Peter Murray-Rust et. al.
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

package org.xmlcml.cml.tools;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.xmlcml.cml.base.CMLConstants;
import org.xmlcml.cml.base.CMLElements;
import org.xmlcml.cml.element.CMLAtom;
import org.xmlcml.cml.element.CMLAtomSet;
import org.xmlcml.cml.element.CMLLink;
import org.xmlcml.cml.element.CMLMap;
import org.xmlcml.cml.element.CMLMap.Direction;
import org.xmlcml.euclid.IntMatrix;
import org.xmlcml.molutil.ChemicalElement;

public class AtomTreeMatcher extends AtomMatcher {

	private static final Logger LOG = Logger.getLogger(AtomTreeMatcher.class);

	/** Title prefix of a link whose from-set and to-set hold the same number of atoms. */
	public static final String BALANCED = "balanced";
	/** Title prefix of a link whose from-set and to-set differ in size. */
	public static final String UNBALANCED = "unbalanced";
	/** Title prefix of the per-element links created for atoms that no link referenced. */
	public static final String ORPHAN = "orphan";
	/** Title fragment of links created by matching ligand sets. */
	public static final String COMMON_LIGANDS = "commonLigands";
	/** Title fragment of links created from the atom-tree similarity matrix. */
	public static final String COMMON_ATOMTREE = "commonAtomTree";
	/** Title prefix of links created between atom sets sharing an atom-tree string. */
	public static final String UNIQUE_TREE = "unique treeString";

	/** Upper bound on the number of passes of the iterative refinement loops. */
	private static final int SAFETY = 3;

	/** Atom-tree data built from the first ("from") atom set passed to match(). */
	private AtomTreeData atomTreeData0;
	/** Atom-tree data built from the second ("to") atom set passed to match(). */
	private AtomTreeData atomTreeData1;
	/** The map under construction; null until a mapping has been attempted. */
	private CMLMap cmlMap = null;

	/** For each "from" atom, the "to" atoms it is linked to (rebuilt by makeAtomAtomMaps()). */
	private Map<CMLAtom, CMLAtomSet> from2ToAtomAtomMap;

	/** For each "to" atom, the "from" atoms it is linked to (rebuilt by makeAtomAtomMaps()). */
	private Map<CMLAtom, CMLAtomSet> to2FromAtomAtomMap;
	
	/**
	 * Builds a map between the atoms of two atom sets using their atom-tree strings.
	 *
	 * @param atomSet0 atoms forming the "from" side of the map
	 * @param atomSet1 atoms forming the "to" side of the map
	 * @param title passed on to the mapping stage (which currently does not use it)
	 * @return the map, or null when neither atom set produced any atom-tree strings
	 */
	public CMLMap match(CMLAtomSet atomSet0, CMLAtomSet atomSet1, String title) {
		// Forget the result of any earlier call: otherwise a call that produces no
		// atom-tree strings would hand back the previous call's map.
		cmlMap = null;
		atomTreeData0 = new AtomTreeData(atomSet0);
		atomTreeData1 = new AtomTreeData(atomSet1);

		Map<?, ?> atomSetByAtomTreeString0 = atomTreeData0.createAtomSetByAtomTreeString(atomMatchObject);
		Map<?, ?> atomSetByAtomTreeString1 = atomTreeData1.createAtomSetByAtomTreeString(atomMatchObject);
		if (!atomSetByAtomTreeString0.isEmpty() || !atomSetByAtomTreeString1.isEmpty()) {
			mapSingleMolecules(title);
		}
		return cmlMap;
	}

	/**
	 * Creates a fresh map and populates it by running the matching stages in a
	 * fixed order; each stage works on the links left by the previous ones.
	 *
	 * @param title not used by this method
	 */
	private void mapSingleMolecules(String title) {
		cmlMap = makeMap();
		// the returned set of matched tree strings is not needed here
		mapUniqueAtomsByTreeString(UNIQUE_TREE);
		mapByUniqueLargestCommonAtomTreeStrings();
		addUniqueLigandsToUniqueAtoms();
		resolveAmbiguousLinks();
		// atoms still unlinked are gathered into per-element "orphan" links ...
		addMissingIds();
		// ... which the final stage then tries to break down into atom-atom links
		tidyOrphansAndMismatches();
	}

	/**
	 * Repeatedly tries to resolve orphan links and links whose from/to sets
	 * differ in size, until a pass changes nothing or SAFETY passes have run.
	 *
	 * The previous loop condition joined the pass counter and the change flag
	 * with "||", which always forced SAFETY passes and left the loop unbounded
	 * while changes kept occurring; the counter now genuinely limits the loop,
	 * as it does in the other SAFETY-guarded loops of this class.
	 */
	private void tidyOrphansAndMismatches() {
		boolean change = true;
		for (int tries = SAFETY; change && tries > 0; tries--) {
			// re-read the links each pass: earlier passes retitle links and add new ones
			CMLElements<CMLLink> links = cmlMap.getLinkElements();
			List<CMLLink> unequalLinkList = makeUnequalToFromList(links);
//			debugLinks("UNEQUAL", unequalLinkList);
			List<CMLLink> orphanList = makeOrphanList(links);
//			debugLinks("ORPHAN", orphanList);
			// non-short-circuit |= so that every step runs on every pass
			change = deOrphanizeSingleToFrom(orphanList);
			change |= tryToResolveConnectivity(orphanList);
			change |= tryToResolveConnectivity(unequalLinkList);
		}
	}

	/**
	 * Tries to split atom pairs out of each link in the list by comparing ligands.
	 * The atom-atom lookup maps are rebuilt once, before the list is processed.
	 *
	 * @param list links to work on (orphan links or links with unequal set sizes)
	 * @return true if at least one atom pair was split out of any link
	 */
	private boolean tryToResolveConnectivity(List<CMLLink> list) {
		boolean overallChange = false;
		makeAtomAtomMaps();
		for (CMLLink link : list) {
			// keep going on the same link while it yields pairs, at most SAFETY times
			int ii = SAFETY;
			boolean change = true;
			while (ii-- > 0 && change) {
				change = tryToResolveConnectivity(link);
				overallChange |= change;
			}
		}
		return overallChange;
	}

	/**
	 * Rebuilds the two atom-to-atoms lookup maps (from-to and to-from) from the
	 * current links of the map. Links whose title starts with ORPHAN or contains
	 * UNBALANCED are ignored.
	 */
	private void makeAtomAtomMaps() {
		from2ToAtomAtomMap = new HashMap<CMLAtom, CMLAtomSet>();
		to2FromAtomAtomMap = new HashMap<CMLAtom, CMLAtomSet>();
		CMLElements<CMLLink> links = cmlMap.getLinkElements();
		for (CMLLink link : links) {
			String linkTitle = link.getTitle();
			if (linkTitle.startsWith(ORPHAN) || linkTitle.contains(UNBALANCED)) {
				continue;
			}
			LinkTool linkTool = LinkTool.getOrCreateTool(link);
			List<CMLAtom> fromAtoms = linkTool.getSet(Direction.FROM, atomTreeData0.atomSet).getAtoms();
			List<CMLAtom> toAtoms = linkTool.getSet(Direction.TO, atomTreeData1.atomSet).getAtoms();
			// record the relationship in both directions
			addToAtomsToAtomSetIndexedByFrom(from2ToAtomAtomMap, fromAtoms, toAtoms);
			addToAtomsToAtomSetIndexedByFrom(to2FromAtomAtomMap, toAtoms, fromAtoms);
		}
//		debugAtomAtomMap("FROM", from2ToAtomAtomMap);
//		debugAtomAtomMap("TO", to2FromAtomAtomMap);
	}

	/**
	 * Debugging aid: prints the title, then one line per key atom giving its id
	 * and the size of the atom set mapped to it (0 when the value is null).
	 *
	 * @param title heading printed first
	 * @param atomSetX the atom-to-atom-set map to dump
	 */
	private void debugAtomAtomMap(String title, Map<CMLAtom, CMLAtomSet> atomSetX) {
		System.out.println(title);
		for (Map.Entry<CMLAtom, CMLAtomSet> entry : atomSetX.entrySet()) {
			CMLAtomSet atomSet = entry.getValue();
			System.out.println(entry.getKey().getId()+" "+((atomSet == null) ? 0 : atomSet.size()));
		}
	}

	/**
	 * Adds every atom of toAtoms to the atom set stored under each atom of
	 * fromAtoms, creating the set on first use.
	 *
	 * @param from2ToAtomAtomMap map to update (key atom to the atoms it maps to)
	 * @param fromAtoms atoms used as keys
	 * @param toAtoms atoms added to the set of every key
	 */
	private void addToAtomsToAtomSetIndexedByFrom(
			Map<CMLAtom, CMLAtomSet> from2ToAtomAtomMap,
			List<CMLAtom> fromAtoms, List<CMLAtom> toAtoms) {
		for (CMLAtom from : fromAtoms) {
			CMLAtomSet toAtomSet = from2ToAtomAtomMap.get(from);
			if (toAtomSet == null) {
				toAtomSet = new CMLAtomSet();
				from2ToAtomAtomMap.put(from, toAtomSet);
			}
			for (CMLAtom toAtom : toAtoms) {
				toAtomSet.addAtom(toAtom);
			}
		}
	}

	/**
	 * Looks for the first (from, to) atom pair of the link whose ligands match.
	 * When one is found, both atoms are removed from the link and a new
	 * single-pair link titled "de-orphan" is added to the map.
	 *
	 * @param orphanLink link whose from/to sets are searched and, on success, shrunk
	 * @return true if a pair was found and split out
	 */
	private boolean tryToResolveConnectivity(CMLLink orphanLink) {
		LinkTool orphanLinkTool = LinkTool.getOrCreateTool(orphanLink);
		CMLAtomSet fromAtomSet = orphanLinkTool.getSet(Direction.FROM, atomTreeData0.atomSet);
		List<CMLAtom> fromAtoms = fromAtomSet.getAtoms();
		CMLAtomSet toAtomSet = orphanLinkTool.getSet(Direction.TO, atomTreeData1.atomSet);
		List<CMLAtom> toAtoms = toAtomSet.getAtoms();
		LOG.trace("checkingFromTo");
		for (CMLAtom fromAtom : fromAtoms) {
			for (CMLAtom toAtom : toAtoms) {
				if (!doLigandsMatch(fromAtom, toAtom)) {
					continue;
				}
				LOG.trace("MATCH!!!!!!!!!!!!!!"+fromAtom.getId()+" .. "+toAtom.getId());
				// shrink the original link ...
				fromAtomSet.removeAtom(fromAtom);
				toAtomSet.removeAtom(toAtom);
				orphanLink.setFromSet(fromAtomSet.getAtomIDs());
				orphanLink.setToSet(toAtomSet.getAtomIDs());

				// ... and record the pair as a link of its own
				CMLLink link = new CMLLink();
				LinkTool linkTool = LinkTool.getOrCreateTool(link);
				linkTool.addSingleAtomsToSets(fromAtom, toAtom);
				cmlMap.addLink(link);
				link.setTitle("de-orphan");
				// return at once: the atom sets were just modified, so the
				// iteration over them must not continue
				return true;
			}
		}
		return false;
	}

	/**
	 * Tests whether two atoms have compatible neighbourhoods: the same number of
	 * ligands, and every ligand of fromAtom mapped (in from2ToAtomAtomMap) to
	 * at least one ligand of toAtom.
	 *
	 * @param fromAtom atom from the "from" side
	 * @param toAtom atom from the "to" side
	 * @return true if the ligands match; false when the ligand counts differ
	 *         or when fromAtom has no ligands at all
	 */
	private boolean doLigandsMatch(CMLAtom fromAtom, CMLAtom toAtom) {
		boolean match = false;
		List<CMLAtom> fromLigands = fromAtom.getLigandAtoms();
		List<CMLAtom> toLigands = toAtom.getLigandAtoms();
		if (fromLigands.size() == toLigands.size()) {
			for (CMLAtom fromLigand : fromLigands) {
				match = false;
				CMLAtomSet toAtomSet = from2ToAtomAtomMap.get(fromLigand);
				if (toAtomSet != null) {
					for (CMLAtom toLigand : toLigands) {
						if (toAtomSet.contains(toLigand)) {
							match = true;
							break;
						}
					}
				}
				// a single unmapped ligand is enough to reject the pair
				if (!match) break;
			}
		}
		return match;
	}

	/**
	 * Debugging aid: prints the title followed by the debug output of each link.
	 * Prints nothing for an empty list.
	 *
	 * @param title heading printed first
	 * @param linkList links to dump
	 */
	private void debugLinks(String title, List<CMLLink> linkList) {
		if (!linkList.isEmpty()) {
			System.out.println(title);
			for (CMLLink link : linkList) {
				link.debug();
			}
		}
	}

	/**
	 * Promotes orphan links holding exactly one "from" and one "to" atom: their
	 * title gets a "de-" prefix and they are removed from the list passed in.
	 *
	 * @param orphanList orphan links; modified in place
	 * @return true if at least one link was promoted
	 */
	private boolean deOrphanizeSingleToFrom(List<CMLLink> orphanList) {
		boolean change = false;
		List<CMLLink> deOrphanList = new ArrayList<CMLLink>();
		for (CMLLink link : orphanList) {
			if (LinkTool.getLinkSetLength(link, Direction.TO) == 1 &&
				LinkTool.getLinkSetLength(link, Direction.FROM) == 1) {
				LOG.trace("de-orphanising");
				change = true;
				link.setTitle("de-"+link.getTitle());
				deOrphanList.add(link);
			}
		}
		// removed after the loop so the list is not modified while being iterated
		orphanList.removeAll(deOrphanList);
		return change;
	}

	/**
	 * Collects the non-orphan links whose to-set and from-set differ in length.
	 *
	 * @param links links to examine
	 * @return new list of the links with unequal set lengths
	 */
	private List<CMLLink> makeUnequalToFromList(CMLElements<CMLLink> links) {
		List<CMLLink> unequalList = new ArrayList<CMLLink>();
		for (CMLLink link : links) {
			if (!link.getTitle().startsWith(ORPHAN) &&
				link.getToSet().length != link.getFromSet().length) {
				unequalList.add(link);
			}
		}
		return unequalList;
	}

	/**
	 * Collects the links whose title starts with ORPHAN.
	 *
	 * @param links links to examine
	 * @return new list of the orphan links
	 */
	private List<CMLLink> makeOrphanList(CMLElements<CMLLink> links) {
		List<CMLLink> orphanList = new ArrayList<CMLLink>();
		for (CMLLink link : links) {
			if (link.getTitle().startsWith(ORPHAN)) {
				orphanList.add(link);
			}
		}
		return orphanList;
	}

	/**
	 * Adds one orphan link per chemical element for the atoms that no existing
	 * link references, on either side of the map.
	 */
	void addMissingIds() {
		CMLElements<CMLLink> links = cmlMap.getLinkElements();
		Map<ChemicalElement, List<String>> idListByElementMap0 =
			getIdListByChemicalElement(links,  Direction.FROM,  atomTreeData0.atomSet);
		Map<ChemicalElement, List<String>> idListByElementMap1 =
			getIdListByChemicalElement(links,  Direction.TO,  atomTreeData1.atomSet);
		Set<ChemicalElement> elementSet0 = idListByElementMap0.keySet();
		LOG.trace("set0 "+elementSet0.size());
		Set<ChemicalElement> elementSet1 = idListByElementMap1.keySet();
		LOG.trace("set1 "+elementSet1.size());
		// elements with unlinked "from" atoms: link them to whatever unlinked
		// "to" atoms of the same element exist (possibly none)
		addOrphanLinks(idListByElementMap0, idListByElementMap1, elementSet0, Direction.FROM);
		// keySet() is a live view, so this also drops the already-handled
		// elements from idListByElementMap1; only elements found solely on
		// the "to" side remain
		elementSet1.removeAll(elementSet0);
		addOrphanLinks(idListByElementMap1, idListByElementMap0, elementSet1, Direction.TO);
	}

	private void addOrphanLinks(
		Map> idListByElementMap0,
		Map> idListByElementMap1,
		Set elementSet0, Direction direction) {
		for (ChemicalElement elem : elementSet0) {
			List list0 = idListByElementMap0.get(elem);
			List list1 = idListByElementMap1.get(elem);
			CMLLink link = new CMLLink();
			List fromList = (direction.equals(Direction.FROM)) ? list0 : list1;
			List toList = (direction.equals(Direction.FROM)) ? list1 : list0;
			if (fromList != null) {
				link.setFromSet(fromList.toArray(new String[0]));
			}
			if (toList != null) {
				link.setToSet(toList.toArray(new String[0]));
			}
			link.setTitle(ORPHAN+" "+elem.getSymbol());
			cmlMap.addLink(link);
		}
	}

	private Map> getIdListByChemicalElement(
			CMLElements links, Direction direction, CMLAtomSet atomSet) {
		Map> idListByElementMap = new HashMap>();
		List orphanList = getAtomsWithoutLinks(links, direction, atomSet);
		for (String id : orphanList) {
			CMLAtom atom = atomSet.getAtomById(id);
			ChemicalElement elem = atom.getChemicalElement();
			List idList = idListByElementMap.get(elem);
			if (idList == null) {
				idList = new ArrayList();
				idListByElementMap.put(elem, idList);
			}
			idList.add(id);
		}
		return idListByElementMap;
	}

	/**
	 * Lists the ids of the atoms in atomSet that appear in no link on the given side.
	 *
	 * @param links links to search
	 * @param direction which side of the links to inspect
	 * @param atomSet atoms to check
	 * @return new list of ids, in the order of atomSet's atoms
	 */
	private List<String> getAtomsWithoutLinks(
			CMLElements<CMLLink> links, Direction direction, CMLAtomSet atomSet) {
		Map<String, List<CMLLink>> linkListMap = getLinkListById(links, direction);
		List<String> missing = new ArrayList<String>();
		List<CMLAtom> atoms = atomSet.getAtoms();
		for (CMLAtom atom : atoms) {
			String id = atom.getId();
			if (!linkListMap.containsKey(id)) {
				missing.add(id);
			}
		}
		return missing;
	}

	/**
	 * Computes, for each side, the atom ids that are ambiguously linked.
	 *
	 * NOTE(review): the two results are not used afterwards and the map is not
	 * modified here, so this stage currently has no effect on the output; it
	 * looks like an unfinished step. The computation is kept as it was.
	 */
	private void resolveAmbiguousLinks() {
		CMLElements<CMLLink> links = cmlMap.getLinkElements();
		Map<String, List<CMLLink>> fromLinks = getAmbiguousLinksById(links, Direction.FROM);
		Map<String, List<CMLLink>> toLinks = getAmbiguousLinksById(links, Direction.TO);
	}

	private Map> getAmbiguousLinksById(CMLElements links, Direction direction) {
		Map> linkListMap = getLinkListById(links, direction);
		List uniqueIds = getUniqueIdsAsInOneLinkAndEqualFromToSets(
				direction, linkListMap);
		for (String id : uniqueIds) {
			linkListMap.remove(id);
		}
//		ToolUtils.debugMap("Ambig "+direction, linkListMap);
		return linkListMap;
	}

	private Map> getLinkListById(
			CMLElements links, Direction direction) {
		Map> linkListMap = new HashMap>();
		for (CMLLink link : links) {
			String[] ids = (direction.equals(Direction.FROM)) ? link.getFromSet() : link.getToSet();
			for (String id : ids) {
				List linkList = linkListMap.get(id);
				if (linkList == null) {
					linkList = new ArrayList();
					linkListMap.put(id, linkList);
				}
				linkList.add(link);
			}
		}
		return linkListMap;
	}

	private List getUniqueIdsAsInOneLinkAndEqualFromToSets(
			Direction direction, Map> linkListMap) {
		List uniqueIds = new ArrayList();
		for (String id : linkListMap.keySet()) {
			List linkList = linkListMap.get(id);
			if (linkList.size() == 1) {
				CMLLink link = linkList.get(0);
				String[] sourceIds = (direction.equals(Direction.FROM)) ? link.getFromSet() : link.getToSet();
				String[] targetIds = (direction.equals(Direction.FROM)) ? link.getToSet() : link.getFromSet();
				if (sourceIds.length == targetIds.length) {
					uniqueIds.add(id);
				}
			}
		}
		return uniqueIds;
	}

	/**
	 * Links the atom sets of the two sides that share an atom-tree string and
	 * contain the same (non-zero) number of atoms, then removes the matched
	 * tree strings from both sides' data.
	 *
	 * Tree strings absent from the second side, or whose atom sets differ in
	 * size, are left untouched for the later stages.
	 *
	 * @param title prefix of the title given to each created link
	 * @return the tree strings for which a link was added
	 */
	private Set<String> mapUniqueAtomsByTreeString(String title) {
		Set<String> uniqueAtomTreeStringSet = new HashSet<String>();
		for (String atomTreeString : atomTreeData0.atomSetByAtomTreeString.keySet()) {
			CMLAtomSet atomSetx0 = atomTreeData0.atomSetByAtomTreeString.get(atomTreeString);
			CMLAtomSet atomSetx1 = atomTreeData1.atomSetByAtomTreeString.get(atomTreeString);
			if (atomSetx1 == null || atomSetx1.size() == 0
					|| atomSetx1.size() != atomSetx0.size()) {
				continue;
			}
			String elementType = getElementTypeFrom(atomSetx0, atomSetx1);
			CMLLink cmlLink = LinkTool.makeLink(title+" "+elementType, atomSetx0, atomSetx1);
			if (cmlLink != null) {
				cmlMap.addUniqueLink(cmlLink, CMLMap.Direction.NEITHER);
				uniqueAtomTreeStringSet.add(atomTreeString);
			}
		}
		// done after the loop so the key set is not modified during iteration
		atomTreeData0.removeUniqueAtoms(uniqueAtomTreeStringSet);
		atomTreeData1.removeUniqueAtoms(uniqueAtomTreeStringSet);
		return uniqueAtomTreeStringSet;
	}

	/**
	 * Determines the element type shared by the atoms of two atom sets.
	 * Inconsistencies are logged by the per-set helper, not thrown.
	 *
	 * @param atomSetx0 first atom set, scanned first
	 * @param atomSetx1 second atom set, scanned against the first set's result
	 * @return the first element type encountered, or null if none was found
	 */
	private String getElementTypeFrom(CMLAtomSet atomSetx0, CMLAtomSet atomSetx1) {
		String typeAfterFirstSet = getElementTypeFromAtomSet(atomSetx0, null);
		return getElementTypeFromAtomSet(atomSetx1, typeAfterFirstSet);
	}

	/**
	 * Scans the atoms of a set, adopting the first element type seen when none
	 * is known yet and logging an error for every atom whose element type
	 * differs from the known one.
	 *
	 * @param atomSetx0 atoms to scan
	 * @param elementType element type known so far, or null
	 * @return the known element type after the scan (still null if none was found)
	 */
	private String getElementTypeFromAtomSet(CMLAtomSet atomSetx0,
			String elementType) {
		String known = elementType;
		for (CMLAtom atom : atomSetx0.getAtoms()) {
			String seen = atom.getElementType();
			if (known == null) {
				known = seen;
				continue;
			}
			if (!known.equals(seen)) {
				// reported but tolerated: the scan carries on with the known type
				LOG.error("atomSets : "+known+" != "+seen);
			}
		}
		return known;
	}

	/**
	 * Pairs the remaining atom-tree strings of the two sides through a
	 * similarity matrix, adds a link for each pair selected by
	 * IntMatrix.findLargestUniqueElementsInRowColumn, and removes the paired
	 * entries from both sides' data.
	 */
	private void mapByUniqueLargestCommonAtomTreeStrings() {
		// NOTE(review): called for atomTreeData0 only, with no counterpart for
		// atomTreeData1 - confirm whether this asymmetry is intended
		atomTreeData0.makeSortedListAndAtomSetValues();
		List<String> sortedAtomTreeString0 = atomTreeData0.makeSortedList();
		List<String> sortedAtomTreeString1 = atomTreeData1.makeSortedList();
		List<String> sortedAtomSetValues0 = atomTreeData0.makeSortedAtomSetValues();
		List<String> sortedAtomSetValues1 = atomTreeData1.makeSortedAtomSetValues();

//		LOG.debug("sortedAtomTreeString0 "+sortedAtomTreeString0.size());
//		LOG.debug("sortedAtomTreeString1 "+sortedAtomTreeString1.size());
//		LOG.debug("sortedAtomSetValues0 "+sortedAtomSetValues0.size());
//		LOG.debug("sortedAtomSetValues1 "+sortedAtomSetValues1.size());

		IntMatrix intMatrix = AtomTree.createSimilarityMatrix(sortedAtomTreeString0, sortedAtomTreeString1);
		LOG.trace("IM "+intMatrix);
		List<Integer> largestIndexList = IntMatrix.findLargestUniqueElementsInRowColumn(intMatrix);
		// the matrix is built from the tree strings, the links from the atom-set values
		addMatrixElementsToMap(
				largestIndexList, sortedAtomSetValues0, sortedAtomSetValues1);
		atomTreeData0.removeUniqueElementsFromMap(0, largestIndexList);
		atomTreeData1.removeUniqueElementsFromMap(1, largestIndexList);
	}

	/**
	 * Grows the map outwards from the atoms already linked: each pass recomputes
	 * the linked and unlinked atom sets of both sides and tries to link further
	 * ligand sets. Stops after the first pass that adds nothing, or after
	 * SAFETY passes.
	 */
	private void addUniqueLigandsToUniqueAtoms() {
		for (int passesLeft = SAFETY; passesLeft > 0; passesLeft--) {
			atomTreeData0.makeLinkedAndUnlinkedAtomSets(CMLMap.Direction.FROM, cmlMap);
			atomTreeData1.makeLinkedAndUnlinkedAtomSets(CMLMap.Direction.TO, cmlMap);
			boolean expanded = expandLigandsFromAtomSets();
			if (!expanded) {
				return;
			}
		}
	}

	/**
	 * Makes one attempt at linking not-yet-linked ligand sets of the two sides,
	 * using the from-set to to-set relationships already in the map.
	 *
	 * @return true if at least one link was added
	 */
	private boolean expandLigandsFromAtomSets() {
		MapTool mapTool = MapTool.getOrCreateTool(cmlMap);
		Map<String, String> fromSetValue2ToSetValueMap = mapTool.getFromSetToSetMap(Direction.FROM);
		List<CMLAtomSet> fromSets = atomTreeData0.getSetsFromKeys(fromSetValue2ToSetValueMap);
		List<CMLAtomSet> ligandSetList0 = atomTreeData0.getNonUniqueLigandSetList();
		List<CMLAtomSet> ligandSetList1 = atomTreeData1.getNonUniqueLigandSetList();
		// translate each "from" ligand set into the "to" ids its ligands map onto ...
		List<Set<String>> matchedTargetSetList = createSetOfTargetIdsForEachUnmatchedAtomSet(
				fromSetValue2ToSetValueMap, fromSets, ligandSetList0);
		// ... then look for a "to" ligand set made of exactly those ids
		return mapTargetLigandsOntoFromLigandsAndUpdateMap(
				ligandSetList1, matchedTargetSetList);
	}

	private boolean mapTargetLigandsOntoFromLigandsAndUpdateMap(
			List ligandSetList1,
			List> matchedTargetSetList) {
		boolean change = false;
		for (int i = 0; i < matchedTargetSetList.size(); i++) {
			String[] matchedIds = matchedTargetSetList.get(i).toArray(new String[0]);
			CMLAtomSet matchedSet = AtomSetTool.createAtomSet(atomTreeData1.atomSet, matchedIds);
			for (int j = 0; j < ligandSetList1.size(); j++) {
				CMLAtomSet ligandSet1 = ligandSetList1.get(j);
				if (ligandSet1 != null) {
					if (matchedSet.size() == ligandSet1.size()) {
						if (ligandSet1.size() > 0 && ligandSet1.complement(matchedSet).size() == 0) {
							String fromSetS = atomTreeData0.currentAtomSetValueList.get(i);
							String toSetS = atomTreeData1.currentAtomSetValueList.get(j);
							addFromSetToSetLink(fromSetS, toSetS, COMMON_LIGANDS);
							atomTreeData0.removeAtoms(fromSetS);
							atomTreeData1.removeAtoms(toSetS);
							change = true;
							break;
						}
					}
				}
			}
		}
		LOG.trace("....... change "+change);
		return change;
	}

	private List> createSetOfTargetIdsForEachUnmatchedAtomSet(
			Map fromSetValue2ToSetValueMap,
			List fromSets, List ligandSetList0) {
		List> matchedTargetSetList = new ArrayList>();
		for (int i = 0, max = ligandSetList0.size(); i < max; i++) {
			CMLAtomSet ligandSet0 = ligandSetList0.get(i);
			if (ligandSet0 != null) {
				List ligands0 = ligandSet0.getAtoms();
				Set targetAtomSetValueSet = new HashSet();
				String toSetValue = null;
				for (CMLAtom ligand0 : ligands0) {
					for (CMLAtomSet fromSet : fromSets) {
						if (fromSet.contains(ligand0)) {
							String fromSetValue = fromSet.getValue();
							toSetValue = fromSetValue2ToSetValueMap.get(fromSetValue);
							break;
						}
					}
					if (toSetValue == null) {
						LOG.error("**********Cannot find match for "+ligand0.getId());
						targetAtomSetValueSet = null;
						break;
					} else {
						// currently flatten atomSets
						String[] ids = toSetValue.split(CMLConstants.S_WHITEREGEX);
						for (String id : ids) {
							targetAtomSetValueSet.add(id);
						}
					}
				}
				matchedTargetSetList.add(targetAtomSetValueSet);
			}
		}
		return matchedTargetSetList;
	}

	/**
	 * Adds a COMMON_ATOMTREE link for every column of the similarity matrix
	 * that was assigned a row.
	 *
	 * @param largestIndexList for each column index, the selected row index;
	 *        negative values are skipped
	 * @param sortedAtomTreeStringi strings of the "from" side, indexed by row
	 * @param sortedAtomTreeStringj strings of the "to" side, indexed by column
	 */
	private void addMatrixElementsToMap(
			List<Integer> largestIndexList,
			List<String> sortedAtomTreeStringi,
			List<String> sortedAtomTreeStringj) {
		for (int jcol = 0; jcol < largestIndexList.size(); jcol++) {
			int irow = largestIndexList.get(jcol);
			if (irow > -1) {
				String si = sortedAtomTreeStringi.get(irow);
				String sj = sortedAtomTreeStringj.get(jcol);
				addFromSetToSetLink(si, sj, COMMON_ATOMTREE);
			}
		}
//		cmlMap.debug("LINK");
	}

	/**
	 * Creates a link between two whitespace-separated lists of atom ids, adds it
	 * to the map and titles it "&lt;balanced|unbalanced&gt; &lt;title&gt; &lt;elementType&gt;".
	 *
	 * @param si ids of the "from" atoms
	 * @param sj ids of the "to" atoms
	 * @param title middle part of the link title
	 * @return the link that was added
	 * @throws RuntimeException if no element type can be determined for the
	 *         atoms; the link has already been added to the map at that point
	 */
	private CMLLink addFromSetToSetLink(String si, String sj, String title) {
		CMLLink newLink = new CMLLink();
		newLink.setFromSet(si);
		newLink.setToSet(sj);
		cmlMap.addLink(newLink);

		// resolve the ids into atoms to inspect sizes and element types
		CMLAtomSet fromSet = AtomSetTool.createAtomSet(atomTreeData0.atomSet, si.split(CMLConstants.S_WHITEREGEX));
		CMLAtomSet toSet = AtomSetTool.createAtomSet(atomTreeData1.atomSet, sj.split(CMLConstants.S_WHITEREGEX));
		String balancedS;
		if (fromSet.getAtoms().size() == toSet.getAtoms().size()) {
			balancedS = BALANCED;
		} else {
			balancedS = UNBALANCED;
		}
		String elementType = getElementTypeFrom(fromSet, toSet);
		if (elementType != null) {
			newLink.setTitle(balancedS+" "+title+" "+elementType);
			return newLink;
		}
		fromSet.debug("FROM");
		toSet.debug("TO");
		throw new RuntimeException("inconsistent atom types");
	}


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy