
org.bitbucket.kienerj.indigoutils.SubstructureReplacer Maven / Gradle / Ivy
Show all versions of indigo-utils Show documentation
/*
* Copyright (C) 2013 Joos Kiener
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
package org.bitbucket.kienerj.indigoutils;
import com.ggasoftware.indigo.Indigo;
import com.ggasoftware.indigo.IndigoException;
import com.ggasoftware.indigo.IndigoObject;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.bitbucket.kienerj.indigoutils.exception.ReplaceAllDepthExceedException;
import org.slf4j.ext.XLogger;
import org.slf4j.ext.XLoggerFactory;
/**
* Replaces a given substructure (can be a SMARTS pattern) with a
* replacement. The replacement must be a fully defined molecule. The
* replacement is correctly inserted and connected the target molecule. Only
* valid molecules are returned.
*
* All possible replacements are returned once. There is no specific mapping
* of the pattern to the replacement. This means that when replacing CO with CN,
* O may map to either C or N of the replacement.
If the pattern matches
* multiple times, for each map a separate replacement is returned. This means a
* returned molecule can still contain the pattern.
The returned molecules
* are ordered naturally by their canonical smiles.
*
* Examples:
*
* The following examples substitute SMILES/SMARTS strings for molecules. You'd
* have to actually use (query) molecules (IndigoObject). It's recommended to
* look at the transformations in a real structure viewer (paste below SMILES)
* or on paper to understand how the replacement works. Else replaceResults might be
* unexpected.
*
* Simple Replacement:
* replaceSubstructure("CCOC", "CO", "CN")
* -> ("CCCN", "CCNC")
*
* Replacement that has more atoms than the pattern
* replaceSubstructure("CC(=C)OC", "CO", "CCN")
* -> ("CC(CCN)=C", "CC(NCC)=C", "CC(=C)CNC")
* Note that the 4th possible solution, "CN(=C)CCC", is generated but not returned
* due to invalid valence of the N atoms.
*
* Replacement in a "double-ring":
* replaceSubstructure("CCC1CCC2=CC=CCN2C1", "CN", "CNC")
* -> ("CCC1CC2CC=CC=C(CC1)N2", "CCC1CC2C(=CC=CCN2)CC1", "CCC1CNC2CC=CC=C2CC1")
* here one or both of the rins get bigger by 1 atom.
*
*
*
*
* @author Joos Kiener
*/
public class SubstructureReplacer {
private static final int REPLACE_ALL_DEPTH_THRESHOLD = 5;
private static final String THRESHOLD_EXCEEDED_MESSAGE = "Invalid replace-all pattern."
+ "Could not replace all occurences after %s iterations.";
private static final XLogger logger = XLoggerFactory.getXLogger("SubstructureReplacer");
private final Indigo indigo;
public SubstructureReplacer(Indigo indigo) {
this.indigo = indigo;
}
/**
* Replaces all occurrences of pattern in given molecule.
*
* Returns a full list of all possible complete replacements. See
* {@link #replaceSubstructure(IndigoObject,IndigoObject,IndigoObject) replaceSubstructure(mol, pattern, replacement)}
* for general rules how the replacing is done.
*
* To avoid an endless loop (which would happen if you replaced "C" with
* "CC"), the replacement is stopped if there are more than 5 replacements
* to be made in a single molecule and a
* ReplaceAllDepthExceedException
is thrown.
*
*
*
*
* Example:
*
* replaceAll("COCO", "CO", "CN")
* -> ("CCNN", "CCNO", "C(NC)N", "CNNC", "C(CN)N")
*
*
*
*
* @param mol the target molecule
* @param pattern the substructure to replace
* @param replacement the replacement substructure
* @return a List of molecules with all occurrences of pattern replaced with
* replacement
* @throws ReplaceAllDepthExceedException if no result is found after 5
* replacements
*/
public List replaceAll(IndigoObject mol,
IndigoObject pattern, IndigoObject replacement) {
logger.entry(mol, pattern, replacement);
TreeMap molecules =
perfromReplaceAll(mol, pattern, replacement, REPLACE_ALL_DEPTH_THRESHOLD, 0);
ArrayList results = new ArrayList<>(molecules.values());
logger.exit(results);
return results;
}
/**
*
* Replaces all occurrences of pattern in given molecule.
*
* See
* {@link #replaceAll(IndigoObject,IndigoObject,IndigoObject) replaceAll(mol, pattern, replacement)}.
*
*
* maxDepth> lets you control the number of allowed replacements until
* an Exception is thrown and the replacement terminated. Default value is
* 5.
*
* @param mol the target molecule
* @param pattern the substructure to replace
* @param replacement the replacement substructure
* @param maxDepth maximum number of replacements to be made
* @return a List of molecules with all occurrences of pattern replaced with
* replacement
* @throws ReplaceAllDepthExceedException if no result is found after 5
* replacements
*/
public List replaceAll(IndigoObject mol,
IndigoObject pattern, IndigoObject replacement, int maxDepth) {
logger.entry(mol, pattern, replacement, maxDepth);
TreeMap molecules =
perfromReplaceAll(mol, pattern, replacement, maxDepth, 0);
ArrayList results = new ArrayList<>(molecules.values());
logger.exit(results);
return results;
}
/**
* Replaces occurrences of pattern once in the given molecule. All
* possible replacements are returned as a List of molecules.
*
* There is no specific mapping of the pattern to the replacement.
*
*
* Example:
*
* replaceSubstructure("CCOC", "CO", "CN")
* -> ("CCCN", "CCNC")
*
*
*
* A result molecule can still contain the pattern:
*
*
* Example:
*
* replaceSubstructure("COCO", "CO", "CN")
* -> ("CNCO", "NCCO", ...)
*
*
*
* See
* {@link org.bitbucket.kienerj.indigoutils.SubstructureReplacer main JavaDoc for this class}
* for further information.
*
*
* @param mol the target molecule
* @param pattern the substructure to replace
* @param replacement the replacement substructure
* @return a List of molecules with exactly once occurrence of pattern
* replaced
*/
public List replaceSubstructure(IndigoObject mol,
IndigoObject pattern, IndigoObject replacement) {
logger.entry(mol, pattern, replacement);
TreeMap molecules =
perfromReplacement(mol, pattern, replacement);
ArrayList results = new ArrayList<>(molecules.values());
logger.exit(results);
return results;
}
/**
* Replaces all occurrences of pattern in mol with replacement. If there are
* too many replacements (recursion depth too big) an exception is thrown.
* The depth can be adjusted by the caller.
*
*/
private TreeMap perfromReplaceAll(IndigoObject mol,
IndigoObject pattern, IndigoObject replacement, int depthThreshold,
int depth) {
logger.entry(mol, pattern, replacement, depthThreshold, depth);
/*
* Recursion:
*
* Replaces pattern in mol with perfromReplacement()-emthod and then
* calls this method (eg. recursion) for all returned molecules.
*
* The recursion is terminated and an exception is thrown if the
* depth of the recursion is higher than the given maximum depth.
*/
if (depth > depthThreshold) {
throw new ReplaceAllDepthExceedException(String.format(
THRESHOLD_EXCEEDED_MESSAGE, depthThreshold));
}
TreeMap molecules = new TreeMap<>();
IndigoObject matcher = indigo.substructureMatcher(mol);
String canSmiles = mol.canonicalSmiles();
if (matcher.match(pattern) == null) {
molecules.put(canSmiles, mol);
logger.debug("Added molecule to results: {}", canSmiles);
} else {
logger.debug("Replace All Target SMILES: {}", canSmiles);
TreeMap replaceResults = perfromReplacement(
mol, pattern, replacement);
for (IndigoObject result : replaceResults.values()) {
// recursion here
TreeMap replaceAllResults =
perfromReplaceAll(result, pattern, replacement,
depthThreshold, depth + 1);
molecules.putAll(replaceAllResults);
}
}
logger.exit(molecules);
return molecules;
}
/**
* Replaces 1 occurrence of pattern in mol
*
*/
private TreeMap perfromReplacement(IndigoObject mol,
IndigoObject pattern, IndigoObject replacement) {
logger.entry(mol, pattern, replacement);
mol = mol.clone(); // resets atom indices -> required!!!
TreeMap molecules = new TreeMap<>();
IndigoObject matcher = indigo.substructureMatcher(mol);
IndigoObject matches = matcher.iterateMatches(pattern);
if (matches != null) {
for (IndigoObject match : matches) {
logger.debug("Found pattern in {}. Generating all possible "
+ "replacements...", mol.canonicalSmiles());
HashSet atomIndices = new HashSet<>();
int minIndex = Integer.MAX_VALUE;
int maxIndex = 0;
for (IndigoObject queryAtom : pattern.iterateAtoms()) {
IndigoObject atom = match.mapAtom(queryAtom);
if (atom != null) {
int atomIndex = atom.index();
atomIndices.add(atomIndex);
if (atomIndex < minIndex) {
minIndex = atomIndex;
}
if (atomIndex > maxIndex) {
maxIndex = atomIndex;
}
}
}
HashMap startNeighbors = new HashMap<>();
IndigoObject startAtom = mol.getAtom(minIndex);
for (IndigoObject neighbor : startAtom.iterateNeighbors()) {
int atomIndex = neighbor.index();
// neighbors that will be removed anyway don't count here
if (!atomIndices.contains(atomIndex)) {
startNeighbors.put(atomIndex, neighbor.bond().bondOrder());
}
}
HashMap endNeighbors = new HashMap<>();
IndigoObject endAtom = mol.getAtom(maxIndex);
for (IndigoObject neighbor : endAtom.iterateNeighbors()) {
int atomIndex = neighbor.index();
// neighbors that will be removed anyway don't count here
if (!atomIndices.contains(atomIndex)) {
endNeighbors.put(neighbor.index(), neighbor.bond().bondOrder());
}
}
logger.debug("\tGenerating molecule for lowest with lowest mapping");
IndigoObject lowestWithLowest = mol.clone();
IndigoObject mapping = lowestWithLowest.merge(replacement);
createBonds(lowestWithLowest, replacement, mapping, 0, startNeighbors);
/*
* If replacement is a single atom, no further bonds need to be
* created.
* single atom -> explicit hydrogens don't count hence remove
* them from total atom count.
*
* if min and max index are equal and replacementAtomCount is
* greater than 1, we are replacing a single atom at the "end" of
* a molecule with multiple atoms. Therefore only 1 bond must be
* formed and that already happend above.
* Example: in CC replace C with CC -> C1CC1 but CCC is what we
* want.
*/
int replacementAtomCount = replacement.countAtoms()
- (replacement.countHydrogens() - replacement.countImplicitHydrogens());
if (replacementAtomCount > 1
&& minIndex != maxIndex) {
createBonds(lowestWithLowest, replacement, mapping,
replacement.countAtoms() - 1, endNeighbors);
}
lowestWithLowest.removeAtoms(atomIndices);
// map canonical smiles to molecule object
// so that each result is unique
// note creating canonical smiles can cause an exception
// for invalid molecules (but not always!)
try {
String canSmiles = lowestWithLowest.canonicalSmiles();
molecules.put(canSmiles, lowestWithLowest);
logger.debug("\tLowest with Lowest molecule generated: {}", canSmiles);
} catch (IndigoException ex) {
if (isValidMolecule(lowestWithLowest)) {
throw ex;
}
// else do nothing as invalid molecules are expected and
// we do not want to return them
}
// If replacement is a single atom. there is only 1 possible
// result per substructure match. This result has already
// been created above!
if (replacementAtomCount > 1) {
logger.debug("\tGenerating molecule for lowest with highest mapping");
IndigoObject lowestWithHighest = mol.clone();
IndigoObject mapping2 = lowestWithHighest.merge(replacement);
createBonds(lowestWithHighest, replacement, mapping2, replacement.countAtoms() - 1, startNeighbors);
/*
* if min and max index are equal and replacementAtomCount is
* greater than 1, we are replacing a single atom at the "end" of
* a molecule with multiple atoms. Therefore only 1 bond must be
* formed and that already happend above.
* Example: in CC replace C with CC -> C1CC1 but CCC is what we
* want.
*/
if (minIndex != maxIndex) {
createBonds(lowestWithHighest, replacement, mapping2, 0, endNeighbors);
}
lowestWithHighest.removeAtoms(atomIndices);
// map canonical smiles to molecule object
// so that each result is unique
// note creating canonical smiles can cause an exception
// for invalid molecules (but not always!)
try {
String canSmiles = lowestWithHighest.canonicalSmiles();
molecules.put(canSmiles, lowestWithHighest);
logger.debug("\tLowest with Highest molecule generated: {}", canSmiles);
} catch (IndigoException ex) {
if (isValidMolecule(lowestWithHighest)) {
throw ex;
}
// else do nothing as invalid molecules are expected and
// we do not want to return them
}
}
}
}
removeInvalidResults(molecules);
logger.exit(molecules);
return molecules;
}
/**
* Creates bonds of given order between the given neighbors to the mapped
* atom of the replacement
*/
private void createBonds(IndigoObject mol,
IndigoObject replacement, IndigoObject mapping,
int replacementAtomIndex, HashMap neighbors) {
logger.entry(mol, replacement, mapping, replacementAtomIndex, neighbors);
for (Map.Entry neighbor : neighbors.entrySet()) {
int atomIndex = neighbor.getKey();
int bondOrder = neighbor.getValue();
IndigoObject replacementAtom = mapping.mapAtom(replacement.getAtom(replacementAtomIndex));
replacementAtom.addBond(mol.getAtom(atomIndex), bondOrder);
}
logger.exit();
}
/**
* Removes Molecules with bad valence or ambiguous hydrogens
*
* @param molecules
*/
private void removeInvalidResults(Map molecules) {
logger.entry(molecules);
Iterator> iterator = molecules.entrySet().iterator();
while (iterator.hasNext()) {
IndigoObject mol = iterator.next().getValue();
if (!isValidMolecule(mol)) {
iterator.remove();
}
}
logger.exit();
}
private boolean isValidMolecule(IndigoObject mol) {
logger.entry(mol);
String valence = mol.checkBadValence();
String ambiguousH = mol.checkAmbiguousH();
if (valence.isEmpty() && ambiguousH.isEmpty()) {
logger.exit(true);
return true;
} else {
logger.exit(false);
return false;
}
}
}