com.actelion.research.chem.sar.CoreBasedSARAnalyzer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of openchemlib Show documentation
Show all versions of openchemlib Show documentation
Open Source Chemistry Library
package com.actelion.research.chem.sar;
import com.actelion.research.chem.*;
import com.actelion.research.chem.coords.CoordinateInventor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.TreeMap;
public class CoreBasedSARAnalyzer {
// NOTE(review): not referenced in the visible part of this file; presumably controls whether
// stereo centers are distinguished during the analysis - confirm against its users.
public static final boolean DISTINGUISH_STEREO_CENTERS = true;
// Upper limit of the R-group count per scaffold; scaffolds exceeding it get their molecules cleared.
// Also the length of the R-group lookup arrays (mPreferredQueryAtomRGroupMatch, rGroupToQueryAtom).
private static final int MAX_R_GROUPS = 16;
// mQuery: the query substructure handed to the constructor (the same object, not a copy).
// mFragment: scratch molecule created in the constructor, used as molecule buffer.
private StereoMolecule mQuery,mFragment;
// Plain substructure searcher; created on first use by setMolecule(StereoMolecule,int).
private SSSearcher mSearcher;
// Fingerprint-aware substructure searcher; created on first use by the setMolecule() variants that take an ffp.
private SSSearcherWithIndex mSearcherWithIndex;
private SARMolecule[] mSARMolecule; // contains info of analyzed molecules, e.g. substituents and corresponding SARScaffold
// NOTE(review): raw type; the type arguments were probably lost in this copy of the file - confirm key/value types.
private TreeMap mScaffoldMap; // map of core structure idcodes to corresponding SARScaffold
// Scaffold group created from the query in the constructor.
private SARScaffoldGroup mScaffoldGroup;
// MAX_R_GROUPS entries, all initialised to -1 in the constructor.
// NOTE(review): presumably maps R-group number to the preferred query atom, -1 meaning "not assigned" - confirm.
private int[] mPreferredQueryAtomRGroupMatch;
/**
* This class runs a complete structure-activity-relationship (SAR) analysis from molecules that share
* one or multiple common similar scaffold(s). For one given query substructure this class analyses many
* molecules, whether the query substructure is found and which substituents are connected at which positions.
* If the query substructure is found in (matches) multiple molecules, then the matching substructures
* may be the same in all cases, or they may differ between some molecules. This happens, for instance,
* if the query contains wildcard atoms that match to multiple atom types, or if a bridge bond
* matches to atom chains of different lengths.
* The matching substructure of a molecule is called core structure. All molecules that share the same
* core structure are analyzed regarding which core structure atom carries which substituents within
* these molecules. If the substitution is varying within these molecules for a core structure atom, then
* an R-group is assigned to that position. An atom may get multiple R-groups if attachment positions are
* diastereotopic or if it sees multiple substituents in some of its molecules. The core structure with numbered
* attached R-groups at all exit vectors with changing substitution is called a scaffold.
* Thus, a specific core structure always gives rise to a specific scaffold structure.
* Thus, all N scaffolds derived from all N core structures caused by one query structure
* form a scaffold group. R-group numbering between all scaffolds within the same scaffold group
* is compatible. This means that R-groups at equivalent exit vectors of two different scaffolds within
* the same group have the same number. Two exit vectors are considered equivalent, if they are connected
* to atoms, which match to the same query structure atom, and if they are diastereotopic, e.g. both connected
* with an up-stereo bond when superimposing their atom coordinates.
* For any given molecule a scaffold structure is constructed the following way:
* - A substructure search locates all matches of the query structure and selects a preferred match based
* on the substitution pattern. If no match is found, then this molecule is skipped from the analysis.
* - All matching atoms and bonds are taken as a 'core' structure. If the query contains bridge bonds, then
* those molecule atoms that match on the bridge bonds also belong to the core structure.
* - When the core structure is determined for a molecule, then all remaining atoms of the molecule that
* don't belong to the core structure, are part of substituents.
* - For every exit vector of the core structure, i.e. a bond that connects to a substituent atom, the
* substituent structure is determined. A substituent atom may connect back to another exit vector of
* the core structure, causing a ring closure.
* - After the analysis of all molecules that share the same core structure, a copy of the
* core structure is created. Then, for all exit vectors that have at least two different substituents
* throughout all molecules (e.g. -H and -Me), an R-group is attached to the core structure.
* If all molecules have the same substituent at one core structure position, then no R-group is attached.
* Instead, that substituent itself is attached to the core structure.
* - The core structure with attached R-groups and attached constant substituents constitutes the scaffold
* structure for a particular molecule.
* - All molecules that match to the same query structure don't necessarily share the same scaffold structure,
* e.g. if a query bridge bond matches on a different chain length or if a wildcard atom matches on a
* different atom type, then the constructed scaffold structure differs in these aspects. Also, the
* number of attached R-groups may be different for different scaffolds, if all molecules belonging
* to one scaffold have no substituent at a position that is substituted on a related scaffold's molecules.
* However, it is assured, that R-group numbering is always the same among the entire scaffold group.
* Summary: If molecules match to the same query structure, they belong to the same scaffold group,
* but nonetheless, their assigned scaffold structures may differ concerning atom types, ring sizes, and
* count of attached R-groups. The R-group numbering (R1, R2, ...), however, is compatible, i.e. R-groups
* at equivalent positions have the same number.
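* @param query substructure with valid atom coordinates that defines one or multiple scaffolds (e.g. via atom lists or bond bridges)
* @param moleculeCount number of molecules to be analyzed; determines the size of the internal SAR molecule array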
*/
public CoreBasedSARAnalyzer(StereoMolecule query, int moleculeCount) {
mQuery = query;
mQuery.ensureHelperArrays(Molecule.cHelperNeighbours);
mSARMolecule = new SARMolecule[moleculeCount];
mScaffoldMap = new TreeMap<>();
mScaffoldGroup = new SARScaffoldGroup(query);
mFragment = new StereoMolecule(); // used as molecule buffer
mPreferredQueryAtomRGroupMatch = new int[MAX_R_GROUPS];
Arrays.fill(mPreferredQueryAtomRGroupMatch, -1);
}
/**
 * Hands one molecule to the SAR-analyzer, using a plain substructure search:
 * - searches the molecule for the query with SSSearcher.cCountModeRigorous and the default match mode
 * - if the query matches at least once, passes molecule, searcher and index on to the internal
 *   setMolecule(StereoMolecule, SSSearcher, int), which works on the preferred query match
 * - if the query does not match, nothing else is done with the molecule
 * Use this version of setMolecule() if you don't have pre-calculated fragment fingerprints
 * for your molecules available.
 * @param mol molecule to be searched for the query substructure
 * @param index passed on unchanged to the internal setMolecule(); presumably the position of this molecule among the analyzed molecules (to be confirmed)
 * @return number of query matches found in the molecule; 0 if the query was not found
 */
public int setMolecule(StereoMolecule mol, int index) {
	// The searcher is created on first use and keeps the query as its fragment for all later calls.
	if (mSearcher == null) {
		mSearcher = new SSSearcher();
		mSearcher.setFragment(mQuery);
	}

	mSearcher.setMolecule(mol);
	final int foundMatches = mSearcher.findFragmentInMolecule(SSSearcher.cCountModeRigorous, SSSearcher.cDefaultMatchMode);

	// Only molecules that contain the query take part in the analysis.
	if (foundMatches != 0) {
		setMolecule(mol, mSearcher, index);
	}

	return foundMatches;
}
/**
 * Hands one molecule to the SAR-analyzer, using a fingerprint-aware substructure search:
 * - searches the molecule for the query with SSSearcher.cCountModeRigorous and the default match mode
 * - if the query matches at least once, passes the molecule, the searcher's graph matcher and index
 *   on to the internal setMolecule(StereoMolecule, SSSearcher, int), which works on the preferred query match
 * - if the query does not match, nothing else is done with the molecule
 * Use this version of setMolecule() if you have in memory molecules and pre-calculated fragment fingerprints.
 * @param mol molecule to be searched for the query substructure
 * @param ffp pre-calculated fragment fingerprint of mol
 * @param index passed on unchanged to the internal setMolecule(); presumably the position of this molecule among the analyzed molecules (to be confirmed)
 * @return number of query matches found in the molecule; 0 if the query was not found
 */
public int setMolecule(StereoMolecule mol, long[] ffp, int index) {
	// Created on first use; the query is set without a fingerprint (null), the cast picks the long[] overload.
	if (mSearcherWithIndex == null) {
		mSearcherWithIndex = new SSSearcherWithIndex();
		mSearcherWithIndex.setFragment(mQuery, (long[])null);
	}

	mSearcherWithIndex.setMolecule(mol, ffp);
	final int foundMatches = mSearcherWithIndex.findFragmentInMolecule(SSSearcher.cCountModeRigorous, SSSearcher.cDefaultMatchMode);

	// Only molecules that contain the query take part in the analysis.
	if (foundMatches != 0) {
		setMolecule(mol, mSearcherWithIndex.getGraphMatcher(), index);
	}

	return foundMatches;
}
/**
 * Hands one molecule, given as idcode and encoded coordinates, to the SAR-analyzer, using a
 * fingerprint-aware substructure search:
 * - searches the idcode for the query with SSSearcher.cCountModeRigorous and the default match mode
 * - if the query matches at least once, creates the molecule from idcode and coords and passes it,
 *   the searcher's graph matcher and index on to the internal setMolecule(StereoMolecule, SSSearcher, int),
 *   which works on the preferred query match
 * - if the query does not match, no molecule is created and nothing else is done
 * Use this version of setMolecule() if you have idcodes, coords, and pre-calculated fragment fingerprints
 * of your molecules.
 * @param idcode idcode of the molecule to be searched for the query substructure
 * @param coords encoded atom coordinates belonging to idcode
 * @param ffp pre-calculated fragment fingerprint of the molecule
 * @param index passed on unchanged to the internal setMolecule(); presumably the position of this molecule among the analyzed molecules (to be confirmed)
 * @return number of query matches found in the molecule; 0 if the query was not found
 */
public int setMolecule(byte[] idcode, byte[] coords, long[] ffp, int index) {
	// Created on first use; the query is set without a fingerprint (null), the cast picks the long[] overload.
	if (mSearcherWithIndex == null) {
		mSearcherWithIndex = new SSSearcherWithIndex();
		mSearcherWithIndex.setFragment(mQuery, (long[])null);
	}

	mSearcherWithIndex.setMolecule(idcode, ffp);
	final int foundMatches = mSearcherWithIndex.findFragmentInMolecule(SSSearcher.cCountModeRigorous, SSSearcher.cDefaultMatchMode);

	// The molecule is only parsed from its idcode once we know that it contains the query.
	if (foundMatches != 0) {
		StereoMolecule parsedMol = new IDCodeParser(true).getCompactMolecule(idcode, coords);
		setMolecule(parsedMol, mSearcherWithIndex.getGraphMatcher(), index);
	}

	return foundMatches;
}
// Internal worker behind the public setMolecule() variants. Expects a searcher that has already
// located the query in mol; selects the preferred match via findPreferredMatch() and takes the
// query-atom-to-molecule-atom mapping of that match from the searcher's match list.
// NOTE(review): this copy of the file is truncated in two places below (text following '<' seems
// to have been lost), so the rest of this method and the head of the following method are missing.
private void setMolecule(StereoMolecule mol, SSSearcher searcher, int index) {
int match = findPreferredMatch(mol, searcher.getMatchList());
int[] queryToMolAtom = searcher.getMatchList().get(match);
// Mark all atoms belonging to core fragment
boolean[] isCoreAtom = new boolean[mol.getAtoms()];
// NOTE(review): loop header and body are truncated here. The surviving part flags connAtom as
// core atom when atomicNo lies in 129...144; further down in this file that same range is
// translated into R-group numbers, so these are presumably R-group atoms - confirm.
for (int i=0; i= 129 && atomicNo <= 144)
isCoreAtom[connAtom] = true;
}
}
}
}
// Atoms of the chosen match that match on query bridge bonds; may be null.
boolean[] isBridgeAtom = searcher.getMatchingBridgeBondAtoms(match);
if (isBridgeAtom != null)
// NOTE(review): truncated again. Everything from here on apparently is the tail of a different,
// boolean-returning method that loops over scaffolds; the lost condition presumably compares a
// scaffold's R-group count with MAX_R_GROUPS - confirm against the complete source.
for (int i=0; i MAX_R_GROUPS) {
// Clear every analyzed molecule whose scaffold carries more than MAX_R_GROUPS R-groups.
for (SARMolecule molecule: mSARMolecule)
if (molecule != null
&& molecule.getScaffold().getRGroupCount() > MAX_R_GROUPS)
molecule.clear();
rGroupCountExceeded = true;
}
scaffold.addRGroupsToCoreStructure();
}
// Finally, let every remaining (non-null) molecule correct its substituent ring closure labels.
for (SARMolecule molecule: mSARMolecule)
if (molecule != null)
molecule.correctSubstituentRingClosureLabels();
// true if the R-group limit was never exceeded
return !rGroupCountExceeded;
}
/**
* Uses a simple strategy to determine the preferred match:
* It prefers matches that carry substituents at low atom indexes.
* If the match list contains exactly one match, then index 0 is returned without any scoring.
* NOTE(review): this copy of the file is truncated inside this method (text following '<' seems
* to have been lost); the scoring loop is incomplete and runs into the tail of another method.
* @param mol molecule in which the matches were found
* @param matchList match list of the substructure searcher (as returned by its getMatchList())
* @return index of the preferred match within matchList
*/
private int findPreferredMatch(StereoMolecule mol, ArrayList matchList) {
if (matchList.size() == 1)
return 0;
int bestMatch = -1;
int bestScore = Integer.MIN_VALUE;
int[] bestQueryAtomRGroupMatch = null;
mol.ensureHelperArrays(Molecule.cHelperNeighbours);
// NOTE(review): loop header and the start of the score calculation are truncated here.
// The surviving part lowers the score by atom * addedValence, i.e. the penalty grows with
// the atom index, which fits the documented preference for substituents at low atom indexes.
for (int i=0; i0)
score -= atom * addedValence;
}
}
// In case of 2-step SAR deconvolutions, where we may have R-groups as substituents,
// we try to choose those matches, which have the same R-groups at
// the same positions.
int[] queryAtomRGroupMatch = getExistingQueryAtomRGroupMatch(match, isUsedAtom, mol);
if (queryAtomRGroupMatch != null) {
int matchingRGroupCount = 0;
// NOTE(review): truncated again. Everything from here on apparently is the tail of a different
// method that returns rGroupToQueryAtom (presumably getExistingQueryAtomRGroupMatch() - confirm).
// It handles atoms with an atomicNo in the range 129...144.
for (int k=0; k= 129 && atomicNo <= 144) {
// The result array is only allocated once the first such atom is seen; unused entries stay -1.
if (rGroupToQueryAtom == null) {
rGroupToQueryAtom = new int[MAX_R_GROUPS];
Arrays.fill(rGroupToQueryAtom, -1);
}
// atomicNo 142...144 gives rGroupNo 0...2, atomicNo 129...141 gives rGroupNo 3...15
int rGroupNo = (atomicNo >= 142) ? atomicNo - 142 : atomicNo - 126; // 0-based
rGroupToQueryAtom[rGroupNo] = i;
}
}
}
}
}
// null if no atom with an atomicNo in 129...144 was encountered
return rGroupToQueryAtom;
}
private void adaptCoreAtomCoordsFromQuery(StereoMolecule query, StereoMolecule core, int[] queryToCoreAtom, boolean hasBridgeAtoms) {
if (!hasBridgeAtoms) {
// just copy query atom coordinates and mark them to be untouched for later coordinate invention
for (int queryAtom = 0; queryAtom