All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.actelion.research.chem.sar.CoreBasedSARAnalyzer Maven / Gradle / Ivy

There is a newer version: 2024.12.1
Show newest version
package com.actelion.research.chem.sar;

import com.actelion.research.chem.*;
import com.actelion.research.chem.coords.CoordinateInventor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.TreeMap;

public class CoreBasedSARAnalyzer {
	public static final boolean DISTINGUISH_STEREO_CENTERS = true;
	private static final int MAX_R_GROUPS = 16;

	private StereoMolecule mQuery,mFragment;
	private SSSearcher mSearcher;
	private SSSearcherWithIndex mSearcherWithIndex;
	private SARMoleculeData[] mMoleculeData;  // contains info of analyzed molecules, e.g. substituents and corresponding ScaffoldData
	private TreeMap mScaffoldMap;  // map of idccodes of core structures to corresponding ScaffoldData
	private ScaffoldGroup mScaffoldGroup;
	private int[] mPreferredQueryAtomRGroupMatch;

	/**
	 * This class runs a complete structure-activity-relationship (SAR) analysis from molecules that share
	 * one or multiple common similar scaffold(s). For one given query substructure this class analyses many
	 * molecules, whether the query substructure is found and which substituents are connected at which positions.
	 * If the query structure matches on a substructure of multiple molecules, then these molecule substructures
	 * may be the same in all cases, or it may differ between some molecules. This happens, for instance,
	 * if the query contains wildcard atoms that match to multiple atom types, or if a bridge bond
	 * matches to atom chains of different lengths.
	 * The matching substructure of a molecule is called its core structure. All molecules that share the same
	 * core structure are analyzed regarding which core structure atom carries which substituents within
	 * these molecules. If the substitution varies within these molecules for a core structure atom, then
	 * an R-group is assigned to that position. An atom may get multiple R-groups if attachment positions are
	 * diastereotopic or if it sees multiple substituents in some of its molecules. The core structure with numbered
	 * attached R-groups at all exit vectors with changing substitution is called a scaffold.
	 * Thus, a specific core structure always gives rise to a specific scaffold structure.
	 * Thus, all N scaffolds derived from all N core structures caused by one query structure
	 * form a scaffold group. R-group numbering between all scaffolds within the same scaffold group
	 * is compatible. This means that R-groups at equivalent exit vectors of two different scaffolds within
	 * the same group have the same number. Two exit vectors are considered equivalent if they are connected
	 * to atoms that match the same query structure atom, and if they are diastereotopic, e.g. both connected
	 * with an up-stereo bond when superimposing their atom coordinates.
* For any given molecule a scaffold structure is constructed the following way:
* - A substructure search locates all matches of the query structure and selects a preferred match based * on the substitution pattern. If no match is found, then this molecule is skipped from the analysis.
* - All matching atoms and bonds are taken as a 'core' structure. If the query contains bridge bonds, then * those molecule atoms that match on the bridge bonds also belong to the core structure.
* - When the core structure is determined for a molecule, then all remaining atoms of the molecule that * don't belong to the core structure, are part of substituents.
* - For every exit vector of the core structure, i.e. a bond that connects to a substituent atom, the * substituent structure is determined. A substituent atom may connect back to another exit vector of * the core structure, causing a ring closure.
	 * - After the analysis of all molecules that share the same core structure, a copy of the
	 *   core structure is created. Then, at every exit vector that has at least two different substituents
	 *   throughout all molecules (e.g. -H and -Me), an R-group is attached to the core structure.
	 *   If all molecules have the same substituent at one core structure position, then no R-group is attached.
	 *   Instead, that substituent itself is attached to the core structure.
* - The core structure with attached R-groups and attached constant substituents constitutes the scaffold * structure for a particular molecule.
* - All molecules that match to the same query structure don't necessarily share the same scaffold structure, * e.g. if a query bridge bond matches on a different chain length or if a wildcard atom matches on a * different atom type, then the constructed scaffold structure differs in these aspects. Also, the * number of attached R-groups may be different for different scaffolds, if all molecules belonging * to one scaffold have no substituent at a position that is substituted on a related scaffold's molecules. * However, it is assured, that R-group numbering is always the same among the entire scaffold group.
* Summary: If molecules match to the same query structure, they belong to the same scaffold group, * but nonetheless, their assigned scaffold structures may differ concerning atom types, ring sizes, and * count of attached R-groups. The R-group numbering (R1, R2, ...), however, is compatible, i.e. R-groups * at equivalent positions have the same number.
* @param query substructure with valid atom coordinates that defines one or multiple scaffolds (e.g. via atom lists or bond bridges) */ public CoreBasedSARAnalyzer(StereoMolecule query, int moleculeCount) { mQuery = query; mQuery.ensureHelperArrays(Molecule.cHelperNeighbours); mMoleculeData = new SARMoleculeData[moleculeCount]; mScaffoldMap = new TreeMap<>(); mScaffoldGroup = new ScaffoldGroup(query); mFragment = new StereoMolecule(); // used as molecule buffer mPreferredQueryAtomRGroupMatch = new int[MAX_R_GROUPS]; Arrays.fill(mPreferredQueryAtomRGroupMatch, -1); } /** * Adds a molecule to the SAR-analyzer:
* - determines core structure from the preferred query match
* - if this core structure was not seen yet, creates a new scaffold object for this core structure
* - creates new SAR-molecule data object with scaffold and substituent information
* Use this version of setMolecule() if you don't have pre-calculated fragment fingerprints
	 * for your molecules available.
	 * @param mol molecule to be searched for the query substructure
	 * @param index handed on unchanged to the internal setMolecule() when the query matches
	 *              (presumably the molecule's slot in the per-molecule data array; confirm against callers)
	 * @return number of query matches found in mol; 0 if the query was not found,
	 *         in which case the molecule is not analyzed any further
	 */
	public int setMolecule(StereoMolecule mol, int index) {
		// The plain substructure searcher is created lazily on first use.
		if (mSearcher == null) {
			mSearcher = new SSSearcher();
			mSearcher.setFragment(mQuery);
		}

		mSearcher.setMolecule(mol);
		int hitCount = mSearcher.findFragmentInMolecule(SSSearcher.cCountModeRigorous, SSSearcher.cDefaultMatchMode);

		// Only molecules that contain the query are passed on to the core/substituent analysis.
		if (hitCount != 0)
			setMolecule(mol, mSearcher, index);

		return hitCount;
	}

	/**
	 * Adds a molecule to the SAR-analyzer:
* - determines core structure from the preferred query match
* - if this core structure was not seen yet, creates a new scaffold object for this core structure
* - creates new SAR-molecule data object with scaffold and substituent information
* Use this version of addMolecule() if you have in memory molecules and pre-calculated fragment fingerprints. * @param mol * @param ffp * @param index * @return */ public int setMolecule(StereoMolecule mol, long[] ffp, int index) { if (mSearcherWithIndex == null) { mSearcherWithIndex = new SSSearcherWithIndex(); mSearcherWithIndex.setFragment(mQuery, (long[])null); } mSearcherWithIndex.setMolecule(mol, ffp); int matchCount = mSearcherWithIndex.findFragmentInMolecule(SSSearcher.cCountModeRigorous, SSSearcher.cDefaultMatchMode); if (matchCount == 0) return 0; setMolecule(mol, mSearcherWithIndex.getGraphMatcher(), index); return matchCount; } /** * Adds a molecule to the SAR-analyzer:
* - determines core structure from the preferred query match
* - if this core structure was not seen yet, creates a new scaffold object for this core structure
* - creates new SAR-molecule data object with scaffold and substituent information
* Use this version of addMolecule() if you have idcodes, coords, and pre-calculated fragment fingerprints * of your molecules. * @param idcode * @param coords * @param ffp * @param index * @return */ public int setMolecule(byte[] idcode, byte[] coords, long[] ffp, int index) { if (mSearcherWithIndex == null) { mSearcherWithIndex = new SSSearcherWithIndex(); mSearcherWithIndex.setFragment(mQuery, (long[])null); } mSearcherWithIndex.setMolecule(idcode, ffp); int matchCount = mSearcherWithIndex.findFragmentInMolecule(SSSearcher.cCountModeRigorous, SSSearcher.cDefaultMatchMode); if (matchCount == 0) return 0; setMolecule(new IDCodeParser(true).getCompactMolecule(idcode, coords), mSearcherWithIndex.getGraphMatcher(), index); return matchCount; } private void setMolecule(StereoMolecule mol, SSSearcher searcher, int index) { int match = findPreferredMatch(mol, searcher.getMatchList()); int[] queryToMolAtom = searcher.getMatchList().get(match); // Mark all atoms belonging to core fragment boolean[] isCoreAtom = new boolean[mol.getAtoms()]; for (int i=0; i= 129 && atomicNo <= 144) isCoreAtom[connAtom] = true; } } } } boolean[] isBridgeAtom = searcher.getMatchingBridgeBondAtoms(match); if (isBridgeAtom != null) for (int i=0; i MAX_R_GROUPS) { for (SARMoleculeData moleculeData:mMoleculeData) if (moleculeData != null && moleculeData.getScaffoldData().getRGroupCount() > MAX_R_GROUPS) moleculeData.clear(); rGroupCountExceeded = true; } scaffoldData.addRGroupsToCoreStructure(); } for (SARMoleculeData moleculeData:mMoleculeData) if (moleculeData != null) moleculeData.correctSubstituentRingClosureLabels(); return !rGroupCountExceeded; } /** * Uses a simple strategy to determine the preferred match: * It preferrers matches that carry substituents at low atom indexes. 
* @param mol * @param matchList * @return */ private int findPreferredMatch(StereoMolecule mol, ArrayList matchList) { if (matchList.size() == 1) return 0; int bestMatch = -1; int bestScore = Integer.MIN_VALUE; int[] bestQueryAtomRGroupMatch = null; mol.ensureHelperArrays(Molecule.cHelperNeighbours); for (int i=0; i0) score -= atom * addedValence; } } // In case of 2-step SAR deconvolutions, where we may have R-groups as substituents, // we try to choose those matches, which have the same R-groups at // the same positions. int[] queryAtomRGroupMatch = getExistingQueryAtomRGroupMatch(match, isUsedAtom, mol); if (queryAtomRGroupMatch != null) { int matchingRGroupCount = 0; for (int k=0; k= 129 && atomicNo <= 144) { if (rGroupToQueryAtom == null) { rGroupToQueryAtom = new int[MAX_R_GROUPS]; Arrays.fill(rGroupToQueryAtom, -1); } int rGroupNo = (atomicNo >= 142) ? atomicNo - 142 : atomicNo - 126; // 0-based rGroupToQueryAtom[rGroupNo] = i; } } } } } return rGroupToQueryAtom; } private void adaptCoreAtomCoordsFromQuery(StereoMolecule query, StereoMolecule core, int[] queryToCoreAtom, boolean hasBridgeAtoms) { if (!hasBridgeAtoms) { // just copy query atom coordinates and mark them to be untouched for later coordinate invention for (int queryAtom = 0; queryAtom




© 2015 - 2025 Weber Informatics LLC | Privacy Policy