com.actelion.research.chem.sar.CoreBasedSARAnalyzer Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of openchemlib Show documentation

Open Source Chemistry Library

There is a newer version: 2024.11.2

package com.actelion.research.chem.sar;

import com.actelion.research.chem.*;
import com.actelion.research.chem.coords.CoordinateInventor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.TreeMap;

public class CoreBasedSARAnalyzer {
	public static final boolean DISTINGUISH_STEREO_CENTERS = true;
	private static final int MAX_R_GROUPS = 16;

	private StereoMolecule mQuery,mFragment;
	private SSSearcher mSearcher;
	private SSSearcherWithIndex mSearcherWithIndex;
	private SARMolecule[] mSARMolecule;  // contains info of analyzed molecules, e.g. substituents and corresponding SARScaffold
	private TreeMap mScaffoldMap;  // map of core structure idcodes to corresponding SARScaffold
	private SARScaffoldGroup mScaffoldGroup;
	private int[] mPreferredQueryAtomRGroupMatch;

	/**
	 * This class runs a complete structure-activity-relationship (SAR) analysis from molecules that share
	 * one or multiple common similar scaffold(s). For one given query substructure this class analyses many
	 * molecules, whether the query substructure is found and which substituents are connected at which positions.
	 * If the query substructure is found in (matches) multiple molecules, then the matching substructures
	 * may be the same in all cases, or they may differ between some molecules. This happens, for instance,
	 * if the query contains wildcard atoms that match to multiple atom types, or if a bridge bond
	 * matches to atom chains of different lengths.

	 * The matching substructure of a molecule is called core structure. All molecules that share the same
	 * core structure are analyzed regarding which core structure atom carries which substituents within
	 * these molecules. If the substitution varies within these molecules for a core structure atom, then
	 * an R-group is assigned to that position. An atom may get multiple R-groups if attachment positions are
	 * diastereotopic or if it sees multiple substituents in some of its molecules. The core structure with numbered
	 * attached R-groups at all exit vectors with changing substitution is called a scaffold.
	 * Thus, a specific core structure always gives rise to a specific scaffold structure.
	 * Thus, all N scaffolds derived from all N core structures caused by one query structure
	 * form a scaffold group. R-group numbering between all scaffolds within the same scaffold group
	 * is compatible. This means that R-groups at equivalent exit vectors of two different scaffolds within
	 * the same group have the same number. Two exit vectors are considered equivalent, if they are connected
	 * to atoms, which match to the same query structure atom, and if they are diastereotopic, e.g. both connected
	 * with an up-stereo bond when superimposing their atom coordinates.

	 * For any given molecule a scaffold structure is constructed the following way:

	 * - A substructure search locates all matches of the query structure and selects a preferred match based
	 *   on the substitution pattern. If no match is found, then this molecule is skipped from the analysis.

	 * - All matching atoms and bonds are taken as a 'core' structure. If the query contains bridge bonds, then
	 *   those molecule atoms that match on the bridge bonds also belong to the core structure.

	 * - When the core structure is determined for a molecule, then all remaining atoms of the molecule that
	 *   don't belong to the core structure, are part of substituents.

	 * - For every exit vector of the core structure, i.e. a bond that connects to a substituent atom, the
	 *   substituent structure is determined. A substituent atom may connect back to another exit vector of
	 *   the core structure, causing a ring closure.

	 * - After the analysis of all molecules that share the same core structure, a copy of the
	 *   core structure is created. Then, at every exit vector that has at least two different substituents
	 *   throughout all molecules (e.g. -H and -Me), an R-group is attached to the core structure.
	 *   If all molecules have the same substituent at one core structure position, then no R-group is attached.
	 *   Instead, that substituent itself is attached to the core structure.

	 * - The core structure with attached R-groups and attached constant substituents constitutes the scaffold
	 *   structure for a particular molecule.

	 * - All molecules that match to the same query structure don't necessarily share the same scaffold structure,
	 *   e.g. if a query bridge bond matches on a different chain length or if a wildcard atom matches on a
	 *   different atom type, then the constructed scaffold structure differs in these aspects. Also, the
	 *   number of attached R-groups may be different for different scaffolds, if all molecules belonging
	 *   to one scaffold have no substituent at a position that is substituted on a related scaffold's molecules.
	 *   However, it is assured, that R-group numbering is always the same among the entire scaffold group.

	 *   Summary: If molecules match to the same query structure, they belong to the same scaffold group,
	 *   but nonetheless, their assigned scaffold structures may differ concerning atom types, ring sizes, and
	 *   count of attached R-groups. The R-group numbering (R1, R2, ...), however, is compatible, i.e. R-groups
	 *   at equivalent positions have the same number.

	 * @param query substructure with valid atom coordinates that defines one or multiple scaffolds (e.g. via atom lists or bond bridges)
	 */
	public CoreBasedSARAnalyzer(StereoMolecule query, int moleculeCount) {
		mQuery = query;
		mQuery.ensureHelperArrays(Molecule.cHelperNeighbours);

		mSARMolecule = new SARMolecule[moleculeCount];
		mScaffoldMap = new TreeMap<>();
		mScaffoldGroup = new SARScaffoldGroup(query);

		mFragment = new StereoMolecule();   // used as molecule buffer

		mPreferredQueryAtomRGroupMatch = new int[MAX_R_GROUPS];
		Arrays.fill(mPreferredQueryAtomRGroupMatch, -1);
	}

	/**
	 * Adds a molecule to the SAR-analyzer:

	 * - determines core structure from the preferred query match

	 * - if this core structure was not seen yet, creates a new scaffold object for this core structure

	 * - creates new SAR-molecule data object with scaffold and substituent information

	 * Use this version of setMolecule() if you don't have pre-calculated fragment fingerprints
	 * for your molecules available.
	 * @param mol
	 * @param index
	 * @return
	 */
	public int setMolecule(StereoMolecule mol, int index) {
		// The plain substructure searcher is created on first use and then reused for later calls.
		if (mSearcher == null) {
			mSearcher = new SSSearcher();
			mSearcher.setFragment(mQuery);
		}

		mSearcher.setMolecule(mol);
		final int hits = mSearcher.findFragmentInMolecule(SSSearcher.cCountModeRigorous, SSSearcher.cDefaultMatchMode);

		// Only molecules that contain the query are handed on for analysis; a miss is reported as 0.
		if (hits != 0)
			setMolecule(mol, mSearcher, index);

		return hits;
	}

	/**
	 * Adds a molecule to the SAR-analyzer:

	 * - determines core structure from the preferred query match

	 * - if this core structure was not seen yet, creates a new scaffold object for this core structure

	 * - creates new SAR-molecule data object with scaffold and substituent information

	 * Use this version of setMolecule() if you have in memory molecules and pre-calculated fragment fingerprints.
	 * @param mol
	 * @param ffp
	 * @param index
	 * @return
	 */
	public int setMolecule(StereoMolecule mol, long[] ffp, int index) {
		// The index-based searcher is created on first use; the query is set without a fingerprint (null).
		if (mSearcherWithIndex == null) {
			mSearcherWithIndex = new SSSearcherWithIndex();
			mSearcherWithIndex.setFragment(mQuery, (long[])null);
		}

		mSearcherWithIndex.setMolecule(mol, ffp);
		final int hits = mSearcherWithIndex.findFragmentInMolecule(SSSearcher.cCountModeRigorous, SSSearcher.cDefaultMatchMode);

		// On a hit, the analysis runs with the searcher's underlying graph matcher; a miss is reported as 0.
		if (hits != 0)
			setMolecule(mol, mSearcherWithIndex.getGraphMatcher(), index);

		return hits;
	}

	/**
	 * Adds a molecule to the SAR-analyzer:

	 * - determines core structure from the preferred query match

	 * - if this core structure was not seen yet, creates a new scaffold object for this core structure

	 * - creates new SAR-molecule data object with scaffold and substituent information

	 * Use this version of setMolecule() if you have idcodes, coords, and pre-calculated fragment fingerprints
	 * of your molecules.
	 * @param idcode
	 * @param coords
	 * @param ffp
	 * @param index
	 * @return
	 */
	public int setMolecule(byte[] idcode, byte[] coords, long[] ffp, int index) {
		// The index-based searcher is created on first use; the query is set without a fingerprint (null).
		if (mSearcherWithIndex == null) {
			mSearcherWithIndex = new SSSearcherWithIndex();
			mSearcherWithIndex.setFragment(mQuery, (long[])null);
		}

		mSearcherWithIndex.setMolecule(idcode, ffp);
		final int hits = mSearcherWithIndex.findFragmentInMolecule(SSSearcher.cCountModeRigorous, SSSearcher.cDefaultMatchMode);
		if (hits != 0) {
			// The molecule is parsed from idcode and coords only once a match is known to exist.
			StereoMolecule parsedMol = new IDCodeParser(true).getCompactMolecule(idcode, coords);
			setMolecule(parsedMol, mSearcherWithIndex.getGraphMatcher(), index);
		}

		return hits;
	}

	private void setMolecule(StereoMolecule mol, SSSearcher searcher, int index) {
		int match = findPreferredMatch(mol, searcher.getMatchList());

		int[] queryToMolAtom = searcher.getMatchList().get(match);

		// Mark all atoms belonging to core fragment
		boolean[] isCoreAtom = new boolean[mol.getAtoms()];
		for (int i=0; i= 129 && atomicNo <= 144)
							isCoreAtom[connAtom] = true;
					}
				}
			}
		}

		boolean[] isBridgeAtom = searcher.getMatchingBridgeBondAtoms(match);
		if (isBridgeAtom != null)
			for (int i=0; i MAX_R_GROUPS) {
				for (SARMolecule molecule: mSARMolecule)
					if (molecule != null
					 && molecule.getScaffold().getRGroupCount() > MAX_R_GROUPS)
						molecule.clear();

				rGroupCountExceeded = true;
				}

			scaffold.addRGroupsToCoreStructure();
			}

		for (SARMolecule molecule: mSARMolecule)
			if (molecule != null)
				molecule.correctSubstituentRingClosureLabels();

		return !rGroupCountExceeded;
		}

	/**
	 * Uses a simple strategy to determine the preferred match:
	 * It prefers matches that carry substituents at low atom indexes.
	 * @param mol
	 * @param matchList
	 * @return
	 */
	private int findPreferredMatch(StereoMolecule mol, ArrayList matchList) {
		if (matchList.size() == 1)
			return 0;

		int bestMatch = -1;
		int bestScore = Integer.MIN_VALUE;
		int[] bestQueryAtomRGroupMatch = null;

		mol.ensureHelperArrays(Molecule.cHelperNeighbours);

		for (int i=0; i0)
						score -= atom * addedValence;
				}
			}

			// In case of 2-step SAR deconvolutions, where we may have R-groups as substituents,
			// we try to choose those matches, which have the same R-groups at
			// the same positions.
			int[] queryAtomRGroupMatch = getExistingQueryAtomRGroupMatch(match, isUsedAtom, mol);
			if (queryAtomRGroupMatch != null) {
				int matchingRGroupCount = 0;
				for (int k=0; k= 129 && atomicNo <= 144) {
							if (rGroupToQueryAtom == null) {
								rGroupToQueryAtom = new int[MAX_R_GROUPS];
								Arrays.fill(rGroupToQueryAtom, -1);
							}
							int rGroupNo = (atomicNo >= 142) ? atomicNo - 142 : atomicNo - 126; // 0-based
							rGroupToQueryAtom[rGroupNo] = i;
						}
					}
				}
			}
		}
		return rGroupToQueryAtom;
	}

	private void adaptCoreAtomCoordsFromQuery(StereoMolecule query, StereoMolecule core, int[] queryToCoreAtom, boolean hasBridgeAtoms) {
		if (!hasBridgeAtoms) {
			// just copy query atom coordinates and mark them to be untouched for later coordinate invention
			for (int queryAtom = 0; queryAtom

    

    

    
            
    
            

    
        
            
                Related Artifacts
                
                     mysql-connector-java mysql
 facebook-messenger com.github.codedrinker
 selenium-java org.seleniumhq.selenium
 instagram-java com.github.sola92
 gson com.google.code.gson
 poi org.apache.poi
 httpclient org.apache.httpcomponents
 json org.json
 facebook-java-api com.google.code.facebook-java-api
 poi-ooxml org.apache.poi
 jackson-databind com.fasterxml.jackson.core
 junit junit
 primefaces org.primefaces
 ojdbc7 com.github.noraui
 jfoenix com.jfoenix
 testng org.testng
 json-simple com.googlecode.json-simple
 selenium-server org.seleniumhq.selenium
 itextpdf com.itextpdf
 spring-core org.springframework
                
            
        
        
            
                Related Groups
                
                     org.springframework
 org.apache.poi
 org.hibernate
 org.springframework.boot
 com.fasterxml.jackson.core
 com.itextpdf
 org.seleniumhq.selenium
 mysql
 org.finos.legend.engine
 org.apache.httpcomponents
 org.apache.logging.log4j
 org.openjfx
 org.apache.commons
 org.json
 com.google.guava
 com.google.zxing
 net.sf.jasperreports
 javax.xml.bind
 ojdbc
 com.google.code.facebook-java-api