// com.actelion.research.chem.mmp.MMPFragmenter
// Part of openchemlib, the Open Source Chemistry Library.
/*
* Copyright (c) 2017
* Actelion Pharmaceuticals Ltd.
* Gewerbestrasse 16
* CH-4123 Allschwil, Switzerland
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of the copyright holder nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @author Gregori Gerebtzoff
*/
package com.actelion.research.chem.mmp;
import com.actelion.research.chem.Canonizer;
import com.actelion.research.chem.IDCodeParser;
import com.actelion.research.chem.StereoMolecule;
import java.util.*;
import java.util.Map.Entry;
public class MMPFragmenter {
// Number of cuts per fragmentation step.
private static final int SINGLE_CUT = 1;
private static final int DOUBLE_CUT = 2;
private static final int NUMBER_OF_CUTS = DOUBLE_CUT; // desired number of cuts; currently supported: SINGLE_CUT, DOUBLE_CUT

// Size filters applied to the generated keys/values.
public static final Integer KEYS_MIN_ATOMS = 4; // set to null to bypass
private static final Integer VALUE_MAX_ATOMS = null; // set to null to bypass (default:15)

// R1-H molecule IDCode (for Hydrogen replacements).
// Declared before FRAGMENT_DELIMITER / FRAGMENT_ATOMIC_NO on purpose-compatible grounds: both are
// compile-time constants, so createR1HMoleculeID() sees their final values during static initialization.
private static final String r1H = createR1HMoleculeID();
public static final String FRAGMENT_DELIMITER = "#"; // Symbol used for fragment delimiter
public static final int FRAGMENT_ATOMIC_NO = 142; // Atomic number used for fragment delimiter

private StereoMolecule mol; // molecule might get modified upon initialization (Hydrogens are removed)

// The element types below were restored from how the rest of the class uses these collections
// (raw types would not compile with the for-each loops and unboxing further down).
private List<String[]> moleculeFragmentsID; // {R-group, "clean" (without R-groups)} fragmentsID for Hydrogen replacements
private HashMap<String, StereoMolecule> moleculeFragments; // unique list of R-group-containing fragments for Hydrogen replacements (IDCode, StereoMol)
private List<MoleculeIndexID> moleculeIndexesID; // container of keysID-valueID for single cuts and double cuts
private List<MoleculeIndexIDByte> moleculeIndexesIDByte; // container of keysIDBytes-valueIDBytes for single cuts and double cuts

private Integer nRotBonds = null; // null is used by the getters as "molecule not fragmented yet"
private ArrayList<Integer> rotBondsIndex; // bond indexes of the rotatable bonds; filled by getRotBondsIndex()
/**
 * Builds the two-atom placeholder molecule R-[H] and returns its idCode.
 * The first atom uses {@link #FRAGMENT_ATOMIC_NO} and is labelled with
 * {@link #FRAGMENT_DELIMITER}; the second is a hydrogen labelled "[H]".
 * Both atoms are joined by a single bond and the custom labels are encoded
 * into the returned idCode.
 * @return idCode String of the R-[H] molecule
 */
static public String createR1HMoleculeID() {
    StereoMolecule placeholder = new StereoMolecule();

    // attachment point
    int attachmentAtom = placeholder.addAtom(FRAGMENT_ATOMIC_NO);
    placeholder.setAtomCustomLabel(attachmentAtom, FRAGMENT_DELIMITER);

    // explicit hydrogen
    int hydrogenAtom = placeholder.addAtom(1);
    placeholder.setAtomCustomLabel(hydrogenAtom, "[H]");

    placeholder.addBond(attachmentAtom, hydrogenAtom, StereoMolecule.cBondTypeSingle);

    // same encoding as getIDCodeWithCustomLabels(), written out in place
    Canonizer labelAwareCanonizer = new Canonizer(placeholder, Canonizer.ENCODE_ATOM_CUSTOM_LABELS);
    return labelAwareCanonizer.getIDCode();
}
/**
 * Returns the idCode of a molecule, canonized with the
 * {@code Canonizer.ENCODE_ATOM_CUSTOM_LABELS} mode so that custom atom labels
 * take part in the encoding.
 * @param mol molecule to encode
 * @return idCode String
 */
static private String getIDCodeWithCustomLabels(StereoMolecule mol) {
    return new Canonizer(mol, Canonizer.ENCODE_ATOM_CUSTOM_LABELS).getIDCode();
}
/**
* One keys-value combination produced by cutting a molecule: the idCode(s) of the
* 'key' (constant part), the idCode of the 'value' (variable part), their atom counts
* and the bond/atom indexes at which the cut was made.
* The getters hand out the internal arrays and list directly, without copying.
*/
static public class MoleculeIndexID {
// idCodes of the key(s): one entry for a single cut, two for a double cut
private String[] keysID;
// indexes of the key(s); can be replaced later through setKeysIndex()
private int[] keysIndex;
// idCode of the value
private String valueID;
// index of the value; can be replaced later through setValueIndex()
private int valueIndex;
// atom count per key
private int[] keysIDAtoms;
// atom count of the value
private int valueIDAtoms;
// NOTE(review): no use of this parser is visible in this copy of the class; it is presumably
// used in the truncated parts of the constructors -- confirm against the upstream source
private IDCodeParser idCodeParser = new IDCodeParser();
private int[] bondIndexes; // bond indexes for bonds between two heavy atoms
private int[] valueAtomIndexes; // atom indexes of the key for heavy atom - hydrogen bonds
private List coordinates; // coordinates of the middle of the bond between key and value
private Integer[] chemicalSpaceSizes;
/**
* Creates an empty instance; every field keeps its default value.
* Note that 'coordinates' has no field initializer, so it stays null here.
*/
public MoleculeIndexID(){ }
/**
* Adds one keys-value combination
* @param keysID Array of one (single cut) or two (double cut) idCodes of the 'key' (constant part of the molecule)
* @param keysIndex Array of one (single cut) or two (double cut) indexes of the 'key' (from mmpUniqueFragments)
* @param valueID idCode of the 'value' (variable part of the molecule)
* @param valueIndex Index of the 'value'
* @param keysIDAtoms Number of heavy atoms of the 'key(s)'; when null, an array of keysID.length is allocated instead
* @param valueIDAtoms Number of heavy atoms of the 'value'
* @param bondIndexes Array of one (single cut) or two (double cut) bond indexes where the cuts occur
* @param valueAtomIndexes Atom indexes of the 'key' for heavy atom - hydrogen bonds
*/
public MoleculeIndexID(String[] keysID, int[] keysIndex, String valueID, int valueIndex, int[] keysIDAtoms, int valueIDAtoms, int[] bondIndexes, int[] valueAtomIndexes){
this.keysID = keysID;
this.keysIndex = keysIndex;
this.valueID = valueID;
this.valueIndex = valueIndex;
this.keysIDAtoms = keysIDAtoms;
if (keysIDAtoms == null) {
this.keysIDAtoms = new int[keysID.length];
// NOTE(review): the next line is incomplete in this copy of the file -- the text between the '<'
// of the loop condition and a later '>' is missing. The loop body and the assignments of
// valueIDAtoms, bondIndexes and valueAtomIndexes are therefore not visible here; restore them
// from the upstream source before compiling.
for (int i=0; i();
}
/**
* Adds one keys-value combination
* @param keysID Array of one (single cut) or two (double cut) idCodes of the 'key' (constant part of the molecule)
* @param valueID idCode of the 'value' (variable part of the molecule)
* @param keysIDAtoms Number of heavy atoms of the 'key(s)'; when null, an array of keysID.length is allocated instead
* @param valueIDAtoms Number of heavy atoms of the 'value'
* @param bondIndexes Array of one (single cut) or two (double cut) bond indexes where the cuts occur
* @param valueAtomIndexes Atom indexes of the 'key' for heavy atom - hydrogen bonds
*/
public MoleculeIndexID(String[] keysID, String valueID, int[] keysIDAtoms, int valueIDAtoms, int[] bondIndexes, int[] valueAtomIndexes){
this.keysID = keysID;
this.valueID = valueID;
this.keysIDAtoms = keysIDAtoms; // number of atoms minus one (because of R1)
if (keysIDAtoms == null) {
this.keysIDAtoms = new int[keysID.length];
// NOTE(review): same truncation as in the constructor above -- the rest of this loop and the
// remaining assignments are missing from this copy of the file.
for (int i=0; i();
}
/** @return idCodes of the key(s) (internal array, not a copy) */
public String[] getKeysID() {
return keysID;
}
/** @return idCode of the value */
public String getValueID() {
return valueID;
}
/** @return atom count per key (internal array, not a copy) */
public int[] getKeysIDAtoms() {
return keysIDAtoms;
}
/** @return atom count of the value */
public int getValueIDAtoms() {
return valueIDAtoms;
}
/** @return indexes of the key(s) (internal array, not a copy) */
public int[] getKeysIndex() {
return keysIndex;
}
/** @param keysIndex new indexes of the key(s); the array is stored as is */
public void setKeysIndex(int[] keysIndex) {
this.keysIndex = keysIndex;
}
/** @return index of the value */
public int getValueIndex() {
return valueIndex;
}
/** @param valueIndex new index of the value */
public void setValueIndex(int valueIndex) {
this.valueIndex = valueIndex;
}
/** @return bond indexes where the cuts occur (internal array, not a copy) */
public int[] getBondIndexes() {
return bondIndexes;
}
/** @return atom indexes of the 'key' for heavy atom - hydrogen bonds (internal array, not a copy) */
public int[] getValueAtomIndexes() {
return valueAtomIndexes;
}
/**
* Appends one {x, y} pair to the list of coordinates; despite the name, earlier
* entries are kept.
* NOTE(review): 'coordinates' is not assigned by the no-argument constructor, so calling
* this on an instance created that way throws a NullPointerException.
* @param x x coordinate
* @param y y coordinate
*/
public void setCoordinates(double x, double y) {
this.coordinates.add(new Double[]{x, y});
}
/** @return list of the {x, y} pairs added through setCoordinates() (internal list, not a copy) */
public List getCoordinates() {
return coordinates;
}
/** @param chemicalSpaceSizes array stored as is */
public void setChemicalSpaceSize(Integer[] chemicalSpaceSizes) {
this.chemicalSpaceSizes = chemicalSpaceSizes;
}
/** @return array last passed to setChemicalSpaceSize(), or null if it was never called */
public Integer[] getChemicalSpaceSizes() {
return chemicalSpaceSizes;
}
}
/**
 * Byte-array counterpart of a keys-value combination: the bytes of the key idCode(s)
 * and the bytes of the value idCode. The arrays are stored and returned as is,
 * without copying.
 */
public class MoleculeIndexIDByte {
    private byte[] valueIDByte;
    private byte[][] keysIDByte;

    /** Creates an instance whose keys and value are both null. */
    public MoleculeIndexIDByte() {
        this(null, null);
    }

    /**
     * @param keysIDByte one byte array per key idCode
     * @param valueIDByte bytes of the value idCode
     */
    public MoleculeIndexIDByte(byte[][] keysIDByte, byte[] valueIDByte) {
        this.valueIDByte = valueIDByte;
        this.keysIDByte = keysIDByte;
    }

    /** @return bytes of the value idCode */
    public byte[] getValueIDByte() {
        return this.valueIDByte;
    }

    /** @return one byte array per key idCode */
    public byte[][] getKeysIDByte() {
        return this.keysIDByte;
    }
}
/**
 * Creates a fragmenter for one molecule. No fragmentation happens here; it is
 * triggered by the getters or by fragmentMolecule().
 * The molecule passed in is modified in place by removeHydrogens() and kept by
 * reference, so the caller's object is affected.
 * @param mol molecule to fragment
 */
public MMPFragmenter(StereoMolecule mol) {
    this.mol = removeHydrogens(mol);
    // explicit element types instead of raw collections
    this.moleculeFragmentsID = new ArrayList<String[]>();
    this.moleculeFragments = new HashMap<String, StereoMolecule>();
    this.moleculeIndexesID = new ArrayList<MoleculeIndexID>();
    this.moleculeIndexesIDByte = new ArrayList<MoleculeIndexIDByte>();
}
/**
 * Same as {@code getMoleculeIndexesID(true)}: whole molecule variations are
 * requested if this call is the one that triggers the fragmentation.
 * @return list of MoleculeIndexID (keys and values of the cuts)
 */
public List<MoleculeIndexID> getMoleculeIndexesID() {
    return getMoleculeIndexesID(true);
}
/**
 * Returns the list of MoleculeIndexID containing
 * IDCodes of keys and values of single and double cuts.
 * The molecule is fragmented first when nRotBonds is still null; otherwise the
 * existing list is returned and the flag has no effect. The internal list is
 * returned directly, not a copy.
 * @param generateWholeMoleculeVariations true/false to generate whole molecule variations, used for identification of H-replacements
 * @return list of MoleculeIndexID
 */
public List<MoleculeIndexID> getMoleculeIndexesID(boolean generateWholeMoleculeVariations) {
    if (nRotBonds == null) {
        // NOTE(review): presumably fragmentMolecule() assigns nRotBonds; that part is not visible here
        fragmentMolecule(generateWholeMoleculeVariations);
    }
    return moleculeIndexesID;
}
/**
 * Returns the list of {R-group, "clean" (without R-groups)}
 * fragmentsID used for Hydrogen replacements; each entry is a two-element
 * String array. When nRotBonds is still null the molecule is fragmented first,
 * without whole molecule variations. The internal list is returned directly,
 * not a copy.
 * @return list of {fragment idCode, idCode of the fragment without R-group}
 */
public List<String[]> getMoleculeFragmentsID() {
    if (nRotBonds == null) {
        fragmentMolecule(false);
    }
    return moleculeFragmentsID;
}
/**
 * Same as {@code getMoleculeIndexesIDByte(true)}: whole molecule variations are
 * requested if this call is the one that triggers the fragmentation.
 * @return list of MoleculeIndexIDByte (bytes of the keys and values of the cuts)
 */
public List<MoleculeIndexIDByte> getMoleculeIndexesIDByte() {
    return getMoleculeIndexesIDByte(true);
}
/**
 * Returns the list of MoleculeIndexIDByte containing
 * bytes of IDCodes of keys and values of single and double cuts.
 * The molecule is fragmented first when nRotBonds is still null. The byte list is
 * filled on the first call that finds it empty (and only if nRotBonds is positive)
 * and is then returned directly on later calls.
 * @param generateWholeMoleculeVariations true/false to generate whole molecule variations, used for identification of H-replacements
 * @return list of MoleculeIndexIDByte, one per entry of moleculeIndexesID
 */
public List<MoleculeIndexIDByte> getMoleculeIndexesIDByte(boolean generateWholeMoleculeVariations) {
    if (nRotBonds == null) {
        fragmentMolecule(generateWholeMoleculeVariations);
    }
    if (nRotBonds > 0 && moleculeIndexesIDByte.size() == 0) {
        for (MoleculeIndexID moleculeIndexID : moleculeIndexesID) {
            String[] keys = moleculeIndexID.keysID;
            // Convert exactly the keys that exist. The previous code read keys[1] for a
            // one-element array and keys[2] for a two-element array, which always threw
            // ArrayIndexOutOfBoundsException.
            byte[][] keysByte = new byte[keys.length][];
            for (int i = 0; i < keys.length; i++) {
                // platform default charset, as before
                // NOTE(review): idCodes are presumably plain ASCII -- confirm before relying on this across platforms
                keysByte[i] = keys[i].getBytes();
            }
            byte[] valueByte = moleculeIndexID.valueID.getBytes();
            moleculeIndexesIDByte.add(new MoleculeIndexIDByte(keysByte, valueByte));
        }
    }
    return moleculeIndexesIDByte;
}
/**
* Returns the indexes of the rotatable bonds. The list is created on the first
* call, when rotBondsIndex is still null.
* @return list of bond indexes
*/
public ArrayList getRotBondsIndex() {
if (rotBondsIndex == null) {
rotBondsIndex = new ArrayList();
// NOTE(review): the next line is incomplete in this copy of the file -- the text between the '<'
// of the loop condition and a later '>' is missing. What is lost covers the rest of this
// method (the loop over the bonds and its return) and the beginning of another method; the
// "count++ ... return count" lines below look like the tail of the countRGroups() helper that
// processFragments() calls. Restore both from the upstream source before compiling.
for (int bond=0; bond= 142) {
count++;
}
}
return count;
}
private StereoMolecule addRGroups(StereoMolecule mol) {
return addRGroups(mol, false);
}
/**
* Tags R-Groups with correct label (#1, #2)
* @param mol
* @param inverse true/false if the middle fragment is inverted
* @return Modified StereoMolecule
*/
private StereoMolecule addRGroups(StereoMolecule mol, boolean inverse) {
int rGroup = 1; // atomic number of R1
mol.ensureHelperArrays(StereoMolecule.cHelperNeighbours);
if (inverse == false) {
for (int atom=0; atom= 142) {
mol.setAtomicNo(atom, FRAGMENT_ATOMIC_NO + rGroup - 1);
mol.setAtomCustomLabel(atom, FRAGMENT_DELIMITER + Integer.toString(rGroup));
rGroup++;
}
}
}
else {
for (int atom=mol.getAtoms()-1; atom>=0; atom--) {
if (mol.getAtomicNo(atom) == 0 || mol.getAtomicNo(atom) >= 142) {
mol.setAtomicNo(atom, FRAGMENT_ATOMIC_NO + rGroup - 1);
mol.setAtomCustomLabel(atom, FRAGMENT_DELIMITER + Integer.toString(rGroup));
rGroup++;
}
}
}
return mol;
}
/**
* Creates a new MoleculeIndexID from fragments
* @param fragments Array of two (single cut) or three (double cut) fragments
* @param bondIndexes Array of one or two bond indexes
* @param valueAtomIndexes Array of atom indexes for Hydrogen replacements
* @param cutType SINGLE_CUT or DOUBLE_CUT
* @return a MoleculeIndexID
*/
private MoleculeIndexID processFragments(StereoMolecule[] fragments, int[] bondIndexes, int[] valueAtomIndexes, int cutType) {
MoleculeIndexID retVal = null;
if (cutType == SINGLE_CUT && fragments.length > 1) {
String idCode1 = getIDCodeWithCustomLabels(fragments[0]);
String idCode2 = getIDCodeWithCustomLabels(fragments[1]);
retVal = new MoleculeIndexID(new String[]{idCode1}, idCode2, new int[]{fragments[0].getAtoms()-1}, fragments[1].getAtoms()-1, bondIndexes, valueAtomIndexes);
if (KEYS_MIN_ATOMS == null || fragments[0].getAtoms() >= KEYS_MIN_ATOMS) {
moleculeFragments.put(idCode1, fragments[0]);
}
if (KEYS_MIN_ATOMS == null || fragments[1].getAtoms() >= KEYS_MIN_ATOMS) {
moleculeFragments.put(idCode2, fragments[1]);
}
}
else if (cutType == DOUBLE_CUT && fragments.length > 2) {
for (StereoMolecule mol:fragments) {
mol.ensureHelperArrays(StereoMolecule.cHelperNeighbours);
}
if (countRGroups(fragments[1]) == 2) {
fragments = new StereoMolecule[]{fragments[1], fragments[0], fragments[2]};
}
else if (countRGroups(fragments[2]) == 2) {
fragments = new StereoMolecule[]{fragments[2], fragments[0], fragments[1]};
}
Canonizer canonizer = new Canonizer(fragments[0]);
int[] graphIndexes = canonizer.getGraphIndexes();
int[] rGroupIndexes = new int[2];
int rGroupCounter = 0;
for (int atom=0; atom= 142 || fragments[0].getAtomicNo(atom) == 0) {
rGroupIndexes[rGroupCounter] = atom;
rGroupCounter++;
}
}
if (graphIndexes[rGroupIndexes[0]] < graphIndexes[rGroupIndexes[1]]) {
retVal = new MoleculeIndexID(new String[]{getIDCodeWithCustomLabels(fragments[1]), getIDCodeWithCustomLabels(fragments[2])}, getIDCodeWithCustomLabels(addRGroups(fragments[0])), new int[]{fragments[1].getAtoms()-1, fragments[2].getAtoms()-1}, fragments[0].getAtoms()-1, bondIndexes, valueAtomIndexes);
}
else {
retVal = new MoleculeIndexID( new String[]{getIDCodeWithCustomLabels(fragments[2]), getIDCodeWithCustomLabels(fragments[1])}, getIDCodeWithCustomLabels(addRGroups(fragments[0], true)), new int[]{fragments[2].getAtoms()-1, fragments[1].getAtoms()-1}, fragments[0].getAtoms()-1, new int[]{bondIndexes[1], bondIndexes[0]}, new int[]{valueAtomIndexes[1], valueAtomIndexes[0]});
}
}
return retVal;
}
/**
 * Removes hydrogens from a StereoMolecule (in case the source is a SDF file with explicit hydrogens).
 * The molecule is changed in place: its "all atoms" and "all bonds" counts are set to the
 * values reported by getAtoms() and getBonds().
 * @param mol molecule to strip; modified in place
 * @return the same, modified StereoMolecule
 */
private StereoMolecule removeHydrogens(StereoMolecule mol) {
    int requiredHelpers = StereoMolecule.cHelperNeighbours | StereoMolecule.cHelperParities;
    mol.ensureHelperArrays(requiredHelpers);

    // this way the hydrogens will just be ignored and overwritten by new atoms later
    int atomCount = mol.getAtoms();
    mol.setAllAtoms(atomCount);

    int bondCount = mol.getBonds();
    mol.setAllBonds(bondCount);

    return mol;
}
/**
 * Fragments the molecule and also generates the whole molecule variations;
 * shorthand for {@code fragmentMolecule(true)}.
 */
public void fragmentMolecule() {
    final boolean withWholeMoleculeVariations = true;
    fragmentMolecule(withWholeMoleculeVariations);
}
/**
* Fragments one StereoMolecule into keys-value pairs of StereoMolecules;
* - for single cuts, keys is an array containing one fragment
* ('constant' part of the molecule), value the 'variable' part;
* - for double cuts, keys is an array containing two fragments
* (constant parts of the molecule, i.e. 'left' and 'right' part), value the 'middle' part.
* An array containing all 'single cut' fragments is available for Hydrogen replacements.
* Results are appended to moleculeIndexesID and moleculeFragmentsID; the fragments held in
* moleculeFragments are modified in place while moleculeFragmentsID is built.
* @param generateWholeMoleculeVariations true/false to generate whole molecule variations, used for identification of H-replacements
*/
public void fragmentMolecule(boolean generateWholeMoleculeVariations) {
// two R-group atoms and one extra bond per supported cut
int[] rGroupsIndex = new int[NUMBER_OF_CUTS*2];
int[] newBondsIndex = new int[NUMBER_OF_CUTS];
Set hFragments = new HashSet();
// the cuts are made on a working molecule; presumably copyMolecule() fills editableMol from mol
StereoMolecule editableMol = new StereoMolecule();
mol.copyMolecule(editableMol);
editableMol.ensureHelperArrays(StereoMolecule.cHelperNeighbours);
// We add R1-R1 and R2-R2 fragments to the molecule;
// These fragments will be used to decompose the molecule and allow recycling of the molecule
// NOTE(review): the next line is incomplete in this copy of the file -- the text between the '<'
// of the loop condition and a later '>' is missing. What is lost covers the creation of the
// R-group atoms and bonds, the loop over the first cut and the declarations of
// singleCutFragments, firstCut, atom0 and atom1 that the lines below rely on. Restore it from
// the upstream source before compiling.
for (int i=0; i= KEYS_MIN_ATOMS) && (VALUE_MAX_ATOMS == null || singleCutFragments.valueIDAtoms <= VALUE_MAX_ATOMS)) {
moleculeIndexesID.add(singleCutFragments);
}
// the same cut is also stored with key and value swapped, under the mirrored size limits
if ((KEYS_MIN_ATOMS == null || singleCutFragments.valueIDAtoms >= KEYS_MIN_ATOMS) && (VALUE_MAX_ATOMS == null || singleCutFragments.keysIDAtoms[0] <= VALUE_MAX_ATOMS)) {
MoleculeIndexID invertedSingleCutFragments = new MoleculeIndexID(new String[]{singleCutFragments.valueID}, singleCutFragments.keysID[0], new int[]{singleCutFragments.valueIDAtoms}, singleCutFragments.keysIDAtoms[0], new int[]{rotBondsIndex.get(firstCut)}, new int[]{atom0});
moleculeIndexesID.add(invertedSingleCutFragments);
}
// idCodes of the single cut fragments that are large enough are collected in hFragments
if (KEYS_MIN_ATOMS == null || singleCutFragments.keysIDAtoms[0] >= KEYS_MIN_ATOMS) {
hFragments.add(singleCutFragments.keysID[0]);
}
if (KEYS_MIN_ATOMS == null || singleCutFragments.valueIDAtoms >= KEYS_MIN_ATOMS) {
hFragments.add(singleCutFragments.valueID);
}
if (NUMBER_OF_CUTS >= DOUBLE_CUT) {
// NOTE(review): the next line is truncated in the same way -- the loop over the second cut,
// the declaration of doubleCutFragments and atom2 and the first half of the size check are
// missing from this copy of the file.
for (int secondCut=firstCut+1; secondCut= KEYS_MIN_ATOMS && doubleCutFragments.keysIDAtoms[1] >= KEYS_MIN_ATOMS)) && (VALUE_MAX_ATOMS == null || doubleCutFragments.valueIDAtoms <= VALUE_MAX_ATOMS)) {
moleculeIndexesID.add(doubleCutFragments);
}
// presumably puts the second cut bond and the R2 bond back before the next iteration -- confirm
editableMol.setBondAtom(1, rotBondsIndex.get(secondCut), atom2);
editableMol.setBondAtom(0, newBondsIndex[1], rGroupsIndex[2]);
}
}
// presumably puts the first cut bond and the R1 bond back before the next iteration -- confirm
editableMol.setBondAtom(1, rotBondsIndex.get(firstCut), atom1); // X-Y
editableMol.setBondAtom(0, newBondsIndex[0], rGroupsIndex[0]); // R1a-R1b
}
// For every stored fragment: turn one R-group atom (the first one found when walking from the
// highest atom index down) into a hydrogen and record {fragment idCode, idCode after the replacement}
for (Entry cursor : moleculeFragments.entrySet()) {
String fragmentID = cursor.getKey();
StereoMolecule moleculeFragment = cursor.getValue();
for (int atom=moleculeFragment.getAtoms()-1; atom>=0; atom--) {
if (moleculeFragment.getAtomicNo(atom) == 0 || moleculeFragment.getAtomicNo(atom) >= 142) {
moleculeFragment.setAtomicNo(atom, 1); //fragment.deleteAtom(atom); // this doesn't work for Cl-R1: both atoms get deleted!!
moleculeFragment.setAtomCustomLabel(atom, (String)null); // otherwise the atom still contains the custom label, and the IDCode won't be canonical (not identical to the whole molecule)
break;
}
}
moleculeFragmentsID.add(new String[]{fragmentID, moleculeFragment.getIDCode()});
}
if (generateWholeMoleculeVariations) {
generateWholeMoleculeVariations();
}
}
/**
* Generates all variations of Hydrogen replacements on the whole molecule;
* fragments are added to the moleculeIndexesID.
*/
public void generateWholeMoleculeVariations() {
HashMap> wholeMoleculeVariations = new HashMap>(); // in case of symmetric part of a molecule, two variations might be identical -> we use HashMap to avoid it
int atomCount = mol.getAtoms();
StereoMolecule editableMol = new StereoMolecule();
mol.copyMolecule(editableMol);
int rGroupIndex = editableMol.addAtom(FRAGMENT_ATOMIC_NO);
editableMol.setAtomCustomLabel(rGroupIndex, FRAGMENT_DELIMITER);
editableMol.ensureHelperArrays(StereoMolecule.cHelperNeighbours);
for (int atom=0; atom 0) {
int newBond = editableMol.addBond(rGroupIndex, atom, StereoMolecule.cBondTypeSingle);
String idCode = getIDCodeWithCustomLabels(editableMol);
ArrayList atomIndexes = new ArrayList();
if (wholeMoleculeVariations.containsKey(idCode)) {
atomIndexes = wholeMoleculeVariations.get(idCode);
}
atomIndexes.add(atom);
wholeMoleculeVariations.put(idCode, atomIndexes);
editableMol.deleteBond(newBond);
}
}
for (Entry> cursor: wholeMoleculeVariations.entrySet()) {
int[] atomIndexes = new int[cursor.getValue().size()];
for(int i=0; i