com.actelion.research.chem.SmilesParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of openchemlib Show documentation
Show all versions of openchemlib Show documentation
Open Source Chemistry Library
/*
* Copyright (c) 1997 - 2016
* Actelion Pharmaceuticals Ltd.
* Gewerbestrasse 16
* CH-4123 Allschwil, Switzerland
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of the the copyright holder nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @author Thomas Sander
*/
package com.actelion.research.chem;
import com.actelion.research.chem.coords.CoordinateInventor;
import com.actelion.research.chem.reaction.Reaction;
import com.actelion.research.util.ArrayUtils;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.TreeMap;
public class SmilesParser {
// The two lowest bits of the constructor's mode argument select how SMARTS features are treated.
protected static final int SMARTS_MODE_MASK = 3;
public static final int SMARTS_MODE_IS_SMILES = 0;
public static final int SMARTS_MODE_GUESS = 1;
public static final int SMARTS_MODE_IS_SMARTS = 2;
// Option flags, which may be OR-ed to one of the SMARTS_MODE_... values above.
public static final int MODE_SKIP_COORDINATE_TEMPLATES = 4;
public static final int MODE_MAKE_HYDROGEN_EXPLICIT = 8;
public static final int MODE_NO_CACTUS_SYNTAX = 16; // if not set, then some CACTVS SMARTS extensions will be recognized and translated as closely as possible
public static final int MODE_SINGLE_DOT_SEPARATOR = 32; // consider single dots '.' (rather than '..') as molecule separator when parsing reactions
public static final int MODE_CREATE_SMARTS_WARNING = 64;
public static final int MODE_ENUMERATE_SMARTS = 128;
// Initial size of the ring closure bookkeeping arrays used while parsing
private static final int INITIAL_CONNECTIONS = 16;
private static final int MAX_CONNECTIONS = 100; // largest allowed one in SMILES is 99
// Size of the array that tracks the base atom per open branch level
private static final int BRACKET_LEVELS = 32;
private static final int MAX_AROMATIC_RING_SIZE = 15;
// Unspecified hydrogen count within brackets means :=0 for SMILES and no-H-restriction for SMARTS.
// Therefore, we have to distinguish it from an explicit H0, which defines a query feature for SMARTS.
protected static final int HYDROGEN_IMPLICIT_ZERO = 9;
// Molecule that is filled by the parse() call currently in progress
private StereoMolecule mMol;
private boolean[] mIsAromaticBond;
// mMode holds the option flags without the SMARTS mode bits; mSmartsMode holds the SMARTS mode bits only
private int mMode,mSmartsMode,mAromaticAtoms,mAromaticBonds,mCoordinateMode;
// A value of 0 means that no seed is passed to the CoordinateInventor
private long mRandomSeed;
private final boolean mCreateSmartsWarnings,mMakeHydrogenExplicit,mSingleDotSeparator;
// Created lazily with the first warning; emptied at the start of every parse
private StringBuilder mSmartsWarningBuffer;
private boolean mSmartsFeatureFound;
// Replaced by a fresh list whenever enumerateSmarts() is called
private ArrayList mEnumerationPositionList;
/**
 * Creates a new SmilesParser that doesn't allow SMARTS features to be present in
 * parsed strings. SMARTS features cause an exception. The fragment flag of created
 * molecules is never set.
 * This is equivalent to {@code new SmilesParser(SMARTS_MODE_IS_SMILES)}, i.e. none of
 * the optional MODE_... flags is set.
 */
public SmilesParser() {
this(SMARTS_MODE_IS_SMILES);
}
/**
* Creates a new SmilesParser that may or may not allow SMARTS features to be present in
* parsed strings. If smartsMode is SMARTS_MODE_IS_SMILES, then any SMARTS features cause
* an exception. If smartsMode is SMARTS_MODE_IS_SMARTS, then the input string is considered
* a SMARTS, e.g. 'CC' is taken as fragment of two non-aromatic carbon atoms connected by a
* single bond and without any implicit hydrogen atoms. If smartsMode is SMARTS_MODE_IS_GUESS,
* then the molecule is considered a substructure if any SMARTS features are discovered.
* Depending on whether SMARTS features are found, created molecules have the fragment flag set
* or not set.
* @param mode one of SMARTS_MODE... and optionally other mode flags
*/
public SmilesParser(int mode) {
mMode = mode & ~SMARTS_MODE_MASK;
mSmartsMode = mode & SMARTS_MODE_MASK;
mSingleDotSeparator = (mode & MODE_SINGLE_DOT_SEPARATOR) != 0;
mCreateSmartsWarnings = (mode & MODE_CREATE_SMARTS_WARNING) != 0;
mMakeHydrogenExplicit = ((mode & MODE_MAKE_HYDROGEN_EXPLICIT) != 0);
mCoordinateMode = CoordinateInventor.MODE_DEFAULT;
if ((mode & MODE_SKIP_COORDINATE_TEMPLATES) != 0)
mCoordinateMode |= CoordinateInventor.MODE_SKIP_DEFAULT_TEMPLATES;
if (mMakeHydrogenExplicit)
mCoordinateMode &= ~CoordinateInventor.MODE_REMOVE_HYDROGEN;
}
/**
 * Depending on the parse() parameters, the SmilesParser may or may not generate new atom coordinates
 * after parsing the SMILES. In difficult cases the employed CoordinateInventor uses random decisions
 * when optimizing colliding coordinates. In strained and bridged ring systems, generated coordinates
 * may not correctly represent all E/Z-bond configurations.
 * Calling this method with a seed != 0 causes the creation of reproducible atom coordinates.
 * The seed is only stored here; it is handed to the CoordinateInventor during parsing and
 * only if it is different from 0.
 * @param seed value different from 0 in order to always create the same reproducible atom coordinates
 */
public void setRandomSeed(long seed) {
mRandomSeed = seed;
}
/**
 * Convenience method to obtain a StereoMolecule from a SMILES string.
 * The string is converted into UTF-8 bytes and passed to parseMolecule(byte[]).
 * @param smiles SMILES to be parsed; may be null
 * @return the parsed molecule; null if smiles is null or if parseMolecule(byte[]) returns null
 */
public StereoMolecule parseMolecule(String smiles) {
    if (smiles == null) {
        return null;
    }
    byte[] smilesBytes = smiles.getBytes(StandardCharsets.UTF_8);
    return parseMolecule(smilesBytes);
}
/**
 * Convenience method to quickly obtain a new StereoMolecule from SMILES bytes.
 * If you process many SMILES, then the parse() methods are preferred, because
 * they allow to re-use one StereoMolecule rather than instantiating a new one
 * for every SMILES.
 * Any exception thrown while parsing is caught and not reported to the caller.
 * @param smiles SMILES to be parsed
 * @return the new molecule or null, if parsing caused an exception
 */
public StereoMolecule parseMolecule(byte[] smiles) {
    StereoMolecule parsed = new StereoMolecule();
    try {
        parse(parsed, smiles);
        return parsed;
    }
    catch (Exception e) {
        // by contract a parsing failure is signalled as null result
        return null;
    }
}
/**
 * Checks whether the given bytes look like a reaction SMILES.
 * Delegates to isReactionSmiles(byte[], int[]) passing null as catalystCountHolder,
 * i.e. without requesting the catalyst count.
 * @param smiles bytes to be checked
 * @return result of isReactionSmiles(smiles, null)
 */
public static boolean isReactionSmiles(byte[] smiles) {
return isReactionSmiles(smiles, null);
}
public static boolean isReactionSmiles(byte[] smiles, int[] catalystCountHolder) {
int count = 0;
int index = -1;
while (count < 3) {
index = ArrayUtils.indexOf(smiles, (byte)'>', index + 1);
while (index>0 && smiles[index - 1] == (byte)'-')
index = ArrayUtils.indexOf(smiles, (byte)'>', index + 1);
if (index == -1)
break;
count++;
if (catalystCountHolder != null && count == 1) {
catalystCountHolder[0] = 0;
if (index+1') {
catalystCountHolder[0] = 1;
for (int i=index+1; i' || smiles[i-1] == '-'); i++)
if (smiles[i] == '.' && smiles[i-1] != '.')
catalystCountHolder[0]++;
}
}
}
return count == 2;
}
/**
 * Parses a reaction SMILES given as String.
 * The string is converted into UTF-8 bytes and passed to parseReaction(byte[]).
 * @param smiles reaction SMILES; may be null
 * @return the parsed reaction or null if smiles is null
 * @throws Exception if parseReaction(byte[]) throws one
 */
public Reaction parseReaction(String smiles) throws Exception {
    if (smiles == null) {
        return null;
    }
    byte[] smilesBytes = smiles.getBytes(StandardCharsets.UTF_8);
    return parseReaction(smilesBytes);
}
public Reaction parseReaction(byte[] smiles) throws Exception {
int index1 = ArrayUtils.indexOf(smiles, (byte)'>');
while (index1 > 0 && smiles[index1-1] == (byte)'-')
index1 = ArrayUtils.indexOf(smiles, (byte)'>', index1+1);
int index2 = (index1 == -1) ? -1 : ArrayUtils.indexOf(smiles, (byte)'>', index1+1);
while (index2 > 0 && smiles[index2-1] == (byte)'-')
index2 = ArrayUtils.indexOf(smiles, (byte)'>', index2+1);
if (index2 == -1)
throw new Exception("Missing one or both separators ('>').");
if (ArrayUtils.indexOf(smiles, (byte)'>', index2+1) != -1)
throw new Exception("Found more than 2 separators ('>').");
Reaction rxn = new Reaction();
int part = 0;
int index = 0;
int closingGroupBracketIndex = -1;
while (index < smiles.length) {
while (index'
&& !(smiles[end] == '.' && ((mSingleDotSeparator && closingGroupBracketIndex==-1) || closingGroupBracketIndex==end-1 || end+1==smiles.length || smiles[end+1] == '.')))
end++;
int molend = end;
if (closingGroupBracketIndex == end-1) {
molend--;
closingGroupBracketIndex = -1;
}
if (index != molend) {
StereoMolecule mol = new StereoMolecule();
parse(mol, smiles, index, molend);
if (mSmartsMode == SMARTS_MODE_GUESS && mSmartsFeatureFound)
return new SmilesParser(mMode | SMARTS_MODE_IS_SMARTS).parseReaction(smiles);
if (part == 0)
rxn.addReactant(mol);
else if (part == 1)
rxn.addCatalyst(mol);
else
rxn.addProduct(mol);
}
index = end;
while (index < smiles.length && smiles[index] == '>') {
index++;
part++;
}
}
return rxn;
}
/**
 * @return the list itself (not a copy) that holds the enumeration positions; null unless
 * enumerateSmarts() or setEnumerationPositionList() has assigned a list before
 */
protected ArrayList getEnumerationPositionList() {
return mEnumerationPositionList;
}
/**
 * Replaces the list that holds the enumeration positions. The given list is stored as is (no copy).
 * @param l new list; presumably expected to contain EnumerationPosition objects only,
 *          because enumerateSmarts() copies the content into an EnumerationPosition[]
 */
protected void setEnumerationPositionList(ArrayList l) {
mEnumerationPositionList = l;
}
/**
 * Expands a SMARTS string into a list of SMARTS strings by enumerating the options
 * found at every registered enumeration position.
 * The method first switches this parser permanently into SMARTS_MODE_IS_SMARTS and sets
 * MODE_ENUMERATE_SMARTS; both changes remain in effect after the method returns.
 * Then the SMARTS is parsed once into a throw-away molecule. During that parse the
 * enumeration position list is expected to be filled (presumably by the atom parser via
 * getEnumerationPositionList(); this is not visible in this method).
 * If parsing fails, the exception message is written to System.out and the enumeration
 * continues with whatever positions have been collected so far.
 * @param smarts SMARTS string to be enumerated
 * @return all enumerated SMARTS strings; the original SMARTS as only element,
 *         if no enumeration position was registered
 * @throws Exception if EnumerationPosition.enumerate() throws one
 */
public String[] enumerateSmarts(String smarts) throws Exception {
    mEnumerationPositionList = new ArrayList<>();
    mSmartsMode = SMARTS_MODE_IS_SMARTS;
    mMode |= MODE_ENUMERATE_SMARTS;

    ArrayList<String> smartsList = new ArrayList<>();
    smartsList.add(smarts);

    try {
        parse(new StereoMolecule(), smarts);
    }
    catch (Exception e) {
        // best effort: report the problem and enumerate with the positions found so far
        System.out.println(e.getMessage());
    }

    // The explicit cast keeps this statement well-typed no matter whether the list
    // is declared with or without a type argument.
    EnumerationPosition[] options = (EnumerationPosition[])mEnumerationPositionList.toArray(new EnumerationPosition[0]);
    Arrays.sort(options);

    // Every position multiplies the strings enumerated so far by its number of options.
    for (EnumerationPosition option : options) {
        ArrayList<String> enumeration = new ArrayList<>();
        for (String s : smartsList)
            option.enumerate(this, s.getBytes(StandardCharsets.UTF_8), enumeration);
        smartsList = enumeration;
    }

    return smartsList.toArray(new String[0]);
}
/**
 * If MODE_CREATE_SMARTS_WARNING was part of the mode passed to the constructor, then this
 * method returns a list of all SMARTS features, which could not be interpreted in the most
 * recently parsed SMILES/SMARTS pattern.
 * The warning buffer is created with the first warning and merely emptied at the start of
 * every following parse. Therefore, an existing but empty buffer means 'no warnings'
 * just like a missing buffer does.
 * @return "Unresolved SMARTS features:" followed by the space separated features;
 *         an empty string if the most recent parse didn't create any warning
 */
public String getSmartsWarning() {
    if (mSmartsWarningBuffer == null || mSmartsWarningBuffer.length() == 0)
        return "";

    return "Unresolved SMARTS features:"+mSmartsWarningBuffer;
}
/**
 * Parses the given smiles into the molecule, creates proper atom coordinates
 * to reflect correct double bond geometries and translates tetrahedral and allene
 * parities into up/down-bonds. Whether SMARTS features are accepted depends on the
 * SMARTS mode that was passed to the constructor.
 * The string is converted into UTF-8 bytes and parsed with coordinate creation and
 * reading of stereo features both switched on.
 * @param mol molecule to receive the parsed structure
 * @param smiles SMILES to be parsed; must not be null
 * @throws Exception if the bytes based parse() method throws one
 */
public void parse(StereoMolecule mol, String smiles) throws Exception {
parse(mol, smiles.getBytes(StandardCharsets.UTF_8), true, true);
}
/**
 * Parses the complete byte array with coordinate creation and reading of
 * stereo features both switched on.
 * @param mol molecule to receive the parsed structure
 * @param smiles SMILES bytes to be parsed
 * @throws Exception if the delegate parse() method throws one
 */
public void parse(StereoMolecule mol, byte[] smiles) throws Exception {
parse(mol, smiles, true, true);
}
/**
 * Parses a section of the byte array with coordinate creation and reading of
 * stereo features both switched on.
 * @param mol molecule to receive the parsed structure
 * @param smiles bytes containing the SMILES
 * @param position index of the first byte to be parsed
 * @param endIndex index at which parsing ends
 * @throws Exception if the delegate parse() method throws one
 */
public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex) throws Exception {
parse(mol, smiles, position, endIndex, true, true);
}
/**
 * Parses the complete byte array, i.e. from index 0 to smiles.length.
 * @param mol molecule to receive the parsed structure
 * @param smiles SMILES bytes to be parsed; must not be null
 * @param createCoordinates passed on unchanged; whether atom coordinates shall be created after parsing
 * @param readStereoFeatures passed on unchanged; whether stereo features of the SMILES shall be considered
 * @throws Exception if the delegate parse() method throws one
 */
public void parse(StereoMolecule mol, byte[] smiles, boolean createCoordinates, boolean readStereoFeatures) throws Exception {
parse(mol, smiles, 0, smiles.length, createCoordinates, readStereoFeatures);
}
public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex, boolean createCoordinates, boolean readStereoFeatures) throws Exception {
mMol = mol;
mMol.clear();
if (mSmartsWarningBuffer != null)
mSmartsWarningBuffer.setLength(0);
mAromaticAtoms = 0;
mSmartsFeatureFound = false;
boolean allowSmarts = (mSmartsMode != SMARTS_MODE_IS_SMILES);
TreeMap parityMap = null;
int[] baseAtom = new int[BRACKET_LEVELS];
baseAtom[0] = -1;
int[] ringClosureAtom = new int[INITIAL_CONNECTIONS];
int[] ringClosurePosition = new int[INITIAL_CONNECTIONS];
int[] ringClosureBondType = new int[INITIAL_CONNECTIONS];
int[] ringClosureBondQueryFeatures = new int[INITIAL_CONNECTIONS];
for (int i = 0; i();
// using position as hydrogenPosition is close enough
int hydrogenCount = (atomParser.explicitHydrogens == HYDROGEN_IMPLICIT_ZERO) ? 0 : atomParser.explicitHydrogens;
parityMap.put(atom, new THParity(atom, position - 2, fromAtom, hydrogenCount, position - 1, atomParser.isClockwise));
}
}
continue;
}
if (theChar == '.') {
baseAtom[bracketLevel] = -1;
bondType = Molecule.cBondTypeDeleted;
continue;
}
if (isBondSymbol(theChar)) {
if (squareBracketOpen)
throw new Exception("SmilesParser: unexpected bond symbol inside square brackets: '"+theChar+"', position:"+(position-1));
int excludedBonds = 0;
while (isBondSymbol(theChar)) {
if (theChar == '!') {
theChar = (char)smiles[position++];
if (theChar == '@')
bondQueryFeatures |= Molecule.cBondQFNotRing;
else if ((theChar == '-' && smiles[position] == '>')
|| (theChar == '<' && smiles[position] == '-')) {
excludedBonds |= Molecule.cBondTypeMetalLigand;
position++;
}
else if (theChar == '-')
excludedBonds |= Molecule.cBondQFSingle;
else if (theChar == '=')
excludedBonds |= Molecule.cBondQFDouble;
else if (theChar == '#')
excludedBonds |= Molecule.cBondQFTriple;
else if (theChar == '$')
excludedBonds |= Molecule.cBondQFQuadruple;
else if (theChar == ':')
excludedBonds |= Molecule.cBondQFDelocalized;
else
throw new Exception("SmilesParser: bond symbol '"+theChar+"' not allowed after '!'. Position:"+(position-1));
}
else {
if (theChar == '@')
bondQueryFeatures |= Molecule.cBondQFRing;
else if (theChar == '=')
bondType = Molecule.cBondTypeDouble;
else if (theChar == '#')
bondType = Molecule.cBondTypeTriple;
else if (theChar == '$')
bondType = Molecule.cBondTypeQuadruple;
else if (theChar == ':')
bondType = Molecule.cBondTypeDelocalized;
else if (theChar == '~')
bondQueryFeatures |= Molecule.cBondQFSingle | Molecule.cBondQFDouble | Molecule.cBondQFTriple | Molecule.cBondQFDelocalized;
else if (theChar == '/') {
if (readStereoFeatures)
bondType = Molecule.cBondTypeUp; // encode slash temporarily in bondType
}
else if (theChar == '\\') {
if (readStereoFeatures)
bondType = Molecule.cBondTypeDown; // encode slash temporarily in bondType
}
// Smiles extention 'dative bond'
else if ((theChar == '-' && smiles[position] == '>')
|| (theChar == '<' && smiles[position] == '-')) {
bondType = Molecule.cBondTypeMetalLigand;
position++;
}
if (smiles[position] == ',') {
bondQueryFeatures |= bondSymbolToQueryFeature(bondType == Molecule.cBondTypeMetalLigand ? '>' : theChar);
while (smiles[position] == ',') {
if ((smiles[position+1] == '<' && smiles[position+2] == '-')
|| (smiles[position+1] == '-' && smiles[position+2] == '>')) {
bondQueryFeatures |= bondSymbolToQueryFeature('>');
position += 3;
}
else {
bondQueryFeatures |= bondSymbolToQueryFeature((char)smiles[position+1]);
position += 2;
}
}
}
}
if (smiles[position] == ';') {
position++;
theChar = (char)smiles[position++];
continue;
}
if (excludedBonds != 0)
bondQueryFeatures |= Molecule.cBondQFBondTypes & ~excludedBonds;
break;
}
continue;
}
if (theChar <= ' ') { // we stop reading at whitespace
position = endIndex;
continue;
}
if (Character.isDigit(theChar)) {
int number = theChar - '0';
if (squareBracketOpen) {
while (position < endIndex
&& Character.isDigit(smiles[position])) {
number = 10 * number + smiles[position] - '0';
position++;
}
atomMass = number;
}
else {
int bondTypePosition = isDoubleDigit ? position - 3 : position - 2;
boolean hasBondType = (smiles[bondTypePosition] == '-'
|| smiles[bondTypePosition] == '/'
|| smiles[bondTypePosition] == '\\'
|| smiles[bondTypePosition] == '='
|| smiles[bondTypePosition] == '#'
|| smiles[bondTypePosition] == '$'
|| smiles[bondTypePosition] == ':'
|| smiles[bondTypePosition] == '>'
|| smiles[bondTypePosition] == '~');
if (isDoubleDigit
&& position < endIndex
&& Character.isDigit(smiles[position])) {
number = 10 * number + smiles[position] - '0';
isDoubleDigit = false;
position++;
}
if (number >= ringClosureAtom.length) {
if (number >=MAX_CONNECTIONS)
throw new Exception("SmilesParser: ringClosureAtom number out of range: "+number);
int oldSize = ringClosureAtom.length;
int newSize = ringClosureAtom.length;
while (newSize <= number)
newSize = Math.min(MAX_CONNECTIONS, newSize + INITIAL_CONNECTIONS);
ringClosureAtom = Arrays.copyOf(ringClosureAtom, newSize);
ringClosurePosition = Arrays.copyOf(ringClosurePosition, newSize);
ringClosureBondType = Arrays.copyOf(ringClosureBondType, newSize);
ringClosureBondQueryFeatures = Arrays.copyOf(ringClosureBondQueryFeatures, newSize);
for (int i=oldSize; i= 1)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot0Hydrogen, true);
if (explicitHydrogen >= 2)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot1Hydrogen, true);
if (explicitHydrogen >= 3)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot2Hydrogen, true);
if (explicitHydrogen >= 4)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot3Hydrogen, true);
}
}
if (!mMakeHydrogenExplicit && (mSmartsFeatureFound || mSmartsMode == SMARTS_MODE_IS_SMARTS))
mMol.removeExplicitHydrogens();
mMol.ensureHelperArrays(Molecule.cHelperNeighbours);
correctValenceExceededNitrogen(); // convert pyridine oxides and nitro into polar structures with valid nitrogen valences
locateAromaticDoubleBonds(allowSmarts, mSmartsFeatureFound);
mMol.removeAtomCustomLabels();
mMol.setHydrogenProtection(false);
if (readStereoFeatures) {
assignKnownEZBondParities();
if (parityMap != null) {
for (THParity parity:parityMap.values())
mMol.setAtomParity(handleHydrogenAtomMap[parity.mCentralAtom], parity.calculateParity(handleHydrogenAtomMap), false);
mMol.setParitiesValid(0);
}
}
// defines unknown EZ parities as such, i.e. prevent coordinate generation to create implicit EZ-parities
mMol.setParitiesValid(0);
if (createCoordinates) {
CoordinateInventor inventor = new CoordinateInventor(mCoordinateMode);
if (mRandomSeed != 0)
inventor.setRandomSeed(mRandomSeed);
inventor.invent(mMol);
if (readStereoFeatures)
mMol.setUnknownParitiesToExplicitlyUnknown();
}
if (mSmartsFeatureFound || mSmartsMode == SMARTS_MODE_IS_SMARTS) {
mMol.setFragment(true);
mMol.validateAtomQueryFeatures();
mMol.validateBondQueryFeatures();
}
}
/**
 * Reports the SMARTS-feature-found flag, which is reset to false at the start of every parse.
 * @return true if the previously parsed SMILES contained a SMARTS feature and was not parsed with SMARTS_MODE_IS_SMILES
 */
public boolean isSmarts() {
return mSmartsFeatureFound;
}
/**
 * Checks whether a character may start or be part of a bond description.
 * Besides the plain bond type symbols this includes '<' (first character of the
 * '<-' notation), '!' (negation) and '@' (ring bond).
 * @param theChar character to be checked
 * @return true if theChar is one of - = # $ : / \ < ~ ! @
 */
private boolean isBondSymbol(char theChar) {
    switch (theChar) {
        case '-':
        case '=':
        case '#':
        case '$':
        case ':':
        case '/':
        case '\\':
        case '<':
        case '~':
        case '!':
        case '@':
            return true;
        default:
            return false;
    }
}
/**
 * Translates a bond symbol into the corresponding bond query feature.
 * The character '>' stands for a metal ligand bond and '~' for any of the bond types.
 * @param symbol bond symbol
 * @return the matching Molecule.cBondQF... value; Molecule.cBondQFSingle for any
 *         symbol that is not explicitly handled
 */
private int bondSymbolToQueryFeature(char symbol) {
    switch (symbol) {
        case '=':
            return Molecule.cBondQFDouble;
        case '#':
            return Molecule.cBondQFTriple;
        case '$':
            return Molecule.cBondQFQuadruple;
        case ':':
            return Molecule.cBondQFDelocalized;
        case '>':
            return Molecule.cBondQFMetalLigand;
        case '~':
            return Molecule.cBondQFBondTypes;
        default:
            return Molecule.cBondQFSingle;
    }
}
/**
 * Registers a SMARTS feature that could not be interpreted.
 * Nothing happens unless the creation of SMARTS warnings was requested in the constructor.
 * Otherwise, a space followed by the feature is appended to the warning buffer,
 * which is created on first use.
 * @param feature description of the unresolved feature
 */
protected void smartsWarning(String feature) {
    if (!mCreateSmartsWarnings)
        return;

    if (mSmartsWarningBuffer == null)
        mSmartsWarningBuffer = new StringBuilder();

    mSmartsWarningBuffer.append(" ").append(feature);
}
private void locateAromaticDoubleBonds(boolean allowSmartsFeatures, boolean smartsFeatureFound) throws Exception {
mMol.ensureHelperArrays(Molecule.cHelperNeighbours);
mIsAromaticBond = new boolean[mMol.getBonds()];
mAromaticBonds = 0;
// all explicitly defined aromatic bonds are taken
for (int bond=0; bond=0) System.arraycopy(mIsAromaticBond, 0, isAromaticBond, 0, mMol.getBonds());
// Some Smiles contain 'aromatic' rings with atoms not being compatible
// with a PI-bond. These include: tertiary non-charged nitrogen, [nH],
// sulfur, non-charged oxygen, charged carbon, etc...
// All these atoms and attached bonds are marked as handled to avoid
// attached bonds to be promoted (changed to double bond) later.
for (int ring=0; ring=4; qualifyingNo--) {
do {
qualifyingBondFound = false;
for (int bond=0; bond= 2)
if (!connectConjugatedRadicalPairs(isAromaticBond))
break;
if (allowSmartsFeatures) {
if (mAromaticAtoms != 0) {
for (int atom=0; atom 0)
return false;
}
int explicitHydrogens = (mMol.getAtomCustomLabel(atom) == null || mMol.getAtomCustomLabelBytes(atom)[0] == HYDROGEN_IMPLICIT_ZERO) ?
0 : mMol.getAtomCustomLabelBytes(atom)[0];
int freeValence = mMol.getFreeValence(atom) - explicitHydrogens;
if (freeValence < 1)
return false;
if (mMol.getAtomicNo(atom) == 16
|| mMol.getAtomicNo(atom) == 34
|| mMol.getAtomicNo(atom) == 52) {
if (mMol.getConnAtoms(atom) == 2 && mMol.getAtomCharge(atom) <= 0)
return false;
return freeValence != 2; // e.g. -S(=O)- correction to account for tetravalent S,Se
}
return true;
}
private void promoteBond(int bond) {
if (mMol.getBondType(bond) == Molecule.cBondTypeSingle)
mMol.setBondType(bond, Molecule.cBondTypeDouble);
for (int i=0; i<2; i++) {
int bondAtom = mMol.getBondAtom(i, bond);
if (mMol.isMarkedAtom(bondAtom)) {
mMol.setAtomMarker(bondAtom, false);
mAromaticAtoms--;
}
for (int j=0; j 3
&& mMol.getAtomPi(atom) > 0) {
for (int i=0; i 1)
&& mMol.isElectronegative(connAtom)) {
if (mMol.getBondType(connBond) == Molecule.cBondTypeTriple)
mMol.setBondType(connBond, Molecule.cBondTypeDouble);
else
mMol.setBondType(connBond, Molecule.cBondTypeSingle);
mMol.setAtomCharge(atom, mMol.getAtomCharge(atom) + 1);
mMol.setAtomCharge(connAtom, mMol.getAtomCharge(connAtom) - 1);
mMol.setAtomAbnormalValence(atom, -1);
break;
}
}
}
}
}
private boolean assignKnownEZBondParities() {
mMol.ensureHelperArrays(Molecule.cHelperRings);
boolean paritiesFound = false;
int[] refAtom = new int[2];
int[] refBond = new int[2];
int[] otherAtom = new int[2];
for (int bond=0; bond {
int mPosition,mCount;
/**
 * Creates a new enumeration position with an initial option count of 1.
 * @param position position of first option in original smarts
 */
public EnumerationPosition(int position) {
mPosition = position;
mCount = 1;
}
/**
 * Increments the option counter of this position by one.
 */
public void increase() {
mCount++;
}
/**
 * Creates one SMARTS string for every option found at this position and adds it to enumeration.
 * Starting at mPosition, the option boundaries are determined by repeatedly calling
 * SmilesAtomParser.parseAtomInsideBrackets() until the byte at the returned boundary is ']'.
 * Every created string consists of the bytes in front of mPosition, one of the options,
 * and all bytes from the closing ']' onwards.
 * If the first boundary already is the closing ']', then no option is collected and
 * nothing is added to enumeration.
 * The bytes are decoded as UTF-8, which is the encoding used by enumerateSmarts()
 * when it creates them.
 * @param parser parser that is passed to the SmilesAtomParser
 * @param smarts UTF-8 encoded SMARTS containing this position
 * @param enumeration list that receives the created SMARTS strings
 * @throws Exception if parseAtomInsideBrackets() throws one
 */
public void enumerate(SmilesParser parser, byte[] smarts, ArrayList enumeration) throws Exception {
    ArrayList<String> optionList = new ArrayList<>();

    int start = mPosition;
    SmilesAtomParser atomParser = new SmilesAtomParser(parser, mMode | mSmartsMode);
    int end = atomParser.parseAtomInsideBrackets(smarts, start+1, smarts.length, true, true)-1;

    if (smarts[end] != ']') {   // we have multiple options and create an option list
        optionList.add(new String(smarts, start, end-start, StandardCharsets.UTF_8));
        while (smarts[end] != ']') {
            start = end+1;  // skip the separator between two options
            end = atomParser.parseAtomInsideBrackets(smarts, start+1, smarts.length, true, true)-1;
            optionList.add(new String(smarts, start, end-start, StandardCharsets.UTF_8));
        }
    }

    if (optionList.isEmpty())
        return;

    // these parts are the same for all options
    String prefix = new String(smarts, 0, mPosition, StandardCharsets.UTF_8);
    String suffix = new String(smarts, end, smarts.length-end, StandardCharsets.UTF_8);

    for (String option : optionList)
        enumeration.add(prefix + option + suffix);
}
/**
 * Compares by mPosition in reversed order (note the swapped arguments), which causes
 * sorting to put positions with higher mPosition first.
 * NOTE(review): presumably this lets enumerateSmarts() process positions from the end of the
 * string towards its start, such that earlier positions remain valid - confirm.
 */
@Override
public int compareTo(EnumerationPosition o) {
return Integer.compare(o.mPosition, mPosition);
}
}
/**
 * Neighbour of a stereo center as used for the parity calculation:
 * an atom index (or one of the pseudo atom markers) together with a position in the SMILES.
 */
private static class ParityNeighbour {
    int mAtom;
    int mPosition;

    public ParityNeighbour(int atom, int position) {
        this.mAtom = atom;
        this.mPosition = position;
    }
}
private static class THParity {
// Marker values, which are used instead of a real atom index within the neighbour list:
// PSEUDO_ATOM_HYDROGEN represents a hydrogen that was given as H count of the central atom;
// PSEUDO_ATOM_LONE_PAIR represents the electron pair that is added if only three neighbours exist.
private static final int PSEUDO_ATOM_HYDROGEN = Integer.MAX_VALUE - 1;
private static final int PSEUDO_ATOM_LONE_PAIR = Integer.MAX_VALUE;
int mCentralAtom,mCentralAtomPosition;
// If mError is set, then calculateParity() returns Molecule.cAtomParityUnknown
boolean mIsClockwise,mError;
// Neighbours in the order in which they were added; not created if the constructor detects an error
ArrayList mNeighbourList;
/**
 * Instantiates a new parity object during smiles traversal.
 * A hydrogen count other than 0 or 1 puts the object into error state; in that case
 * none of the other fields is initialized.
 * Otherwise, if there is a parent atom, it is registered as first neighbour at the
 * position just before the central atom. If in addition one hydrogen is defined within the
 * brackets, then a pseudo hydrogen is registered at the position just after the central atom.
 * A bracket hydrogen of a central atom without parent atom is not registered here.
 * @param centralAtom index of atom processed
 * @param centralAtomPosition position in SMILES of central atom
 * @param fromAtom index of parent atom of centralAtom (-1 if centralAtom is first atom in smiles)
 * @param explicitHydrogen Daylight syntax: hydrogen atoms defined within square bracket of other atom
 * @param hydrogenPosition not used by this constructor; centralAtomPosition+1 is taken as hydrogen position instead
 * @param isClockwise true if central atom is marked with @@ rather than @
 */
public THParity(int centralAtom, int centralAtomPosition, int fromAtom, int explicitHydrogen, int hydrogenPosition, boolean isClockwise) {
    boolean validHydrogenCount = (explicitHydrogen == 0 || explicitHydrogen == 1);
    if (!validHydrogenCount) {
        mError = true;
        return;
    }

    mCentralAtom = centralAtom;
    mCentralAtomPosition = centralAtomPosition;
    mIsClockwise = isClockwise;
    mNeighbourList = new ArrayList<>();

    // Only with a parent atom present, the parent and a potential bracket hydrogen
    // are added as normal neighbours right away.
    if (fromAtom != -1) {
        addNeighbor(fromAtom, centralAtomPosition-1, false);
        if (explicitHydrogen == 1)
            addNeighbor(PSEUDO_ATOM_HYDROGEN, centralAtomPosition+1, false);
    }
}
/**
 * Adds a currently traversed neighbor or ring closure to this parity object,
 * which belongs to the neighbor's parent atom.
 * In case of a ring closure the position of the bond closure digit in the smiles
 * rather than the neighbor's position is the relevant position used for parity
 * determination.
 * The atom needs to be tracked, because neighbors are not necessarily added in atom
 * sequence (ring closure with connection back to stereo center).
 * Nothing happens if this object is in error state already. An attempt to add a
 * fifth neighbor puts this object into error state.
 * @param atom atom index of the neighbor or one of the pseudo atom markers
 * @param position position in the smiles that is relevant for this neighbor
 * @param unused not evaluated
 */
public void addNeighbor(int atom, int position, boolean unused) {
    if (mError)
        return;

    if (mNeighbourList.size() == 4) {
        mError = true;
        return;
    }

    ParityNeighbour neighbour = new ParityNeighbour(atom, position);
    mNeighbourList.add(neighbour);
}
public int calculateParity(int[] handleHydrogenAtomMap) {
if (mError)
return Molecule.cAtomParityUnknown;
// We need to translate smiles-parse-time atom indexes to those that the molecule
// uses after calling handleHydrogens, which is called from ensureHelperArrays().
for (ParityNeighbour neighbour:mNeighbourList)
if (neighbour.mAtom != PSEUDO_ATOM_HYDROGEN && neighbour.mAtom != PSEUDO_ATOM_LONE_PAIR)
neighbour.mAtom = handleHydrogenAtomMap[neighbour.mAtom];
if (mNeighbourList.size() == 3)
// All hydrogens atoms within SMILES all stereo centers all hydrogens must be explicit (as explicit atoms or as H count in square brackets).
// Therefore, three neighbour atoms is a rare situation, e.g. CC[S@](=O)C or frozen out CC[N@H]C
// In these cases we add the electron pair as pseudo neighbour
mNeighbourList.add(new ParityNeighbour(PSEUDO_ATOM_LONE_PAIR, mCentralAtomPosition));
else if (mNeighbourList.size() != 4)
return Molecule.cAtomParityUnknown;
/*
System.out.println();
System.out.println("central:"+mCentralAtom+(mIsClockwise?" @@":" @")+" from:"
+((mFromAtom == -1)?"none":Integer.toString(mFromAtom))+" with "+mImplicitHydrogen+" hydrogens");
System.out.print("neighbors: "+mNeighborAtom[0]+"("+mNeighborPosition[0]+(mNeighborIsHydrogen[0]?",H":",non-H")+")");
for (int i=1; i mNeighbourList.get(i).mAtom)
inversion = !inversion;
if (mNeighbourList.get(j).mPosition > mNeighbourList.get(i).mPosition)
inversion = !inversion;
}
}
return inversion;
}
}
/**
 * Parses a couple of stereo SMILES, creates an isomeric SMILES from every parsed
 * molecule and prints to System.out whether it matches the expected one.
 * Exceptions are printed as stack trace and don't stop the test run.
 */
private static void testStereo() {
    // every row: { SMILES to be parsed, expected isomeric SMILES }
    final String[][] cases = {
        { "F/C=C/I", "F/C=C/I" },
        { "F/C=C\\I", "F/C=C\\I" },
        { "C(=C/I)/F", "F/C=C\\I" },
        { "[H]C(/F)=C/I", "F/C=C\\I" },
        { "C(=C\\1)/I.F1", "F/C=C/I" },
        { "C(=C1)/I.F/1", "F/C=C/I" },
        { "C(=C\\F)/1.I1", "F/C=C/I" },
        { "C(=C\\F)1.I\\1", "F/C=C/I" },
        { "C\\1=C/I.F1", "F/C=C/I" },
        { "C1=C/I.F/1", "F/C=C/I" },
        { "C(=C\\1)/2.F1.I2", "F/C=C/I" },
        { "C/2=C\\1.F1.I2", "F/C=C/I" },
        { "C/1=C/C=C/F.I1", "F/C=C/C=C\\I" },
        { "C1=C/C=C/F.I\\1", "F/C=C/C=C\\I" },
        { "C(/I)=C/C=C/1.F1", "F/C=C/C=C\\I" },
        { "C(/I)=C/C=C1.F\\1", "F/C=C/C=C\\I" },
        { "[C@](Cl)(F)(I)1.Br1", "F[C@](Cl)(Br)I" },
        { "Br[C@](Cl)(I)1.F1", "F[C@](Cl)(Br)I" },
        { "[C@H](F)(I)1.Br1", "F[C@H](Br)I" },
        { "Br[C@@H](F)1.I1", "F[C@H](Br)I" },
        { "C[S@@](CC)=O", "CC[S@](C)=O" },
        { "[S@](=O)(C)CC", "CC[S](C)=O" }
    };

    StereoMolecule molecule = new StereoMolecule();
    for (int i = 0; i < cases.length; i++) {
        String input = cases[i][0];
        String expected = cases[i][1];
        try {
            new SmilesParser().parse(molecule, input);
            String created = new IsomericSmilesCreator(molecule).getSmiles();
            System.out.print("IN:"+input+" OUT:"+created);
            if (expected.equals(created))
                System.out.println(" OK");
            else
                System.out.println(" EXPECTED: "+expected+" ERROR!");
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}
/**
 * Manual self test: Runs testStereo() and then an ID-code equivalence test.
 * For the latter every SMILES of the table is parsed with a fresh default SmilesParser and
 * the ID-code created by a Canonizer is compared to the expected one.
 * Only deviations are printed to System.out; matching rows don't create any output.
 * @param args not used
 */
public static void main(String[] args) {
testStereo();
System.out.println("ID-code equivalence test:");
// Every row: { SMILES, name used in messages, expected ID-code }.
// The expected ID-code "error" marks SMILES that are expected to cause an exception when parsed.
final String[][] data = { { "N[C@@]([H])(C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
{ "N[C@@H](C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
{ "N[C@H](C(=O)O)C", "S-alanine", "gGX`BDdwMUM@@" },
{ "[H][C@](N)(C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
{ "[C@H](N)(C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
{ "N[C@]([H])(C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
{ "N[C@H](C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
{ "N[C@@H](C(=O)O)C", "R-alanine", "gGX`BDdwMUL`@" },
{ "[H][C@@](N)(C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
{ "[C@@H](N)(C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
{ "C[C@H]1CCCCO1", "S-Methyl-pyran", "gOq@@eLm]UUH`@" },
{ "O1CCCC[C@@H]1C", "S-Methyl-pyran", "gOq@@eLm]UUH`@" },
// NOTE(review): the name of the following row doesn't seem to match its SMILES - confirm
{ "[C@H](F)(B)O", "S-Methyl-oxetan", "gCaDDICTBSURH@" },
{ "C1CO[C@H]1C", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
{ "C1CO[C@@H](C)1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
{ "[C@H]1(C)CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
{ "[H][C@]1(C)CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
{ "[H][C@@]1(CCO1)C", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
{ "[C@@]1([H])(C)CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
{ "[C@]1(C)([H])CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
{ "C1[C@@H]2COC2=N1", "oxetan-azetin", "gGy@LDimDvfja`@" },
{ "CC(C)[C@@]12C[C@@H]1[C@@H](C)C(=O)C2", "alpha-thujone", "dmLH@@RYe~IfyjjjkDaIh@" },
{ "CN1CCC[C@H]1c2cccnc2", "Nicotine", "dcm@@@{IDeCEDUSh@UUECP@" },
{ "CC[C@H](O1)CC[C@@]12CCCO2", "2S,5R-Chalcogran", "dmLD@@qJZY|fFZjjjdbH`@" },
{ "CCCC", "butane", "gC`@Dij@@" },
{ "C1C.CC1", "butane", "gC`@Dij@@" },
{ "[CH3][CH2][CH2][CH3]", "butane", "gC`@Dij@@" },
{ "C-C-C-C", "butane", "gC`@Dij@@" },
{ "C12.C1.CC2", "butane", "gC`@Dij@@" },
{ "[Na+].[Cl-]", "NaCl", "eDARHm@zd@@" },
{ "[Na+]-[Cl-]", "NaCl", "error" },
{ "[Na+]1.[Cl-]1", "NaCl", "error" },
{ "c1ccccc1", "benzene", "gFp@DiTt@@@" },
{ "C1=C-C=C-C=C1", "benzene", "gFp@DiTt@@@" },
{ "C1:C:C:C:C:C:1", "benzene", "gFp@DiTt@@@" },
{ "c1ccncc1", "pyridine", "gFx@@eJf`@@@" },
{ "[nH]1cccc1", "pyrrole", "gKX@@eKcRp@" },
{ "N1C=C-C=C1", "pyrrole", "gKX@@eKcRp@" },
// NOTE(review): the following two rows are identical
{ "[H]n1cccc1", "pyrrole", "gKX@@eKcRp@" },
{ "[H]n1cccc1", "pyrrole", "gKX@@eKcRp@" },
{ "c1cncc1", "pyrrole no [nH]", "error" },
{ "[13CH4]", "C13-methane", "fH@FJp@" },
{ "[35ClH]", "35-chlorane", "fHdP@qX`" },
{ "[35Cl-]", "35-chloride", "fHtPxAbq@" },
{ "[Na+].[O-]c1ccccc1", "Na-phenolate", "daxHaHCPBXyAYUn`@@@" },
{ "c1cc([O-].[Na+])ccc1", "Na-phenolate", "daxHaHCPBXyAYUn`@@@" },
{ "C[C@@](C)(O1)C[C@@H](O)[C@@]1(O2)[C@@H](C)[C@@H]3CC=C4[C@]3(C2)C(=O)C[C@H]5[C@H]4CC[C@@H](C6)[C@]5(C)Cc(n7)c6nc(C[C@@]89(C))c7C[C@@H]8CC[C@@H]%10[C@@H]9C[C@@H](O)[C@@]%11(C)C%10=C[C@H](O%12)[C@]%11(O)[C@H](C)[C@]%12(O%13)[C@H](O)C[C@@]%13(C)CO",
"Cephalostatin-1",
"gdKe@h@@K`H@XjKHuYlnoP\\bbdRbbVTLbTrJbRaQRRRbTJTRTrfrfTTOBPHtFODPhLNSMdIERYJmShLfs]aqy|uUMUUUUUUE@UUUUMUUUUUUTQUUTPR`nDdQQKB|RIFbiQeARuQt`rSSMNtGS\\ct@@" },
};
StereoMolecule mol = new StereoMolecule();
for (String[] test:data) {
try {
new SmilesParser().parse(mol, test[0]);
String idcode = new Canonizer(mol).getIDCode();
// parsing succeeded: complain if an exception was expected or if the ID-code differs
if (test[2].equals("error"))
System.out.println("Should create error! "+test[1]+" smiles:"+test[0]+" idcode:"+idcode);
else if (!test[2].equals(idcode))
System.out.println("ERROR! "+test[1]+" smiles:"+test[0]+" is:"+idcode+" must:"+test[2]);
}
catch (Exception e) {
// an exception is a failure only if the row was not expected to cause one
if (!test[2].equals("error"))
System.out.println("ERROR! "+test[1]+" smiles:"+test[0]+" exception:"+e.getMessage());
}
}
}
}