
com.actelion.research.chem.SmilesParser Maven / Gradle / Ivy
/*
* Copyright (c) 1997 - 2016
* Actelion Pharmaceuticals Ltd.
* Gewerbestrasse 16
* CH-4123 Allschwil, Switzerland
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of the the copyright holder nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @author Thomas Sander
*/
package com.actelion.research.chem;
import com.actelion.research.chem.coords.CoordinateInventor;
import com.actelion.research.chem.reaction.Reaction;
import com.actelion.research.util.ArrayUtils;
import com.actelion.research.util.SortedList;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.TreeMap;
public class SmilesParser {
// The two lowest bits of the constructor's mode argument select the SMARTS mode;
// all higher bits are independent option flags.
private static final int SMARTS_MODE_MASK = 3;
// Input is plain SMILES; SMARTS features cause an exception.
public static final int SMARTS_MODE_IS_SMILES = 0;
// Input is treated as a substructure query if any SMARTS feature is discovered.
public static final int SMARTS_MODE_GUESS = 1;
// Input is always treated as SMARTS.
public static final int SMARTS_MODE_IS_SMARTS = 2;
// Adds CoordinateInventor.MODE_SKIP_DEFAULT_TEMPLATES to the coordinate invention mode.
public static final int MODE_SKIP_COORDINATE_TEMPLATES = 4;
// Keeps explicit hydrogens: clears CoordinateInventor.MODE_REMOVE_HYDROGEN and suppresses their removal after parsing.
public static final int MODE_MAKE_HYDROGEN_EXPLICIT = 8;
public static final int MODE_NO_CACTUS_SYNTAX = 16; // if not set, then some CACTVS SMARTS extensions will be recognized and translated as closely as possible
public static final int MODE_SINGLE_DOT_SEPARATOR = 32; // consider single dots '.' (rather than '..') as molecule separator when parsing reactions
// Collect SMARTS features that could not be interpreted; retrievable through getSmartsWarning().
public static final int MODE_CREATE_SMARTS_WARNING = 64;
// Initial length of the ring closure bookkeeping arrays; they grow in steps of this size.
private static final int INITIAL_CONNECTIONS = 16;
private static final int MAX_CONNECTIONS = 100; // largest allowed one in SMILES is 99
// Length of the base atom array that is indexed by the bracket level.
private static final int BRACKET_LEVELS = 32;
private static final int MAX_AROMATIC_RING_SIZE = 15;
// Initial value of an atom's explicit hydrogen count before any count was parsed.
private static final int HYDROGEN_ANY = -1;
// An unspecified hydrogen count within brackets means 0 for SMILES and no hydrogen restriction for SMARTS.
// Therefore, we have to distinguish it from an explicit H0, which defines a query feature in SMARTS.
private static final int HYDROGEN_IMPLICIT_ZERO = 9;
// Molecule currently being built; set at the beginning of every parse() call.
private StereoMolecule mMol;
private boolean[] mIsAromaticBond;
private int mAromaticAtoms,mAromaticBonds,mCoordinateMode;
// mSmartsMode holds the two SMARTS mode bits, mMode the remaining option flags of the constructor argument.
private final int mSmartsMode,mMode;
// Seed handed to the CoordinateInventor if it is different from 0.
private long mRandomSeed;
private final boolean mCreateSmartsWarnings,mMakeHydrogenExplicit,mAllowCactvs,mSingleDotSeparator;
// Created lazily with the first warning; emptied at the beginning of every parse() call.
private StringBuilder mSmartsWarningBuffer;
// Reset at the beginning of every parse() call; set as soon as a SMARTS feature is encountered.
private boolean mSmartsFeatureFound;
/**
 * Creates a parser for plain SMILES (mode SMARTS_MODE_IS_SMILES without any
 * further option flags). SMARTS features in the input cause an exception and
 * the fragment flag of created molecules is never set.
 */
public SmilesParser() {
    this(SMARTS_MODE_IS_SMILES);
}
/**
* Creates a new SmilesParser that may or may not allow SMARTS features to be present in
* parsed strings. If smartsMode is SMARTS_MODE_IS_SMILES, then any SMARTS features cause
* an exception. If smartsMode is SMARTS_MODE_IS_SMARTS, then the input string is considered
* a SMARTS, e.g. 'CC' is taken as fragment of two non-aromatic carbon atoms connected by a
* single bond and without any implicit hydrogen atoms. If smartsMode is SMARTS_MODE_IS_GUESS,
* then the molecule is considered a substructure if any SMARTS features are discovered.
* Depending on whether SMARTS features are found, created molecules have the fragment flag set
* or not set.
* @param mode one of SMARTS_MODE... and optionally other mode flags
*/
public SmilesParser(int mode) {
mMode = mode & ~SMARTS_MODE_MASK;
mSmartsMode = mode & SMARTS_MODE_MASK;
mAllowCactvs = (mode & MODE_NO_CACTUS_SYNTAX) == 0;
mSingleDotSeparator = (mode & MODE_SINGLE_DOT_SEPARATOR) != 0;
mCreateSmartsWarnings = (mode & MODE_CREATE_SMARTS_WARNING) != 0;
mMakeHydrogenExplicit = ((mode & MODE_MAKE_HYDROGEN_EXPLICIT) != 0);
mCoordinateMode = CoordinateInventor.MODE_DEFAULT;
if ((mode & MODE_SKIP_COORDINATE_TEMPLATES) != 0)
mCoordinateMode |= CoordinateInventor.MODE_SKIP_DEFAULT_TEMPLATES;
if (mMakeHydrogenExplicit)
mCoordinateMode &= ~CoordinateInventor.MODE_REMOVE_HYDROGEN;
}
/**
 * Defines the seed that is passed to the CoordinateInventor when parse() is asked to
 * create atom coordinates. In difficult cases the CoordinateInventor uses random decisions
 * when optimizing colliding coordinates. In strained and bridged ring systems, generated
 * coordinates may not correctly represent all E/Z-bond configurations.
 * A seed != 0 causes the creation of reproducible atom coordinates; with a seed of 0
 * no seed is handed to the CoordinateInventor.
 * @param seed value different from 0 in order to always create the same reproducible atom coordinates
 */
public void setRandomSeed(long seed) {
    this.mRandomSeed = seed;
}
/**
 * Convenience method that encodes the given SMILES as UTF-8 and delegates to
 * {@link #parseMolecule(byte[])}.
 * @param smiles SMILES string; may be null
 * @return null if smiles is null; otherwise whatever parseMolecule(byte[]) returns
 */
public StereoMolecule parseMolecule(String smiles) {
    if (smiles == null) {
        return null;
    }
    byte[] encoded = smiles.getBytes(StandardCharsets.UTF_8);
    return parseMolecule(encoded);
}
/**
 * Convenience method to quickly obtain a StereoMolecule from a SMILES.
 * If you process many SMILES, then the parse() methods are preferred, because
 * they avoid the steady instantiation of new StereoMolecules.
 * @param smiles SMILES encoded as bytes
 * @return the parsed molecule, or null if parsing caused any exception
 */
public StereoMolecule parseMolecule(byte[] smiles) {
    StereoMolecule parsed = new StereoMolecule();
    try {
        parse(parsed, smiles);
        return parsed;
    }
    catch (Exception e) {
        // by contract of this convenience method a failed parse is reported as null
        return null;
    }
}
/**
 * Checks whether the given bytes look like a reaction SMILES, without asking for
 * the catalyst count.
 * @param smiles SMILES encoded as bytes
 * @return result of {@link #isReactionSmiles(byte[], int[])} called without a count holder
 */
public static boolean isReactionSmiles(byte[] smiles) {
    final int[] noCatalystCountHolder = null;
    return isReactionSmiles(smiles, noCatalystCountHolder);
}
/**
 * Checks whether the given bytes contain exactly two '>' separators, where any '>'
 * that is directly preceded by '-' is not counted as a separator.
 * If a count holder is given, then its first element is set when the first separator
 * is found: it starts at 0 and is then increased based on the '.' characters that are
 * not directly preceded by another '.'.
 * @param smiles SMILES encoded as bytes
 * @param catalystCountHolder null or int[1] receiving the catalyst count
 * @return true if exactly two separators were found
 */
public static boolean isReactionSmiles(byte[] smiles, int[] catalystCountHolder) {
int count = 0;
int index = -1;
// search for up to three separators, which is enough to distinguish 'exactly two' from 'more'
while (count < 3) {
index = ArrayUtils.indexOf(smiles, (byte)'>', index + 1);
// a '>' directly behind a '-' is skipped; continue searching behind it
while (index>0 && smiles[index - 1] == (byte)'-')
index = ArrayUtils.indexOf(smiles, (byte)'>', index + 1);
if (index == -1)
break;
count++;
if (catalystCountHolder != null && count == 1) {
catalystCountHolder[0] = 0;
// NOTE(review): the next condition and the loop header below look truncated in this copy of the
// file (text between '<' and '>' seems to be missing) - restore from the original source.
if (index+1') {
catalystCountHolder[0] = 1;
for (int i=index+1; i' || smiles[i-1] == '-'); i++)
if (smiles[i] == '.' && smiles[i-1] != '.')
catalystCountHolder[0]++;
}
}
}
return count == 2;
}
/**
 * Encodes the given reaction SMILES as UTF-8 and delegates to
 * {@link #parseReaction(byte[])}.
 * @param smiles reaction SMILES; may be null
 * @return null if smiles is null; otherwise the parsed reaction
 * @throws Exception if parsing the reaction fails
 */
public Reaction parseReaction(String smiles) throws Exception {
    if (smiles == null) {
        return null;
    }
    byte[] encoded = smiles.getBytes(StandardCharsets.UTF_8);
    return parseReaction(encoded);
}
/**
 * Parses a reaction SMILES that consists of three parts divided by two '>' separators.
 * Molecules of the first part are added as reactants, those of the second part as
 * catalysts and those of the third part as products.
 * In SMARTS_MODE_GUESS the entire reaction is parsed again with a SMARTS_MODE_IS_SMARTS
 * parser as soon as one molecule turned out to contain a SMARTS feature.
 * @param smiles reaction SMILES encoded as bytes
 * @return the parsed reaction
 * @throws Exception if not exactly two separators are found or if parsing a molecule fails
 */
public Reaction parseReaction(byte[] smiles) throws Exception {
// locate the first separator; a '>' directly behind a '-' does not count as separator
int index1 = ArrayUtils.indexOf(smiles, (byte)'>');
while (index1 > 0 && smiles[index1-1] == (byte)'-')
index1 = ArrayUtils.indexOf(smiles, (byte)'>', index1+1);
// locate the second separator the same way
int index2 = (index1 == -1) ? -1 : ArrayUtils.indexOf(smiles, (byte)'>', index1+1);
while (index2 > 0 && smiles[index2-1] == (byte)'-')
index2 = ArrayUtils.indexOf(smiles, (byte)'>', index2+1);
if (index2 == -1)
throw new Exception("Missing one or both separators ('>').");
if (ArrayUtils.indexOf(smiles, (byte)'>', index2+1) != -1)
throw new Exception("Found more than 2 separators ('>').");
Reaction rxn = new Reaction();
int part = 0;	// 0: reactants, 1: catalysts, 2: products
int index = 0;
int closingGroupBracketIndex = -1;
while (index < smiles.length) {
// NOTE(review): the next line looks truncated in this copy of the file (text between '<' and '>'
// seems to be missing, including the declaration of 'end') - restore from the original source.
while (index'
&& !(smiles[end] == '.' && ((mSingleDotSeparator && closingGroupBracketIndex==-1) || closingGroupBracketIndex==end-1 || end+1==smiles.length || smiles[end+1] == '.')))
end++;
int molend = end;
// a closing group bracket directly before 'end' does not belong to the molecule
if (closingGroupBracketIndex == end-1) {
molend--;
closingGroupBracketIndex = -1;
}
if (index != molend) {
StereoMolecule mol = new StereoMolecule();
parse(mol, smiles, index, molend);
// when guessing: the first SMARTS feature causes the whole reaction to be re-parsed as SMARTS
if (mSmartsMode == SMARTS_MODE_GUESS && mSmartsFeatureFound)
return new SmilesParser(mMode | SMARTS_MODE_IS_SMARTS).parseReaction(smiles);
if (part == 0)
rxn.addReactant(mol);
else if (part == 1)
rxn.addCatalyst(mol);
else
rxn.addProduct(mol);
}
index = end;
// every '>' at the current position moves on to the next reaction part
while (index < smiles.length && smiles[index] == '>') {
index++;
part++;
}
}
return rxn;
}
/**
 * If the parser was created with the MODE_CREATE_SMARTS_WARNING flag, then this method
 * returns a list of all SMARTS features, which could not be interpreted in the most
 * recently parsed SMILES/SMARTS pattern.
 * The warning buffer is emptied (not discarded) at the beginning of every parse() call.
 * Therefore, an existing but empty buffer means that the most recent parse did not cause
 * any warning, which is reported as empty String rather than as a header without content.
 * @return empty String if there are no warnings; otherwise "Unresolved SMARTS features:"
 *         followed by the space separated unresolved features
 */
public String getSmartsWarning() {
    if (mSmartsWarningBuffer == null || mSmartsWarningBuffer.length() == 0) {
        return "";
    }
    return "Unresolved SMARTS features:" + mSmartsWarningBuffer;
}
/**
 * Parses the given smiles into the molecule, creates proper atom coordinates
 * to reflect correct double bond geometries and translates tetrahedral and allene
 * parities into up/down-bonds. Whether SMARTS features are accepted depends on the
 * SMARTS mode that was passed to the constructor of this parser.
 * @param mol molecule receiving the parsed structure; it is cleared before parsing
 * @param smiles SMILES string; must not be null
 * @throws Exception if the SMILES cannot be parsed
 */
public void parse(StereoMolecule mol, String smiles) throws Exception {
    byte[] encoded = smiles.getBytes(StandardCharsets.UTF_8);
    parse(mol, encoded, true, true);
}
/**
 * Parses the complete byte array into the molecule; coordinates are created
 * and stereo features are read.
 * @param mol molecule receiving the parsed structure
 * @param smiles SMILES encoded as bytes
 * @throws Exception if the SMILES cannot be parsed
 */
public void parse(StereoMolecule mol, byte[] smiles) throws Exception {
    final boolean createCoordinates = true;
    final boolean readStereoFeatures = true;
    parse(mol, smiles, createCoordinates, readStereoFeatures);
}
/**
 * Parses the section of the byte array from position up to endIndex into the
 * molecule; coordinates are created and stereo features are read.
 * @param mol molecule receiving the parsed structure
 * @param smiles bytes containing the SMILES
 * @param position index of the first byte to parse
 * @param endIndex index at which parsing stops
 * @throws Exception if the SMILES cannot be parsed
 */
public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex) throws Exception {
    final boolean createCoordinates = true;
    final boolean readStereoFeatures = true;
    parse(mol, smiles, position, endIndex, createCoordinates, readStereoFeatures);
}
/**
 * Parses the complete byte array into the molecule.
 * @param mol molecule receiving the parsed structure
 * @param smiles SMILES encoded as bytes; must not be null
 * @param createCoordinates whether atom coordinates shall be created after parsing
 * @param readStereoFeatures whether stereo features of the SMILES shall be evaluated
 * @throws Exception if the SMILES cannot be parsed
 */
public void parse(StereoMolecule mol, byte[] smiles, boolean createCoordinates, boolean readStereoFeatures) throws Exception {
    final int firstIndex = 0;
    final int endIndex = smiles.length;
    parse(mol, smiles, firstIndex, endIndex, createCoordinates, readStereoFeatures);
}
/**
 * Main parsing method, which all other parse() methods delegate to.
 * Clears the given molecule, resets the per-parse state (warning buffer, aromatic atom
 * counter, SMARTS-feature-found flag) and then interprets the bytes from position up to
 * endIndex character by character as atoms, bonds, ring closures, etc.
 * After the character loop the molecule is post-processed: depending on the mode explicit
 * hydrogens are removed, aromatic double bonds are located, parities are assigned if
 * readStereoFeatures is true, coordinates are invented if createCoordinates is true, and
 * the fragment flag is set if SMARTS features were found or the mode is SMARTS_MODE_IS_SMARTS.
 * NOTE(review): in this copy of the file several lines of this method look truncated (text
 * between '<' and '>' seems to be missing); they are marked individually below.
 * @param mol molecule receiving the parsed structure; it is cleared first
 * @param smiles bytes containing the SMILES or SMARTS
 * @param position index of the first byte to parse
 * @param endIndex index at which parsing stops
 * @param createCoordinates whether a CoordinateInventor shall create atom coordinates
 * @param readStereoFeatures whether slash and backslash bonds and atom parities are evaluated
 * @throws Exception if the input contains unexpected or unsupported syntax
 */
public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex, boolean createCoordinates, boolean readStereoFeatures) throws Exception {
mMol = mol;
mMol.clear();
// forget warnings of a previous parse, but keep the buffer object
if (mSmartsWarningBuffer != null)
mSmartsWarningBuffer.setLength(0);
mAromaticAtoms = 0;
mSmartsFeatureFound = false;
boolean allowSmarts = (mSmartsMode != SMARTS_MODE_IS_SMILES);
TreeMap parityMap = null;
int[] baseAtom = new int[BRACKET_LEVELS];
baseAtom[0] = -1;
int[] ringClosureAtom = new int[INITIAL_CONNECTIONS];
int[] ringClosurePosition = new int[INITIAL_CONNECTIONS];
int[] ringClosureBondType = new int[INITIAL_CONNECTIONS];
int[] ringClosureBondQueryFeatures = new int[INITIAL_CONNECTIONS];
// NOTE(review): the next line looks truncated in this copy; the declarations of several locals used
// below (e.g. squareBracketOpen, bondType, bondQueryFeatures, bracketLevel) seem to be lost with it.
for (int i = 0; i atomList = new SortedList<>();
SmilesRange range = new SmilesRange(smiles);
AtomInfo atomInfo = new AtomInfo();
ArrayList recursiveGroupList = new ArrayList<>();
int[] skipCount = new int[1];
// skip leading whitespace and control characters
// NOTE(review): there is no bounds check here; input without any character above 32 would run past the array - confirm
while (smiles[position] <= 32)
position++;
while (position < endIndex) {
char theChar = (char)smiles[position++];
// if there is an atom symbol,
if (Character.isLetter(theChar)
|| theChar == '*'
|| theChar == '?'
|| (theChar == '!' && allowSmarts && squareBracketOpen)
|| (theChar == '#' && allowSmarts && squareBracketOpen)) {
int atomicNo = -1;
int charge = 0;
int mapNo = 0;
int abnormalValence = -1;
int explicitHydrogens = HYDROGEN_ANY;
boolean parityFound = false;
boolean isClockwise = false;
long atomQueryFeatures = 0; // translated from obvious SMARTS features
if (squareBracketOpen) {
if (theChar == '*') {
atomicNo = 6;
atomQueryFeatures |= Molecule.cAtomQFAny;
}
else if (theChar == '?') {
atomicNo = 0;
}
else {
boolean isNotList = (theChar == '!');
if (isNotList) {
mSmartsFeatureFound = true;
atomQueryFeatures |= Molecule.cAtomQFAny;
position++;
}
// Handle this before checking for atom symbols, because R (ring count) takes precedence to R1 - R16 (substituent pseudo label)
if (smiles[position-1] == 'R' && allowSmarts && (Character.isDigit(smiles[position]) || (mAllowCactvs && smiles[position] == '{'))) {
atomicNo = 6;
atomQueryFeatures |= Molecule.cAtomQFAny;
// step back, such that the 'R' is handled as ring count feature by the loop below
position--;
if (isNotList)
position--;
}
else {
if (!parseAtomInBrackets(smiles, position-1, endIndex, atomInfo))
throw new Exception("SmilesParser: Unexpected character in atom definition:'"+((char)smiles[position-1])+"' position:"+(position-1));
atomicNo = atomInfo.atomicNo;
position += atomInfo.labelLength - 1;
if (mSmartsMode != SMARTS_MODE_IS_SMARTS)
explicitHydrogens = HYDROGEN_IMPLICIT_ZERO; // in case we have SMILES; neglected, if we process a SMARTS, which we may learn later when hitting a query feature
// If we have a comma after the first atom label, then we need to parse a (positive) atom list.
// In this case we also have to set aromaticity query features from upper and lower case symbols.
if (allowSmarts && (smiles[position] == ',' || isNotList)) {
boolean mayBeAromatic = atomInfo.mayBeAromatic;
boolean mayBeAliphatic = atomInfo.mayBeAliphatic;
// start again with the first atom label to collect all list members
int start = position - atomInfo.labelLength;
while (start < endIndex) {
if (!parseAtomInBrackets(smiles, start, endIndex, atomInfo)) {
if (!isNotList)
throw new Exception("SmilesParser: Unexpected character in atom list:'"+((char)smiles[start])+"'. Position:"+start);
// a not-list may be followed by ';' and another atom condition, while a positive list must not end with ','
break;
}
if (atomInfo.atomicNo == 1) {
if (!isNotList) // in not-lists we are allowed to remove hydrogens!
throw new Exception("SmilesParser: Hydrogen is not supported in positive atom lists:'"+new String(Arrays.copyOfRange(smiles, start, endIndex))+"'. Position:"+start);
}
else {
atomList.add(atomInfo.atomicNo);
mayBeAromatic |= atomInfo.mayBeAromatic;
mayBeAliphatic |= atomInfo.mayBeAliphatic;
}
start += atomInfo.labelLength;
if (smiles[start] != (isNotList ? ';' : ',')) // positive list: ',' e.g. "N,O"; negative lists: ';' e.g. "!#7;!#8"
break;
if (isNotList && smiles[start+1] != '!')
break;
start++;
if (smiles[start] == '!')
start++;
}
if (atomList.size() > 1) {
explicitHydrogens = HYDROGEN_ANY; // don't use implicit zero with atom lists
if (!mayBeAliphatic)
atomQueryFeatures |= Molecule.cAtomQFAromatic;
else if (!mayBeAromatic)
atomQueryFeatures |= Molecule.cAtomQFNotAromatic;
}
position = start;
}
}
}
// process all atom properties that follow the atom label until the closing square bracket
while (squareBracketOpen) {
if (smiles[position] == '@') {
position++;
if (smiles[position] == '@') {
isClockwise = true;
position++;
}
parityFound = true;
continue;
}
if (smiles[position] == ':') {
// atom mapping number
position++;
while (Character.isDigit(smiles[position])) {
mapNo = 10 * mapNo + smiles[position] - '0';
position++;
}
continue;
}
if (smiles[position] == '[')
throw new Exception("SmilesParser: nested square brackets found. Position:"+position);
if (smiles[position] == ']') {
position++;
squareBracketOpen = false;
continue;
}
charge = parseCharge(smiles, position, skipCount);
if (skipCount[0] != 0) {
position += skipCount[0];
// explicit charge=0 is usually meant as query feature
if (charge == 0)
atomQueryFeatures |= Molecule.cAtomQFNotChargeNeg | Molecule.cAtomQFNotChargePos;
continue;
}
boolean isNot = (smiles[position] == '!');
if (isNot)
position++;
if (smiles[position] == 'H') {
position++;
position += range.parse(position, 1, 1);
long flags = 0;
if (range.min <= 0 && range.max >= 0)
flags |= Molecule.cAtomQFNot0Hydrogen;
if (range.min <= 1 && range.max >= 1)
flags |= Molecule.cAtomQFNot1Hydrogen;
if (range.min <= 2 && range.max >= 2)
flags |= Molecule.cAtomQFNot2Hydrogen;
if (range.min <= 3 && range.max >= 3)
flags |= Molecule.cAtomQFNot3Hydrogen;
if (isNot) {
atomQueryFeatures |= flags;
explicitHydrogens = HYDROGEN_ANY;
}
else {
if (range.isSingle()) {
explicitHydrogens = range.min;
}
else {
atomQueryFeatures |= (Molecule.cAtomQFHydrogen & ~flags);
explicitHydrogens = HYDROGEN_ANY;
}
}
continue;
}
if (smiles[position] == 'D') { // non-H-neighbours
position++;
position += range.parse(position, 1, 1);
long flags = 0;
if (range.min <= 0 && range.max >= 0)
flags |= Molecule.cAtomQFNot0Neighbours;
if (range.min <= 1 && range.max >= 1)
flags |= Molecule.cAtomQFNot1Neighbour;
if (range.min <= 2 && range.max >= 2)
flags |= Molecule.cAtomQFNot2Neighbours;
if (range.min <= 3 && range.max >= 3)
flags |= Molecule.cAtomQFNot3Neighbours;
if (range.min <= 4 && range.max >= 4)
flags |= Molecule.cAtomQFNot4Neighbours;
if (flags != 0) {
if (isNot)
atomQueryFeatures |= flags;
else if ((atomQueryFeatures & Molecule.cAtomQFNeighbours) != 0)
atomQueryFeatures &= ~flags;
else {
flags = flags ^ Molecule.cAtomQFNeighbours;
atomQueryFeatures |= flags;
}
}
continue;
}
if (smiles[position] == 'z' && mAllowCactvs) { // electro-negative neighbour count (CACTVS extension)
position++;
position += range.parse(position, 1, 4);
long flags = 0;
if (range.min <= 0 && range.max >= 0)
flags |= Molecule.cAtomQFNot0ENeighbours;
if (range.min <= 1 && range.max >= 1)
flags |= Molecule.cAtomQFNot1ENeighbour;
if (range.min <= 2 && range.max >= 2)
flags |= Molecule.cAtomQFNot2ENeighbours;
if (range.min <= 3 && range.max >= 3)
flags |= Molecule.cAtomQFNot3ENeighbours;
if (range.min <= 4 && range.max >= 4)
flags |= Molecule.cAtomQFNot4ENeighbours;
if (flags != 0) {
if (isNot)
atomQueryFeatures |= flags;
else if ((atomQueryFeatures & Molecule.cAtomQFENeighbours) != 0)
atomQueryFeatures &= ~flags;
else {
flags = flags ^ Molecule.cAtomQFENeighbours;
atomQueryFeatures |= flags;
}
}
continue;
}
if (smiles[position] == 'X') { // neighbour count including implicit hydrogens
position++;
position += range.parse(position, 1, 1);
byte[] valences = Molecule.cAtomValence[atomicNo];
if (valences == null)
continue;
int valence = valences[0];
// if we have a locally defined charge, we update the valence properly
int localCharge = parseCharge(smiles, position, skipCount);
if (skipCount[0] != 0) {
if (Molecule.isAtomicNoElectronegative(atomicNo))
valence += localCharge;
else if (atomicNo == 6)
valence -= Math.abs(localCharge);
else
valence -= localCharge;
}
long flags = 0;
// we convert into pi-electron count using standard valence
if (valence-range.min <= 0 && valence-range.max >= 0)
flags |= Molecule.cAtomQFNot0PiElectrons;
if (valence-range.min <= 1 && valence-range.max >= 1)
flags |= Molecule.cAtomQFNot1PiElectron;
if (valence-range.min <= 2 && valence-range.max >= 2)
flags |= Molecule.cAtomQFNot2PiElectrons;
if (flags != 0) {
if (isNot)
atomQueryFeatures |= flags;
else if ((atomQueryFeatures & Molecule.cAtomQFPiElectrons) != 0)
atomQueryFeatures &= ~flags;
else {
flags = flags ^ Molecule.cAtomQFPiElectrons;
atomQueryFeatures |= flags;
}
}
continue;
}
if (smiles[position] == 'A' || smiles[position] == 'a') {
position++;
// NOTE(review): position was already advanced, so smiles[position] below is the character that
// follows the 'A'/'a' rather than the 'A'/'a' itself - this looks unintended; confirm.
atomQueryFeatures |= (isNot ^ smiles[position] == 'A') ? Molecule.cAtomQFNotAromatic : Molecule.cAtomQFAromatic;
continue;
}
if (smiles[position] == 'R') {
position++;
position += range.parse(position, 1, 3);
long flags = 0;
if (range.min <= 0 && range.max >= 0)
flags |= Molecule.cAtomQFNotChain;
if (range.min <= 1 && range.max >= 1)
flags |= Molecule.cAtomQFNot2RingBonds;
if (range.min <= 2 && range.max >= 2)
flags |= Molecule.cAtomQFNot3RingBonds;
if (range.min <= 3 && range.max >= 3)
flags |= Molecule.cAtomQFNot4RingBonds;
if (range.max > 3)
smartsWarning((isNot?"!R":"R")+range.max);
if (flags != 0) {
if (isNot)
atomQueryFeatures |= flags;
else if ((atomQueryFeatures & Molecule.cAtomQFRingState) != 0)
atomQueryFeatures &= ~flags;
else {
flags = flags ^ Molecule.cAtomQFRingState;
atomQueryFeatures |= flags;
}
}
continue;
}
if (smiles[position] == 'r') {
position++;
position += range.parse(position, 1, 1);
if (range.isDefault) {
if (isNot)
// NOTE(review): a bond query feature constant (cBondQFRingState) is combined into atom query
// features here; presumably cAtomQFRingState was intended - confirm.
atomQueryFeatures |= Molecule.cBondQFRingState & ~Molecule.cAtomQFNotChain;
else
atomQueryFeatures |= Molecule.cAtomQFNotChain;
continue;
}
int ringSize = range.min;
if (range.isRange())
smartsWarning((isNot ? "!r" : "r") + range.toString());
if (!isNot && ringSize >= 3 && ringSize <= 7)
atomQueryFeatures |= (ringSize << Molecule.cAtomQFSmallRingSizeShift);
else if (!range.isRange())
smartsWarning((isNot ? "!r" : "r") + ringSize);
continue;
}
if (smiles[position] == 'v') {
position++;
position += range.parse(position, 1, 1);
int valence = range.min;
if (range.isRange())
smartsWarning((isNot ? "!v" : "v") + range.toString());
if (!isNot && valence <= 14)
abnormalValence = valence;
else if (!range.isRange())
smartsWarning((isNot ? "!v" : "v") + valence);
continue;
}
if (smiles[position] == '$') { // recursive SMARTS
// if (!isNot)
// throw new Exception("SmilesParser: non-negated recursive SMARTS relating to preceding atom are not supported yet. Position:"+position);
position += parseRecursiveGroup(smiles, position, recursiveGroupList);
continue;
}
if (allowSmarts && (smiles[position] == ';' || smiles[position] == '&')) { // we interpret high and low precedence AND the same way
mSmartsFeatureFound = true;
position++;
continue;
}
if (allowSmarts && (smiles[position] == ',' && isRepeatedAllowedORFeature(smiles, position, skipCount))) { // we allow OR-logic for some query options if they have the same type
mSmartsFeatureFound = true;
position += skipCount[0] + 1;
continue;
}
throw new Exception("SmilesParser: unexpected character inside brackets: '"+(char)smiles[position]+"', position:"+position);
}
}
else if (theChar == '*') {
atomicNo = 6;
atomQueryFeatures |= Molecule.cAtomQFAny;
}
else if (theChar == '?') {
atomicNo = 0;
}
else if ((theChar == 'A' || theChar == 'a') && allowSmarts) {
atomicNo = 6;
atomQueryFeatures |= Molecule.cAtomQFAny;
atomQueryFeatures |= theChar == 'A' ? Molecule.cAtomQFNotAromatic : Molecule.cAtomQFAromatic;
mSmartsFeatureFound = true;
}
else {
// atom symbols that are supported outside of square brackets
switch (Character.toUpperCase(theChar)) {
case 'B':
if (position < endIndex && smiles[position] == 'r') {
atomicNo = 35;
position++;
}
else
atomicNo = 5;
break;
case 'C':
if (position < endIndex && smiles[position] == 'l') {
atomicNo = 17;
position++;
}
else
atomicNo = 6;
break;
case 'F':
atomicNo = 9;
break;
case 'I':
atomicNo = 53;
break;
case 'N':
atomicNo = 7;
break;
case 'O':
atomicNo = 8;
break;
case 'P':
atomicNo = 15;
break;
case 'S':
atomicNo = 16;
break;
}
}
///////////////////////////////////////////////////////////////////////////////
// At this position the atom is determined and the square bracket is closed! //
///////////////////////////////////////////////////////////////////////////////
if (atomicNo == -1 && theChar != '?')
throw new Exception("SmilesParser: unknown element label found. Position:"+(position-1));
int atom = mMol.addAtom(atomicNo); // this may be a hydrogen, if defined as [H]
mMol.setAtomCharge(atom, charge);
mMol.setAtomMapNo(atom, mapNo, false);
mMol.setAtomAbnormalValence(atom, abnormalValence);
if (atomQueryFeatures != 0) {
mSmartsFeatureFound = true;
// aromaticity is not stored as query feature here, but as atom marker, and counted in mAromaticAtoms
if ((atomQueryFeatures & Molecule.cAtomQFAromatic) != 0) {
atomQueryFeatures &= ~Molecule.cAtomQFAromatic;
mMol.setAtomMarker(atom, true);
mAromaticAtoms++;
}
else {
mMol.setAtomMarker(atom, false);
}
mMol.setAtomQueryFeature(atom, atomQueryFeatures, true);
}
if (atomList.size() != 0) {
mSmartsFeatureFound = true;
int[] list = new int[atomList.size()];
// NOTE(review): the next line looks truncated in this copy; the code between the copy loop over
// the atom list and the creation of the parity map seems to be lost.
for (int i=0; i();
// using position as hydrogenPosition is close enough
int hydrogenCount = (explicitHydrogens == HYDROGEN_IMPLICIT_ZERO) ? 0 : explicitHydrogens;
parityMap.put(atom, new THParity(atom, position - 2, fromAtom, hydrogenCount, position - 1, isClockwise));
}
}
continue;
}
if (theChar == '$') { // recursive SMARTS
if (!recursiveGroupList.isEmpty())
throw new Exception("SmilesParser: multiple recursive SMARTS without preceding atom are not supported yet. Position:"+(position-1));
baseAtom[bracketLevel] = mol.getAllAtoms();
position += parseRecursiveGroup(smiles, position-1, recursiveGroupList);
mol.addMolecule(recursiveGroupList.get(0));
recursiveGroupList.clear();
continue;
}
if (theChar == '.') {
// a dot disconnects: no base atom for the next atom and the pending bond type is marked as deleted
baseAtom[bracketLevel] = -1;
bondType = Molecule.cBondTypeDeleted;
continue;
}
if (isBondSymbol(theChar)) {
if (squareBracketOpen)
throw new Exception("SmilesParser: unexpected bond symbol inside square brackets: '"+theChar+"', position:"+(position-1));
int excludedBonds = 0;
while (isBondSymbol(theChar)) {
if (theChar == '!') {
// negated bond symbol: collect the excluded bond types
theChar = (char)smiles[position++];
if (theChar == '@')
bondQueryFeatures |= Molecule.cBondQFNotRing;
else if ((theChar == '-' && smiles[position] == '>')
|| (theChar == '<' && smiles[position] == '-')) {
excludedBonds |= Molecule.cBondTypeMetalLigand;
position++;
}
else if (theChar == '-')
excludedBonds |= Molecule.cBondQFSingle;
else if (theChar == '=')
excludedBonds |= Molecule.cBondQFDouble;
else if (theChar == '#')
excludedBonds |= Molecule.cBondQFTriple;
else if (theChar == '$')
excludedBonds |= Molecule.cBondQFQuadruple;
else if (theChar == ':')
excludedBonds |= Molecule.cBondQFDelocalized;
else
throw new Exception("SmilesParser: bond symbol '"+theChar+"' not allowed after '!'. Position:"+(position-1));
}
else {
if (theChar == '@')
bondQueryFeatures |= Molecule.cBondQFRing;
else if (theChar == '=')
bondType = Molecule.cBondTypeDouble;
else if (theChar == '#')
bondType = Molecule.cBondTypeTriple;
else if (theChar == '$')
bondType = Molecule.cBondTypeQuadruple;
else if (theChar == ':')
bondType = Molecule.cBondTypeDelocalized;
else if (theChar == '~')
bondQueryFeatures |= Molecule.cBondQFSingle | Molecule.cBondQFDouble | Molecule.cBondQFTriple | Molecule.cBondQFDelocalized;
else if (theChar == '/') {
if (readStereoFeatures)
bondType = Molecule.cBondTypeUp; // encode slash temporarily in bondType
}
else if (theChar == '\\') {
if (readStereoFeatures)
bondType = Molecule.cBondTypeDown; // encode backslash temporarily in bondType
}
// Smiles extension 'dative bond'
else if ((theChar == '-' && smiles[position] == '>')
|| (theChar == '<' && smiles[position] == '-')) {
bondType = Molecule.cBondTypeMetalLigand;
position++;
}
// a comma separated list of bond symbols is translated into bond query features
if (smiles[position] == ',') {
bondQueryFeatures |= bondSymbolToQueryFeature(bondType == Molecule.cBondTypeMetalLigand ? '>' : theChar);
while (smiles[position] == ',') {
if ((smiles[position+1] == '<' && smiles[position+2] == '-')
|| (smiles[position+1] == '-' && smiles[position+2] == '>')) {
bondQueryFeatures |= bondSymbolToQueryFeature('>');
position += 3;
}
else {
bondQueryFeatures |= bondSymbolToQueryFeature((char)smiles[position+1]);
position += 2;
}
}
}
}
// a ';' is followed by another bond condition
if (smiles[position] == ';') {
position++;
theChar = (char)smiles[position++];
continue;
}
if (excludedBonds != 0)
bondQueryFeatures |= Molecule.cBondQFBondTypes & ~excludedBonds;
break;
}
continue;
}
if (theChar <= ' ') { // we stop reading at whitespace
position = endIndex;
continue;
}
if (Character.isDigit(theChar)) {
int number = theChar - '0';
if (squareBracketOpen) {
// within square brackets a number is taken as atom mass
while (position < endIndex
&& Character.isDigit(smiles[position])) {
number = 10 * number + smiles[position] - '0';
position++;
}
atomMass = number;
}
else {
// outside of square brackets a number is a ring closure, optionally preceded by a bond symbol
int bondTypePosition = isDoubleDigit ? position - 3 : position - 2;
boolean hasBondType = (smiles[bondTypePosition] == '-'
|| smiles[bondTypePosition] == '/'
|| smiles[bondTypePosition] == '\\'
|| smiles[bondTypePosition] == '='
|| smiles[bondTypePosition] == '#'
|| smiles[bondTypePosition] == '$'
|| smiles[bondTypePosition] == ':'
|| smiles[bondTypePosition] == '>'
|| smiles[bondTypePosition] == '~');
if (isDoubleDigit
&& position < endIndex
&& Character.isDigit(smiles[position])) {
number = 10 * number + smiles[position] - '0';
isDoubleDigit = false;
position++;
}
// enlarge the ring closure arrays in steps of INITIAL_CONNECTIONS up to MAX_CONNECTIONS
if (number >= ringClosureAtom.length) {
if (number >=MAX_CONNECTIONS)
throw new Exception("SmilesParser: ringClosureAtom number out of range: "+number);
int oldSize = ringClosureAtom.length;
int newSize = ringClosureAtom.length;
while (newSize <= number)
newSize = Math.min(MAX_CONNECTIONS, newSize + INITIAL_CONNECTIONS);
ringClosureAtom = Arrays.copyOf(ringClosureAtom, newSize);
ringClosurePosition = Arrays.copyOf(ringClosurePosition, newSize);
ringClosureBondType = Arrays.copyOf(ringClosureBondType, newSize);
ringClosureBondQueryFeatures = Arrays.copyOf(ringClosureBondQueryFeatures, newSize);
// NOTE(review): the next line looks truncated in this copy; a large part of the method (rest of the
// ring closure handling, the end of the character loop and the start of the post-processing) seems to be lost.
for (int i=oldSize; i= 1)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot0Hydrogen, true);
if (explicitHydrogen >= 2)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot1Hydrogen, true);
if (explicitHydrogen >= 3)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot2Hydrogen, true);
if (explicitHydrogen >= 4)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot3Hydrogen, true);
}
}
// explicit hydrogens are only removed from SMARTS input and only if they were not requested to be kept
if (!mMakeHydrogenExplicit && (mSmartsFeatureFound || mSmartsMode == SMARTS_MODE_IS_SMARTS))
mMol.removeExplicitHydrogens();
mMol.ensureHelperArrays(Molecule.cHelperNeighbours);
correctValenceExceededNitrogen(); // convert pyridine oxides and nitro into polar structures with valid nitrogen valences
locateAromaticDoubleBonds(allowSmarts, mSmartsFeatureFound);
mMol.removeAtomCustomLabels();
mMol.setHydrogenProtection(false);
if (readStereoFeatures) {
assignKnownEZBondParities();
if (parityMap != null) {
for (THParity parity:parityMap.values())
mMol.setAtomParity(parity.mCentralAtom, parity.calculateParity(handleHydrogenAtomMap), false);
mMol.setParitiesValid(0);
}
}
// defines unknown EZ parities as such, i.e. prevent coordinate generation to create implicit EZ-parities
mMol.setParitiesValid(0);
if (createCoordinates) {
CoordinateInventor inventor = new CoordinateInventor(mCoordinateMode);
// only a seed different from 0 is passed on to the CoordinateInventor
if (mRandomSeed != 0)
inventor.setRandomSeed(mRandomSeed);
inventor.invent(mMol);
if (readStereoFeatures)
mMol.setUnknownParitiesToExplicitlyUnknown();
}
// SMARTS input results in a fragment, i.e. a substructure query
if (mSmartsFeatureFound || mSmartsMode == SMARTS_MODE_IS_SMARTS) {
mMol.setFragment(true);
mMol.validateAtomQueryFeatures();
mMol.validateBondQueryFeatures();
}
}
/**
 * Reports the state of the SMARTS-feature-found flag, which is reset at the beginning of
 * every parse() call and set when a SMARTS feature is encountered.
 * @return true if the previously parsed SMILES contained a SMARTS feature and was not parsed with SMARTS_MODE_IS_SMILES
 */
public boolean isSmarts() {
    return this.mSmartsFeatureFound;
}
/**
 * Reads a charge definition that starts at the given position. Supported are a run of
 * equal sign symbols (e.g. '++' or '---') and one sign symbol followed by one digit
 * (e.g. '+2' or '-0').
 * @param smiles bytes of the SMILES
 * @param position position of potential first charge symbol '+' or '-'
 * @param characterCount receives number of characters needed for charge encoding
 * @return extracted charge; 0: no charge defined or explicit charge=0 - distinguish by characterCount
 */
private int parseCharge(byte[] smiles, int position, int[] characterCount) {
    characterCount[0] = 0;

    final byte sign = smiles[position];
    if (sign != '+' && sign != '-') {
        return 0;
    }

    // count the run of identical sign symbols; every symbol is worth one charge
    int magnitude = 0;
    do {
        magnitude++;
        characterCount[0]++;
    } while (smiles[position + characterCount[0]] == sign);

    // one sign symbol may be followed by one digit, which then defines the charge
    if (magnitude == 1) {
        byte follower = smiles[position + 1];
        if (Character.isDigit(follower)) {
            magnitude = follower - '0';
            characterCount[0]++;
        }
    }

    return sign == '-' ? -magnitude : magnitude;
}
/**
 * Tells whether the given character is one of the characters that are handled as
 * (part of) a bond definition by the parser.
 * @param theChar character to be checked
 * @return true for one of - = # $ : / \ &lt; ~ ! @
 */
private boolean isBondSymbol(char theChar) {
    switch (theChar) {
        case '-':
        case '=':
        case '#':
        case '$':
        case ':':
        case '/':
        case '\\':
        case '<':
        case '~':
        case '!':
        case '@':
            return true;
        default:
            return false;
    }
}
/**
 * If two subsequent features are delimited by comma (OR-logic), then we allow these
 * - if they have the same type (and atom label, if an atom label is preceding), e.g. 'NX' in NX3 and NX4+
 * - if the feature supports the logic of adding query features to previously given ones (D,R,X,z)
 * The feature in front of the comma must consist of a feature letter and one digit, optionally
 * followed by one '+' or '-'.
 * @param smiles bytes of the SMILES
 * @param commaPosition position of the comma within smiles
 * @param skipCount int[1] to hold the number of characters to skip for atom label (0 if there is no atom label);
 *                  not touched if the feature in front of the comma is not a supported one
 * @return true, if comma (OR-logic) is an allowed delimiter here
 */
private boolean isRepeatedAllowedORFeature(byte[] smiles, int commaPosition, int[] skipCount) {
    if (commaPosition < 3) {
        return false;
    }

    // walk backwards from the comma: optional charge sign, then one digit, then the feature letter
    int cursor = commaPosition - 1;
    byte beforeComma = smiles[cursor];
    if (beforeComma == '+' || beforeComma == '-') {
        cursor--;
    }
    if (!Character.isDigit(smiles[cursor])) {
        return false;
    }
    cursor--;

    byte feature = smiles[cursor];
    boolean isSupportedFeature = feature == 'D' || feature == 'R' || feature == 'X' || feature == 'z';
    if (!isSupportedFeature) {
        return false;
    }

    // letters directly in front of the feature letter are taken as atom label
    int labelLength = 0;
    while (cursor > 0 && Character.isLetter(smiles[cursor - 1])) {
        cursor--;
        labelLength++;
    }
    skipCount[0] = labelLength;

    // atom label and feature letter must be repeated identically behind the comma
    for (int repeated = commaPosition + 1; Character.isLetter(smiles[cursor]); cursor++, repeated++) {
        if (repeated >= smiles.length || smiles[cursor] != smiles[repeated]) {
            return false;
        }
    }
    return true;
}
/**
 * Reads the atom specification at the given position of a bracket expression, which is
 * either '#' followed by an atomic number or an atom label of one or two letters,
 * and stores the findings in info (atomicNo, labelLength, mayBeAromatic, mayBeAliphatic).
 * A label starting with an upper case letter clears mayBeAromatic, one starting with
 * a lower case letter clears mayBeAliphatic. The '#' notation sets mSmartsFeatureFound.
 * @param smiles
 * @param position index of the first character to be interpreted
 * @param endIndex limit for reading the digits of an atomic number
 * @param info receives the parsed atom properties
 * @return false if the character at position neither is '#' nor a letter A-Z or a-z
 * @throws Exception if '#' is followed by an atomic number that is 0 or not known to Molecule.cAtomLabel
 */
private boolean parseAtomInBrackets(byte[] smiles, int position, int endIndex, AtomInfo info) throws Exception {
    info.mayBeAromatic = true;
    info.mayBeAliphatic = true;

    final byte first = smiles[position];

    if (first == '#') {
        mSmartsFeatureFound = true;
        info.atomicNo = 0;
        info.labelLength = 1;

        int index = position + 1;
        for (; index < endIndex && Character.isDigit(smiles[index]); index++) {
            info.atomicNo = 10 * info.atomicNo + smiles[index] - '0';
            info.labelLength++;
        }

        boolean outOfRange = info.atomicNo == 0 || info.atomicNo >= Molecule.cAtomLabel.length;
        if (outOfRange)
            throw new Exception("SmilesParser: Atomic number out of range. position:"+(index-1));

        return true;
    }

    final boolean startsUpperCase = first >= 'A' && first <= 'Z';
    final boolean startsLowerCase = first >= 'a' && first <= 'z';
    if (!startsUpperCase && !startsLowerCase)
        return false;

    // A lower case letter in second position belongs to the label.
    final byte second = smiles[position + 1];
    info.labelLength = (second >= 'a' && second <= 'z') ? 2 : 1;
    info.atomicNo = Molecule.getAtomicNoFromLabel(new String(smiles, position, info.labelLength, StandardCharsets.UTF_8));

    if (startsUpperCase)
        info.mayBeAromatic = false;
    else
        info.mayBeAliphatic = false;

    return true;
}
/**
 * Translates a bond symbol into the corresponding Molecule.cBondQF... constant.
 * @param symbol one of = # $ : > ~
 * @return the query feature for the symbol; Molecule.cBondQFSingle for any other character
 */
private int bondSymbolToQueryFeature(char symbol) {
    switch (symbol) {
        case '=':
            return Molecule.cBondQFDouble;
        case '#':
            return Molecule.cBondQFTriple;
        case '$':
            return Molecule.cBondQFQuadruple;
        case ':':
            return Molecule.cBondQFDelocalized;
        case '>':
            return Molecule.cBondQFMetalLigand;
        case '~':
            return Molecule.cBondQFBondTypes;
        default:
            return Molecule.cBondQFSingle;
    }
}
/**
 * Appends a blank followed by the given feature description to the SMARTS warning buffer.
 * The buffer is created on first use. Does nothing unless mCreateSmartsWarnings is set.
 * @param feature text to be added to the warning buffer
 */
private void smartsWarning(String feature) {
    if (!mCreateSmartsWarnings)
        return;

    if (mSmartsWarningBuffer == null)
        mSmartsWarningBuffer = new StringBuilder();

    mSmartsWarningBuffer.append(" ").append(feature);
}
/**
 * Handles a recursive SMARTS expression '$(...)': locates the closing bracket that matches
 * the opening bracket behind the '$', parses the enclosed text with a new SmilesParser into
 * a new StereoMolecule and adds that molecule to groupList.
 * NOTE(review): the end of this method is not present in this copy of the source
 * (see note further down); the meaning of the returned int cannot be told from here.
 * @param smiles
 * @param dollarIndex index of the '$' character within smiles
 * @param groupList receives the parsed group molecule
 * @throws Exception if '$' is not followed by '(' or if the matching ')' is missing
 */
private int parseRecursiveGroup(byte[] smiles, int dollarIndex, ArrayList groupList) throws Exception {
if (smiles[dollarIndex+1] != '(')
throw new Exception("SmilesParser: '$' for recursive SMARTS must be followed by '('. position:"+dollarIndex);
// Find the matching closing bracket considering nested bracket pairs.
// After the loop endIndex points to the position after the matching ')'.
int openBrackets = 1;
int endIndex = dollarIndex+2;
while (endIndex < smiles.length && openBrackets > 0) {
if (smiles[endIndex] == '(')
openBrackets++;
else if (smiles[endIndex] == ')')
openBrackets--;
endIndex++;
}
if (openBrackets > 0)
throw new Exception("SmilesParser: Missing closing ')' for recursive SMARTS. '('-position:"+(dollarIndex+1));
// Parse the text between the brackets using the parser's own mode flags.
StereoMolecule group = new StereoMolecule(16, 16);
new SmilesParser(mMode | mSmartsMode).parse(group, smiles, dollarIndex+2, endIndex-1);
groupList.add(group);
// NOTE(review): the following 'for' statement is corrupted. The text between a '<' and
// a later '>' seems to have been lost, such that the handling of a preceding '!' and the
// rest of this method are missing and code of a different method follows. Restore from
// the original source file.
if (smiles[dollarIndex-1] == '!')
for (int atom=0; atom=0) System.arraycopy(mIsAromaticBond, 0, isAromaticBond, 0, mMol.getBonds());
// Some Smiles contain 'aromatic' rings with atoms not being compatible
// with a PI-bond. These include: tertiary non-charged nitrogen, [nH],
// sulfur, non-charged oxygen, charged carbon, etc...
// All these atoms and attached bonds are marked as handled to avoid
// attached bonds to be promoted (changed to double bond) later.
for (int ring=0; ring=4; qualifyingNo--) {
do {
qualifyingBondFound = false;
for (int bond=0; bond= 2)
if (!connectConjugatedRadicalPairs(isAromaticBond))
break;
if (allowSmartsFeatures) {
if (mAromaticAtoms != 0) {
for (int atom=0; atom 0)
return false;
}
int explicitHydrogens = (mMol.getAtomCustomLabel(atom) == null || mMol.getAtomCustomLabelBytes(atom)[0] == HYDROGEN_IMPLICIT_ZERO) ?
0 : mMol.getAtomCustomLabelBytes(atom)[0];
int freeValence = mMol.getFreeValence(atom) - explicitHydrogens;
if (freeValence < 1)
return false;
if (mMol.getAtomicNo(atom) == 16
|| mMol.getAtomicNo(atom) == 34
|| mMol.getAtomicNo(atom) == 52) {
if (mMol.getConnAtoms(atom) == 2 && mMol.getAtomCharge(atom) <= 0)
return false;
return freeValence != 2; // e.g. -S(=O)- correction to account for tetravalent S,Se
}
return true;
}
/**
 * Changes the given bond to a double bond, if it is a single bond. Then, for both
 * bond atoms, removes the atom marker, if set, and decrements mAromaticAtoms accordingly.
 * NOTE(review): the inner loop of this method is corrupted in this copy of the source
 * (see note further down); anything the method does beyond that cannot be told from here.
 * @param bond index of the bond to be promoted
 */
private void promoteBond(int bond) {
if (mMol.getBondType(bond) == Molecule.cBondTypeSingle)
mMol.setBondType(bond, Molecule.cBondTypeDouble);
for (int i=0; i<2; i++) {
int bondAtom = mMol.getBondAtom(i, bond);
if (mMol.isMarkedAtom(bondAtom)) {
mMol.setAtomMarker(bondAtom, false);
mAromaticAtoms--;
}
// NOTE(review): the following 'for' statement is corrupted. The text between a '<' and
// a later '>' seems to have been lost; the remaining lines apparently belong to a
// different method, whose header is missing. Restore from the original source file.
for (int j=0; j 3
&& mMol.getAtomPi(atom) > 0) {
for (int i=0; i 1)
&& mMol.isElectronegative(connAtom)) {
// Reduce the bond order by one and separate charges:
// atom gets one more positive charge, connAtom one more negative charge.
if (mMol.getBondType(connBond) == Molecule.cBondTypeTriple)
mMol.setBondType(connBond, Molecule.cBondTypeDouble);
else
mMol.setBondType(connBond, Molecule.cBondTypeSingle);
mMol.setAtomCharge(atom, mMol.getAtomCharge(atom) + 1);
mMol.setAtomCharge(connAtom, mMol.getAtomCharge(connAtom) - 1);
mMol.setAtomAbnormalValence(atom, -1);
break;
}
}
}
}
}
/**
 * NOTE(review): only the first lines of this method are present in this copy of the source.
 * Judging from the name it presumably assigns E/Z parities to double bonds and returns
 * whether any were found - confirm against the original source file.
 */
private boolean assignKnownEZBondParities() {
// Ring information is requested before bonds are processed.
mMol.ensureHelperArrays(Molecule.cHelperRings);
boolean paritiesFound = false;
// Work arrays with one entry for either side of a bond.
int[] refAtom = new int[2];
int[] refBond = new int[2];
int[] otherAtom = new int[2];
// NOTE(review): the following line is corrupted. The text between a '<' and a later '>'
// seems to have been lost, which includes the rest of this method and the beginning of
// the class that declares mNeighbourList. Restore from the original source file.
for (int bond=0; bond mNeighbourList;
/**
 * Creates a new parity object while the smiles is traversed.
 * If explicitHydrogen is neither 0 nor 1, then the object is put into error state
 * and nothing else is initialized.
 * @param centralAtom index of atom processed
 * @param centralAtomPosition position in SMILES of central atom
 * @param fromAtom index of parent atom of centralAtom (-1 if centralAtom is first atom in smiles)
 * @param explicitHydrogen Daylight syntax: hydrogen atoms defined within square bracket of other atom
 * @param hydrogenPosition not evaluated by this constructor; an explicit hydrogen is registered
 *                         with position centralAtomPosition+1
 * @param isClockwise true if central atom is marked with @@ rather than @
 */
public THParity(int centralAtom, int centralAtomPosition, int fromAtom, int explicitHydrogen, int hydrogenPosition, boolean isClockwise) {
    if (explicitHydrogen < 0 || explicitHydrogen > 1) {
        mError = true;
        return;
    }

    mCentralAtom = centralAtom;
    mCentralAtomPosition = centralAtomPosition;
    mIsClockwise = isClockwise;
    mNeighbourList = new ArrayList<>();

    // With a parent atom present, the parent atom and then an explicit hydrogen,
    // if there is one, are registered as regular neighbours.
    if (fromAtom != -1) {
        addNeighbor(fromAtom, centralAtomPosition-1, false);

        if (explicitHydrogen == 1)
            addNeighbor(PSEUDO_ATOM_HYDROGEN, centralAtomPosition+1, false);
    }
}
/**
 * Registers a currently traversed neighbor or ring closure with this parity object,
 * which belongs to the neighbor's parent atom.
 * For a ring closure the relevant position for parity determination is the position
 * of the bond closure digit in the smiles rather than the neighbor's position.
 * The atom is tracked as well, because neighbors are not necessarily added in atom
 * sequence (ring closure with connection back to stereo center).
 * An attempt to add a fifth neighbor puts this object into error state.
 * In error state the call is ignored.
 * @param atom
 * @param position
 * @param unused not evaluated
 */
public void addNeighbor(int atom, int position, boolean unused) {
    if (mError)
        return;

    if (mNeighbourList.size() == 4) {
        mError = true;
        return;
    }

    ParityNeighbour neighbour = new ParityNeighbour(atom, position);
    mNeighbourList.add(neighbour);
}
/**
 * Determines the parity from the collected neighbours.
 * Returns Molecule.cAtomParityUnknown, if this object is in error state or if the number
 * of neighbours is neither 3 nor 4. With 3 neighbours a lone pair pseudo neighbour is added.
 * NOTE(review): the remainder of this method is not present in this copy of the source
 * (see note further down); how the parity value is derived cannot be told from here.
 * @param handleHydrogenAtomMap translates smiles-parse-time atom indexes into the atom
 *                              indexes that the molecule uses after calling handleHydrogens
 */
public int calculateParity(int[] handleHydrogenAtomMap) {
if (mError)
return Molecule.cAtomParityUnknown;
// We need to translate smiles-parse-time atom indexes to those that the molecule
// uses after calling handleHydrogens, which is called from ensureHelperArrays().
// Pseudo atoms (hydrogen, lone pair) are no molecule atoms and are left as they are.
for (ParityNeighbour neighbour:mNeighbourList)
if (neighbour.mAtom != PSEUDO_ATOM_HYDROGEN && neighbour.mAtom != PSEUDO_ATOM_LONE_PAIR)
neighbour.mAtom = handleHydrogenAtomMap[neighbour.mAtom];
if (mNeighbourList.size() == 3)
// Within SMILES all hydrogen atoms at stereo centers must be explicit (as explicit atoms or as H count in square brackets).
// Therefore, three neighbour atoms is a rare situation, e.g. CC[S@](=O)C or frozen out CC[N@H]C
// In these cases we add the electron pair as pseudo neighbour
mNeighbourList.add(new ParityNeighbour(PSEUDO_ATOM_LONE_PAIR, mCentralAtomPosition));
else if (mNeighbourList.size() != 4)
return Molecule.cAtomParityUnknown;
// NOTE(review): the block comment starting on the next line is never closed in this copy
// of the source, and the 'for' statement within is corrupted. The text between a '<' and a
// later '>' seems to have been lost, which includes the end of the comment, the rest of
// this method and the header of the method, to which the final lines ('inversion')
// belong. Restore from the original source file.
/*
System.out.println();
System.out.println("central:"+mCentralAtom+(mIsClockwise?" @@":" @")+" from:"
+((mFromAtom == -1)?"none":Integer.toString(mFromAtom))+" with "+mImplicitHydrogen+" hydrogens");
System.out.print("neighbors: "+mNeighborAtom[0]+"("+mNeighborPosition[0]+(mNeighborIsHydrogen[0]?",H":",non-H")+")");
for (int i=1; i mNeighbourList.get(i).mAtom)
inversion = !inversion;
if (mNeighbourList.get(j).mPosition > mNeighbourList.get(i).mPosition)
inversion = !inversion;
}
}
return inversion;
}
}
/**
 * Parses a list of SMILES with stereo features, creates an isomeric SMILES from every
 * parsed molecule and prints input, output and whether the output equals the expected
 * SMILES to System.out. Every row consists of { input SMILES, expected SMILES }.
 */
private static void testStereo() {
    final String[][] cases = {
            { "F/C=C/I", "F/C=C/I" },
            { "F/C=C\\I", "F/C=C\\I" },
            { "C(=C/I)/F", "F/C=C\\I" },
            { "[H]C(/F)=C/I", "F/C=C\\I" },
            { "C(=C\\1)/I.F1", "F/C=C/I" },
            { "C(=C1)/I.F/1", "F/C=C/I" },
            { "C(=C\\F)/1.I1", "F/C=C/I" },
            { "C(=C\\F)1.I\\1", "F/C=C/I" },
            { "C\\1=C/I.F1", "F/C=C/I" },
            { "C1=C/I.F/1", "F/C=C/I" },
            { "C(=C\\1)/2.F1.I2", "F/C=C/I" },
            { "C/2=C\\1.F1.I2", "F/C=C/I" },
            { "C/1=C/C=C/F.I1", "F/C=C/C=C\\I" },
            { "C1=C/C=C/F.I\\1", "F/C=C/C=C\\I" },
            { "C(/I)=C/C=C/1.F1", "F/C=C/C=C\\I" },
            { "C(/I)=C/C=C1.F\\1", "F/C=C/C=C\\I" },
            { "[C@](Cl)(F)(I)1.Br1", "F[C@](Cl)(Br)I" },
            { "Br[C@](Cl)(I)1.F1", "F[C@](Cl)(Br)I" },
            { "[C@H](F)(I)1.Br1", "F[C@H](Br)I" },
            { "Br[C@@H](F)1.I1", "F[C@H](Br)I" },
            { "C[S@@](CC)=O", "CC[S@](C)=O" },
            { "[S@](=O)(C)CC", "CC[S](C)=O" } };

    // The same molecule object is handed to the parser for every test case.
    StereoMolecule molecule = new StereoMolecule();
    for (String[] testCase : cases) {
        String input = testCase[0];
        String expected = testCase[1];
        try {
            new SmilesParser().parse(molecule, input);
            String created = new IsomericSmilesCreator(molecule).getSmiles();
            String verdict = expected.equals(created) ? " OK" : " EXPECTED: "+expected+" ERROR!";
            System.out.println("IN:"+input+" OUT:"+created+verdict);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}
/**
 * Runs testStereo() and then an ID-code equivalence test: every SMILES of the list is parsed
 * and the idcode of the resulting molecule is compared to the expected one.
 * Every row consists of { SMILES, name, expected idcode }, where "error" as expected idcode
 * means that parsing or idcode creation is supposed to throw an exception.
 * Only deviations from the expectation are printed to System.out.
 * @param args not used
 */
public static void main(String[] args) {
    testStereo();

    System.out.println("ID-code equivalence test:");
    final String[][] cases = {
            { "N[C@@]([H])(C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
            { "N[C@@H](C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
            { "N[C@H](C(=O)O)C", "S-alanine", "gGX`BDdwMUM@@" },
            { "[H][C@](N)(C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
            { "[C@H](N)(C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
            { "N[C@]([H])(C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
            { "N[C@H](C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
            { "N[C@@H](C(=O)O)C", "R-alanine", "gGX`BDdwMUL`@" },
            { "[H][C@@](N)(C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
            { "[C@@H](N)(C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
            { "C[C@H]1CCCCO1", "S-Methyl-pyran", "gOq@@eLm]UUH`@" },
            { "O1CCCC[C@@H]1C", "S-Methyl-pyran", "gOq@@eLm]UUH`@" },
            { "[C@H](F)(B)O", "S-Methyl-oxetan", "gCaDDICTBSURH@" },
            { "C1CO[C@H]1C", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "C1CO[C@@H](C)1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "[C@H]1(C)CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "[H][C@]1(C)CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "[H][C@@]1(CCO1)C", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "[C@@]1([H])(C)CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "[C@]1(C)([H])CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "C1[C@@H]2COC2=N1", "oxetan-azetin", "gGy@LDimDvfja`@" },
            { "CC(C)[C@@]12C[C@@H]1[C@@H](C)C(=O)C2", "alpha-thujone", "dmLH@@RYe~IfyjjjkDaIh@" },
            { "CN1CCC[C@H]1c2cccnc2", "Nicotine", "dcm@@@{IDeCEDUSh@UUECP@" },
            { "CC[C@H](O1)CC[C@@]12CCCO2", "2S,5R-Chalcogran", "dmLD@@qJZY|fFZjjjdbH`@" },
            { "CCCC", "butane", "gC`@Dij@@" },
            { "C1C.CC1", "butane", "gC`@Dij@@" },
            { "[CH3][CH2][CH2][CH3]", "butane", "gC`@Dij@@" },
            { "C-C-C-C", "butane", "gC`@Dij@@" },
            { "C12.C1.CC2", "butane", "gC`@Dij@@" },
            { "[Na+].[Cl-]", "NaCl", "eDARHm@zd@@" },
            { "[Na+]-[Cl-]", "NaCl", "error" },
            { "[Na+]1.[Cl-]1", "NaCl", "error" },
            { "c1ccccc1", "benzene", "gFp@DiTt@@@" },
            { "C1=C-C=C-C=C1", "benzene", "gFp@DiTt@@@" },
            { "C1:C:C:C:C:C:1", "benzene", "gFp@DiTt@@@" },
            { "c1ccncc1", "pyridine", "gFx@@eJf`@@@" },
            { "[nH]1cccc1", "pyrrole", "gKX@@eKcRp@" },
            { "N1C=C-C=C1", "pyrrole", "gKX@@eKcRp@" },
            { "[H]n1cccc1", "pyrrole", "gKX@@eKcRp@" },
            { "[H]n1cccc1", "pyrrole", "gKX@@eKcRp@" },
            { "c1cncc1", "pyrrole no [nH]", "error" },
            { "[13CH4]", "C13-methane", "fH@FJp@" },
            { "[35ClH]", "35-chlorane", "fHdP@qX`" },
            { "[35Cl-]", "35-chloride", "fHtPxAbq@" },
            { "[Na+].[O-]c1ccccc1", "Na-phenolate", "daxHaHCPBXyAYUn`@@@" },
            { "c1cc([O-].[Na+])ccc1", "Na-phenolate", "daxHaHCPBXyAYUn`@@@" },
            { "C[C@@](C)(O1)C[C@@H](O)[C@@]1(O2)[C@@H](C)[C@@H]3CC=C4[C@]3(C2)C(=O)C[C@H]5[C@H]4CC[C@@H](C6)[C@]5(C)Cc(n7)c6nc(C[C@@]89(C))c7C[C@@H]8CC[C@@H]%10[C@@H]9C[C@@H](O)[C@@]%11(C)C%10=C[C@H](O%12)[C@]%11(O)[C@H](C)[C@]%12(O%13)[C@H](O)C[C@@]%13(C)CO",
              "Cephalostatin-1",
              "gdKe@h@@K`H@XjKHuYlnoP\\bbdRbbVTLbTrJbRaQRRRbTJTRTrfrfTTOBPHtFODPhLNSMdIERYJmShLfs]aqy|uUMUUUUUUE@UUUUMUUUUUUTQUUTPR`nDdQQKB|RIFbiQeARuQt`rSSMNtGS\\ct@@" },
    };

    // The same molecule object is handed to the parser for every test case.
    StereoMolecule molecule = new StereoMolecule();
    for (String[] testCase : cases) {
        String smiles = testCase[0];
        String name = testCase[1];
        String expected = testCase[2];
        boolean errorExpected = expected.equals("error");
        try {
            new SmilesParser().parse(molecule, smiles);
            String idcode = new Canonizer(molecule).getIDCode();
            if (errorExpected)
                System.out.println("Should create error! "+name+" smiles:"+smiles+" idcode:"+idcode);
            else if (!expected.equals(idcode))
                System.out.println("ERROR! "+name+" smiles:"+smiles+" is:"+idcode+" must:"+expected);
        }
        catch (Exception e) {
            // An exception is a failure only, if the test case does not expect one.
            if (!errorExpected)
                System.out.println("ERROR! "+name+" smiles:"+smiles+" exception:"+e.getMessage());
        }
    }
}
}
/**
 * Parses the numeric argument that follows a query feature within a smiles/smarts byte array.
 * Three notations are understood:
 * - a number of one or two digits, e.g. the '2' in 'D2', which may be extended to a range,
 *   if the same feature is repeated comma delimited with different numbers, e.g. 'D2,D3'
 * - an explicit range in curly brackets, e.g. '{1-3}'
 * - nothing, in which case the default values passed to parse() are used
 * After calling parse() the result is available from the fields min, max and isDefault.
 */
class SmilesRange {
    private final byte[] smiles;
    private int pos;    // read position while parsing
    public int min,max;
    public boolean isDefault;

    public SmilesRange(byte[] smiles) {
        this.smiles = smiles;
    }

    /**
     * Parses a number or range at the given position and updates min, max and isDefault.
     * @param position index of the first character behind the feature letter(s)
     * @param defaultMin value for min, if there is neither a number nor a range at position
     * @param defaultMax value for max, if there is neither a number nor a range at position
     * @return number of characters consumed; 0 if defaults were used or if a range
     *         in curly brackets was malformed
     */
    public int parse(int position, int defaultMin, int defaultMax) {
        isDefault = false;
        pos = position;

        if (isDigitAt(position)) {
            int val = parseInt();
            min = max = val;

            // If we have the same query feature, comma delimited and with different number, then we extend the range...
            // firstLetter is the start of the letters/digits that precede the number (never below index 1).
            int firstLetter = position-1;
            while (firstLetter > 1 && Character.isLetterOrDigit(smiles[firstLetter-1]))
                firstLetter--;
            int letterCount = position-firstLetter;

            // firstLetter is -1 for position 0: then there is no feature text to compare with.
            while (firstLetter >= 0 && pos < smiles.length && smiles[pos] == ',') {
                int repeatStart = pos+1;

                // The text behind the comma must repeat the feature text and continue with a digit.
                // Otherwise, the comma is not consumed and left to the caller.
                if (!repeatsAt(firstLetter, letterCount, repeatStart)
                 || !isDigitAt(repeatStart+letterCount))
                    break;

                pos = repeatStart+letterCount;
                val = parseInt();
                if (min > val)
                    min = val;
                else if (max < val)
                    max = val;
            }

            return pos - position;
        }

        if (position >= 0
         && position < smiles.length
         && smiles[position] == '{'
         && isDigitAt(position+1)) {
            pos++;
            min = parseInt();
            if (pos >= smiles.length || smiles[pos++] != '-')
                return 0;    // unexpected
            if (!isDigitAt(pos))
                return 0;    // unexpected
            max = parseInt();
            if (pos >= smiles.length || smiles[pos++] != '}')
                return 0;    // unexpected
            return pos - position;
        }

        min = defaultMin;
        max = defaultMax;
        isDefault = true;
        return 0;
    }

    public boolean isSingle() {
        return max == min;
    }

    public boolean isRange() {
        return max > min;
    }

    @Override
    public String toString() {
        return "{"+min+"-"+max+"}";
    }

    /**
     * @return true if index is within the array and the byte at index is a digit
     */
    private boolean isDigitAt(int index) {
        return index >= 0 && index < smiles.length && Character.isDigit(smiles[index]);
    }

    /**
     * @return true if the count bytes starting at 'from' are found again starting at 'at'
     */
    private boolean repeatsAt(int from, int count, int at) {
        if (at + count > smiles.length)
            return false;
        for (int i=0; i<count; i++)
            if (smiles[from+i] != smiles[at+i])
                return false;
        return true;
    }

    /**
     * Reads a number of one or two digits at the current read position and moves the
     * read position behind it. The caller must make sure that there is a digit at pos.
     */
    private int parseInt() {
        int num = smiles[pos++] - '0';
        if (isDigitAt(pos))
            num = 10 * num + (smiles[pos++] - '0');
        return num;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy