com.actelion.research.chem.SmilesParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of openchemlib Show documentation
Show all versions of openchemlib Show documentation
Open Source Chemistry Library
/*
* Copyright (c) 1997 - 2016
* Actelion Pharmaceuticals Ltd.
* Gewerbestrasse 16
* CH-4123 Allschwil, Switzerland
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of the the copyright holder nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @author Thomas Sander
*/
package com.actelion.research.chem;
import com.actelion.research.chem.coords.CoordinateInventor;
import com.actelion.research.chem.reaction.Reaction;
import com.actelion.research.util.ArrayUtils;
import com.actelion.research.util.SortedList;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.TreeMap;
public class SmilesParser {
// Mask that selects the SMARTS interpretation mode (the two lowest bits) from the constructor's mode argument.
private static final int SMARTS_MODE_MASK = 3;
// SMARTS interpretation modes, i.e. the possible values of (mode & SMARTS_MODE_MASK).
public static final int SMARTS_MODE_IS_SMILES = 0;
public static final int SMARTS_MODE_GUESS = 1;
public static final int SMARTS_MODE_IS_SMARTS = 2;
// Option flags that may be OR-ed onto one of the SMARTS modes.
public static final int MODE_SKIP_COORDINATE_TEMPLATES = 4; // adds CoordinateInventor.MODE_SKIP_DEFAULT_TEMPLATES to the coordinate mode
public static final int MODE_MAKE_HYDROGEN_EXPLICIT = 8; // clears CoordinateInventor.MODE_REMOVE_HYDROGEN from the coordinate mode
public static final int MODE_NO_CACTUS_SYNTAX = 16; // if not set, then some CACTVS SMARTS extensions will be recognized and translated as closely as possible
public static final int MODE_SINGLE_DOT_SEPARATOR = 32; // consider single dots '.' (rather than '..') as molecule separator when parsing reactions
public static final int MODE_CREATE_SMARTS_WARNING = 64; // collect SMARTS features that could not be interpreted for getSmartsWarning()
// Initial length of the ring closure bookkeeping arrays allocated by parse().
private static final int INITIAL_CONNECTIONS = 16;
private static final int MAX_CONNECTIONS = 100; // largest allowed one in SMILES is 99
// Length of the baseAtom array allocated by parse().
private static final int BRACKET_LEVELS = 32;
private static final int MAX_AROMATIC_RING_SIZE = 15;
// Initial value of an atom's explicit hydrogen count in parse(), before any H feature was read.
private static final int HYDROGEN_ANY = -1;
// An unspecified hydrogen count within brackets means :=0 for SMILES and no-H-restriction for SMARTS.
// Therefore, we have to distinguish it from an explicit H0, which defines a query feature for SMARTS.
private static final int HYDROGEN_IMPLICIT_ZERO = 9;
// Molecule that is populated by the current parse() call.
private StereoMolecule mMol;
private boolean[] mIsAromaticBond;
private int mAromaticAtoms,mAromaticBonds,mCoordinateMode;
// mSmartsMode holds the SMARTS_MODE_... part of the constructor argument, mMode the remaining flags.
private final int mSmartsMode,mMode;
// Seed handed to the CoordinateInventor when it is not 0.
private long mRandomSeed;
private final boolean mCreateSmartsWarnings,mMakeHydrogenExplicit,mAllowCactvs,mSingleDotSeparator;
// Created on the first call of smartsWarning() and emptied at the start of every parse().
private StringBuilder mSmartsWarningBuffer;
// Reset at the start of every parse(); reported by isSmarts().
private boolean mSmartsFeatureFound;
/**
 * Creates a new SmilesParser that doesn't allow SMARTS features to be present in
 * parsed strings. SMARTS features cause an exception. The fragment flag of created
 * molecules is never set.
 * This is a shortcut for {@code new SmilesParser(SMARTS_MODE_IS_SMILES)}, i.e. none of the
 * optional MODE_... flags is set.
 */
public SmilesParser() {
this(SMARTS_MODE_IS_SMILES);
}
/**
* Creates a new SmilesParser that may or may not allow SMARTS features to be present in
* parsed strings. If smartsMode is SMARTS_MODE_IS_SMILES, then any SMARTS features cause
* an exception. If smartsMode is SMARTS_MODE_IS_SMARTS, then the input string is considered
* a SMARTS, e.g. 'CC' is taken as fragment of two non-aromatic carbon atoms connected by a
* single bond and without any implicit hydrogen atoms. If smartsMode is SMARTS_MODE_IS_GUESS,
* then the molecule is considered a substructure if any SMARTS features are discovered.
* Depending on whether SMARTS features are found, created molecules have the fragment flag set
* or not set.
* @param mode one of SMARTS_MODE... and optionally other mode flags
*/
public SmilesParser(int mode) {
mMode = mode & ~SMARTS_MODE_MASK;
mSmartsMode = mode & SMARTS_MODE_MASK;
mAllowCactvs = (mode & MODE_NO_CACTUS_SYNTAX) == 0;
mSingleDotSeparator = (mode & MODE_SINGLE_DOT_SEPARATOR) != 0;
mCreateSmartsWarnings = (mode & MODE_CREATE_SMARTS_WARNING) != 0;
mMakeHydrogenExplicit = ((mode & MODE_MAKE_HYDROGEN_EXPLICIT) != 0);
mCoordinateMode = CoordinateInventor.MODE_DEFAULT;
if ((mode & MODE_SKIP_COORDINATE_TEMPLATES) != 0)
mCoordinateMode |= CoordinateInventor.MODE_SKIP_DEFAULT_TEMPLATES;
if (mMakeHydrogenExplicit)
mCoordinateMode &= ~CoordinateInventor.MODE_REMOVE_HYDROGEN;
}
/**
 * Depending on the parse() parameters, the SmilesParser may or may not generate new atom coordinates
 * after parsing the SMILES. In difficult cases the employed CoordinateInventor uses random decisions
 * when optimizing colliding coordinates. In strained and bridged ring systems, generated coordinates
 * may not correctly represent all E/Z-bond configurations.
 * Calling this method with a seed != 0 causes the creation of reproducible atom coordinates.
 * The seed is stored and handed to the CoordinateInventor of every following parse() call;
 * a value of 0 is never passed on.
 * @param seed value different from 0 in order to always create the same reproducible atom coordinates
 */
public void setRandomSeed(long seed) {
mRandomSeed = seed;
}
/**
 * Convenience variant of {@link #parseMolecule(byte[])} that takes the SMILES as a String.
 * @param smiles SMILES to be parsed; may be null
 * @return null if smiles is null; otherwise the result of parseMolecule(byte[]) for the UTF-8 bytes of smiles
 */
public StereoMolecule parseMolecule(String smiles) {
    if (smiles == null)
        return null;

    byte[] bytes = smiles.getBytes(StandardCharsets.UTF_8);
    return parseMolecule(bytes);
}
/**
 * Convenience method to quickly obtain a StereoMolecule from a SMILES.
 * If you process many SMILES, then the parse() methods are preferred, because
 * they avoid the repeated instantiation of new StereoMolecules.
 * Any exception thrown while parsing is swallowed and reported as a null result.
 * @param smiles SMILES encoded as bytes
 * @return the parsed molecule, or null if parsing failed with an exception
 */
public StereoMolecule parseMolecule(byte[] smiles) {
    StereoMolecule parsed = new StereoMolecule();
    try {
        parse(parsed, smiles);
        return parsed;
    }
    catch (Exception e) {
        // by contract of this convenience method a failure is signalled by null
        return null;
    }
}
/**
 * Checks whether the given bytes look like a reaction SMILES without reporting a catalyst count.
 * Delegates to isReactionSmiles(smiles, null).
 * @param smiles SMILES encoded as bytes
 * @return result of isReactionSmiles(smiles, null)
 */
public static boolean isReactionSmiles(byte[] smiles) {
return isReactionSmiles(smiles, null);
}
public static boolean isReactionSmiles(byte[] smiles, int[] catalystCountHolder) {
int count = 0;
int index = -1;
while (count < 3) {
index = ArrayUtils.indexOf(smiles, (byte)'>', index + 1);
while (index>0 && smiles[index - 1] == (byte)'-')
index = ArrayUtils.indexOf(smiles, (byte)'>', index + 1);
if (index == -1)
break;
count++;
if (catalystCountHolder != null && count == 1) {
catalystCountHolder[0] = 0;
if (index+1') {
catalystCountHolder[0] = 1;
for (int i=index+1; i' || smiles[i-1] == '-'); i++)
if (smiles[i] == '.' && smiles[i-1] != '.')
catalystCountHolder[0]++;
}
}
}
return count == 2;
}
/**
 * Convenience variant of {@link #parseReaction(byte[])} that takes the reaction SMILES as a String.
 * @param smiles reaction SMILES; may be null
 * @return null if smiles is null; otherwise the reaction parsed from the UTF-8 bytes of smiles
 * @throws Exception if parseReaction(byte[]) fails
 */
public Reaction parseReaction(String smiles) throws Exception {
    if (smiles == null)
        return null;

    byte[] bytes = smiles.getBytes(StandardCharsets.UTF_8);
    return parseReaction(bytes);
}
/**
 * Parses a reaction SMILES into a Reaction. The input must contain exactly two separators
 * (the greater-than character); a greater-than character directly preceded by a minus sign is
 * not considered a separator. Every molecule found is parsed with parse(mol, smiles, index, molend)
 * and added as reactant, catalyst or product, depending on how many separators precede it.
 * If this parser was created with SMARTS_MODE_GUESS and a SMARTS feature is found in any molecule,
 * then the whole input is parsed again by a new parser that uses SMARTS_MODE_IS_SMARTS.
 * @param smiles reaction SMILES encoded as bytes
 * @return the parsed reaction
 * @throws Exception if the number of separators is not two or if a molecule cannot be parsed
 */
public Reaction parseReaction(byte[] smiles) throws Exception {
// locate the first separator, skipping those that belong to a dative bond symbol
int index1 = ArrayUtils.indexOf(smiles, (byte)'>');
while (index1 > 0 && smiles[index1-1] == (byte)'-')
index1 = ArrayUtils.indexOf(smiles, (byte)'>', index1+1);
// locate the second separator the same way
int index2 = (index1 == -1) ? -1 : ArrayUtils.indexOf(smiles, (byte)'>', index1+1);
while (index2 > 0 && smiles[index2-1] == (byte)'-')
index2 = ArrayUtils.indexOf(smiles, (byte)'>', index2+1);
if (index2 == -1)
throw new Exception("Missing one or both separators ('>').");
if (ArrayUtils.indexOf(smiles, (byte)'>', index2+1) != -1)
throw new Exception("Found more than 2 separators ('>').");
Reaction rxn = new Reaction();
// part counts the separators passed so far: 0 means reactants, 1 catalysts, anything else products
int part = 0;
int index = 0;
int closingGroupBracketIndex = -1;
// NOTE(review): the first line inside the following loop is damaged in this copy of the file.
// The text between the loop condition fragment and the end of a later condition is missing,
// including the declaration of the variable end and every assignment to closingGroupBracketIndex.
// Restore this section from the upstream source before compiling.
while (index < smiles.length) {
while (index'
&& !(smiles[end] == '.' && ((mSingleDotSeparator && closingGroupBracketIndex==-1) || closingGroupBracketIndex==end-1 || end+1==smiles.length || smiles[end+1] == '.')))
end++;
int molend = end;
if (closingGroupBracketIndex == end-1) {
molend--;
closingGroupBracketIndex = -1;
}
if (index != molend) {
StereoMolecule mol = new StereoMolecule();
parse(mol, smiles, index, molend);
if (mSmartsMode == SMARTS_MODE_GUESS && mSmartsFeatureFound)
return new SmilesParser(mMode | SMARTS_MODE_IS_SMARTS).parseReaction(smiles);
if (part == 0)
rxn.addReactant(mol);
else if (part == 1)
rxn.addCatalyst(mol);
else
rxn.addProduct(mol);
}
index = end;
while (index < smiles.length && smiles[index] == '>') {
index++;
part++;
}
}
return rxn;
}
/**
 * If this parser was created with the MODE_CREATE_SMARTS_WARNING flag, then this method
 * returns a list of all SMARTS features, which could not be interpreted in the most recently
 * parsed SMILES/SMARTS pattern.
 * @return "Unresolved SMARTS features:" followed by the blank-separated features; an empty
 *         String if no warning was recorded during the most recent parse
 */
public String getSmartsWarning() {
    // parse() empties an existing buffer rather than discarding it, so a buffer without
    // content must be treated like a missing one to avoid returning the bare header
    if (mSmartsWarningBuffer == null || mSmartsWarningBuffer.length() == 0)
        return "";

    return "Unresolved SMARTS features:" + mSmartsWarningBuffer;
}
/**
 * Parses the given smiles into the molecule, creates proper atom coordinates
 * to reflect correct double bond geometries and translates tetrahedral and allene
 * parities into up/down-bonds. Whether SMARTS features are accepted depends on the
 * SMARTS mode that was passed to the constructor of this parser.
 * Delegates to parse(mol, bytes, true, true) with the UTF-8 bytes of smiles.
 * @param mol molecule that receives the parsed structure
 * @param smiles SMILES to be parsed; must not be null
 * @throws Exception if the SMILES cannot be parsed
 */
public void parse(StereoMolecule mol, String smiles) throws Exception {
parse(mol, smiles.getBytes(StandardCharsets.UTF_8), true, true);
}
/**
 * Parses the complete byte array into the molecule with coordinate creation and
 * reading of stereo features both switched on.
 * @param mol molecule that receives the parsed structure
 * @param smiles SMILES encoded as bytes
 * @throws Exception if the SMILES cannot be parsed
 */
public void parse(StereoMolecule mol, byte[] smiles) throws Exception {
parse(mol, smiles, true, true);
}
/**
 * Parses a section of the byte array into the molecule with coordinate creation and
 * reading of stereo features both switched on.
 * @param mol molecule that receives the parsed structure
 * @param smiles bytes containing the SMILES
 * @param position index of the first byte to be parsed
 * @param endIndex index after the last byte to be parsed
 * @throws Exception if the SMILES cannot be parsed
 */
public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex) throws Exception {
parse(mol, smiles, position, endIndex, true, true);
}
/**
 * Parses the complete byte array, i.e. the range from 0 to smiles.length, into the molecule.
 * @param mol molecule that receives the parsed structure
 * @param smiles SMILES encoded as bytes
 * @param createCoordinates passed on unchanged to the six-parameter parse() method
 * @param readStereoFeatures passed on unchanged to the six-parameter parse() method
 * @throws Exception if the SMILES cannot be parsed
 */
public void parse(StereoMolecule mol, byte[] smiles, boolean createCoordinates, boolean readStereoFeatures) throws Exception {
parse(mol, smiles, 0, smiles.length, createCoordinates, readStereoFeatures);
}
/**
 * Parses the bytes from smiles[position] up to, but excluding, smiles[endIndex] into the given
 * molecule. The molecule is cleared first, an existing SMARTS warning buffer is emptied and the
 * SMARTS-feature-found state is reset. The main loop then consumes the input character by
 * character. After the loop the molecule is post-processed: explicit hydrogens are removed for
 * SMARTS input unless MODE_MAKE_HYDROGEN_EXPLICIT was given, aromatic double bonds are located,
 * stereo information is assigned, coordinates are optionally invented, and the fragment flag is
 * set if a SMARTS feature was found or the parser runs in SMARTS_MODE_IS_SMARTS.
 * @param mol molecule that receives the parsed structure; cleared before parsing
 * @param smiles bytes containing the SMILES or SMARTS
 * @param position index of the first byte to be parsed
 * @param endIndex index after the last byte to be parsed
 * @param createCoordinates if true, a CoordinateInventor creates atom coordinates; it is seeded
 *        with the value given to setRandomSeed() if that value is not 0
 * @param readStereoFeatures if true, slash and backslash bond symbols, E/Z bond parities and
 *        parsed atom parities are translated into stereo information of the molecule
 * @throws Exception if the input cannot be interpreted; the message describes the problem
 */
public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex, boolean createCoordinates, boolean readStereoFeatures) throws Exception {
mMol = mol;
mMol.clear();
// reset the state left over from a previous parse() call
if (mSmartsWarningBuffer != null)
mSmartsWarningBuffer.setLength(0);
mAromaticAtoms = 0;
mSmartsFeatureFound = false;
boolean allowSmarts = (mSmartsMode != SMARTS_MODE_IS_SMILES);
TreeMap parityMap = null;
int[] baseAtom = new int[BRACKET_LEVELS];
baseAtom[0] = -1;
int[] ringClosureAtom = new int[INITIAL_CONNECTIONS];
int[] ringClosurePosition = new int[INITIAL_CONNECTIONS];
int[] ringClosureBondType = new int[INITIAL_CONNECTIONS];
int[] ringClosureBondQueryFeatures = new int[INITIAL_CONNECTIONS];
// NOTE(review): the next line is damaged in this copy of the file. The loop condition, the loop
// body and the declarations that followed are missing; squareBracketOpen, bracketLevel, bondType,
// bondQueryFeatures, atomMass and isDoubleDigit are used below without a visible declaration.
// Restore this section from the upstream source before compiling.
for (int i = 0; i atomList = new SortedList<>();
SmilesRange range = new SmilesRange(smiles);
AtomInfo atomInfo = new AtomInfo();
ArrayList recursiveGroupList = new ArrayList<>();
int[] skipCount = new int[1];
// skip leading blanks and control characters
// NOTE(review): this loop has no upper bound check against endIndex or smiles.length
while (smiles[position] <= 32)
position++;
while (position < endIndex) {
char theChar = (char)smiles[position++];
// if there is an atom symbol,
if (Character.isLetter(theChar)
|| theChar == '*'
|| theChar == '?'
|| (theChar == '!' && allowSmarts && squareBracketOpen)
|| (theChar == '#' && allowSmarts && squareBracketOpen)) {
int atomicNo = -1;
int charge = 0;
int mapNo = 0;
int abnormalValence = -1;
int explicitHydrogens = HYDROGEN_ANY;
boolean parityFound = false;
boolean isClockwise = false;
long atomQueryFeatures = 0; // translated from obvious SMARTS features
if (squareBracketOpen) {
if (theChar == '*') {
atomicNo = 6;
atomQueryFeatures |= Molecule.cAtomQFAny;
}
else if (theChar == '?') {
atomicNo = 0;
}
else {
boolean isNotList = (theChar == '!');
if (isNotList) {
mSmartsFeatureFound = true;
atomQueryFeatures |= Molecule.cAtomQFAny;
position++;
}
// Handle this before checking for atom symbols, because R (ring count) takes precedence to R1 - R16 (substituent pseudo label)
if (smiles[position-1] == 'R' && allowSmarts && (Character.isDigit(smiles[position]) || (mAllowCactvs && smiles[position] == '{'))) {
atomicNo = 6;
atomQueryFeatures |= Molecule.cAtomQFAny;
position--;
if (isNotList)
position--;
}
else {
if (!parseAtomInBrackets(smiles, position-1, endIndex, atomInfo))
throw new Exception("SmilesParser: Unexpected character in atom definition:'"+((char)smiles[position-1])+"' position:"+(position-1));
atomicNo = atomInfo.atomicNo;
position += atomInfo.labelLength - 1;
if (mSmartsMode != SMARTS_MODE_IS_SMARTS)
explicitHydrogens = HYDROGEN_IMPLICIT_ZERO; // in case we have SMILES; neglected, if we process a SMARTS, which we may learn later when hitting a query feature
// If we have a comma after the first atom label, then we need to parse a (positive) atom list.
// In this case we also have to set aromaticity query features from upper and lower case symbols.
if (allowSmarts && (smiles[position] == ',' || isNotList)) {
boolean mayBeAromatic = atomInfo.mayBeAromatic;
boolean mayBeAliphatic = atomInfo.mayBeAliphatic;
int start = position - atomInfo.labelLength;
while (start < endIndex) {
if (!parseAtomInBrackets(smiles, start, endIndex, atomInfo)) {
if (!isNotList)
throw new Exception("SmilesParser: Unexpected character in atom list:'"+((char)smiles[start])+"'. Position:"+start);
// a not-list may be followed by ';' and another atom condition, while a positive list must not end with ','
break;
}
if (atomInfo.atomicNo == 1) {
if (!isNotList) // in not-lists we are allowed to remove hydrogens!
throw new Exception("SmilesParser: Hydrogen is not supported in positive atom lists:'"+new String(Arrays.copyOfRange(smiles, start, endIndex))+"'. Position:"+start);
}
else {
atomList.add(atomInfo.atomicNo);
mayBeAromatic |= atomInfo.mayBeAromatic;
mayBeAliphatic |= atomInfo.mayBeAliphatic;
}
start += atomInfo.labelLength;
if (smiles[start] != (isNotList ? ';' : ',')) // positive list: ',' e.g. "N,O"; negative lists: ';' e.g. "!#7;!#8"
break;
if (isNotList && smiles[start+1] != '!')
break;
start++;
if (smiles[start] == '!')
start++;
}
if (atomList.size() > 1) {
explicitHydrogens = HYDROGEN_ANY; // don't use implicit zero with atom lists
if (!mayBeAliphatic)
atomQueryFeatures |= Molecule.cAtomQFAromatic;
else if (!mayBeAromatic)
atomQueryFeatures |= Molecule.cAtomQFNotAromatic;
}
position = start;
}
}
}
while (squareBracketOpen) {
if (smiles[position] == '@') {
position++;
if (smiles[position] == '@') {
isClockwise = true;
position++;
}
parityFound = true;
continue;
}
if (smiles[position] == ':') {
position++;
while (Character.isDigit(smiles[position])) {
mapNo = 10 * mapNo + smiles[position] - '0';
position++;
}
continue;
}
if (smiles[position] == '[')
throw new Exception("SmilesParser: nested square brackets found. Position:"+position);
if (smiles[position] == ']') {
position++;
squareBracketOpen = false;
continue;
}
charge = parseCharge(smiles, position, skipCount);
if (skipCount[0] != 0) {
position += skipCount[0];
// explicit charge=0 is usually meant as query feature
if (charge == 0)
atomQueryFeatures |= Molecule.cAtomQFNotChargeNeg | Molecule.cAtomQFNotChargePos;
continue;
}
boolean isNot = (smiles[position] == '!');
if (isNot)
position++;
if (smiles[position] == 'H') {
position++;
position += range.parse(position, 1, 1);
long flags = 0;
if (range.min <= 0 && range.max >= 0)
flags |= Molecule.cAtomQFNot0Hydrogen;
if (range.min <= 1 && range.max >= 1)
flags |= Molecule.cAtomQFNot1Hydrogen;
if (range.min <= 2 && range.max >= 2)
flags |= Molecule.cAtomQFNot2Hydrogen;
if (range.min <= 3 && range.max >= 3)
flags |= Molecule.cAtomQFNot3Hydrogen;
if (isNot) {
atomQueryFeatures |= flags;
explicitHydrogens = HYDROGEN_ANY;
}
else {
if (range.isSingle()) {
explicitHydrogens = range.min;
}
else {
atomQueryFeatures |= (Molecule.cAtomQFHydrogen & ~flags);
explicitHydrogens = HYDROGEN_ANY;
}
}
continue;
}
if (smiles[position] == 'D') { // non-H-neighbours
position++;
position += range.parse(position, 1, 1);
long flags = 0;
if (range.min <= 0 && range.max >= 0)
flags |= Molecule.cAtomQFNot0Neighbours;
if (range.min <= 1 && range.max >= 1)
flags |= Molecule.cAtomQFNot1Neighbour;
if (range.min <= 2 && range.max >= 2)
flags |= Molecule.cAtomQFNot2Neighbours;
if (range.min <= 3 && range.max >= 3)
flags |= Molecule.cAtomQFNot3Neighbours;
if (range.min <= 4 && range.max >= 4)
flags |= Molecule.cAtomQFNot4Neighbours;
if (flags != 0) {
if (isNot)
atomQueryFeatures |= flags;
else if ((atomQueryFeatures & Molecule.cAtomQFNeighbours) != 0)
atomQueryFeatures &= ~flags;
else {
flags = flags ^ Molecule.cAtomQFNeighbours;
atomQueryFeatures |= flags;
}
}
continue;
}
if (smiles[position] == 'z' && mAllowCactvs) { // electro-negative neighbour count (CACTVS extension)
position++;
position += range.parse(position, 1, 4);
long flags = 0;
if (range.min <= 0 && range.max >= 0)
flags |= Molecule.cAtomQFNot0ENeighbours;
if (range.min <= 1 && range.max >= 1)
flags |= Molecule.cAtomQFNot1ENeighbour;
if (range.min <= 2 && range.max >= 2)
flags |= Molecule.cAtomQFNot2ENeighbours;
if (range.min <= 3 && range.max >= 3)
flags |= Molecule.cAtomQFNot3ENeighbours;
if (range.min <= 4 && range.max >= 4)
flags |= Molecule.cAtomQFNot4ENeighbours;
if (flags != 0) {
if (isNot)
atomQueryFeatures |= flags;
else if ((atomQueryFeatures & Molecule.cAtomQFENeighbours) != 0)
atomQueryFeatures &= ~flags;
else {
flags = flags ^ Molecule.cAtomQFENeighbours;
atomQueryFeatures |= flags;
}
}
continue;
}
if (smiles[position] == 'X') { // neighbour count including implicit hydrogens
position++;
position += range.parse(position, 1, 1);
byte[] valences = Molecule.cAtomValence[atomicNo];
if (valences == null)
continue;
int valence = valences[0];
// if we have a locally defined charge, we update the valance properly
int localCharge = parseCharge(smiles, position, skipCount);
if (skipCount[0] != 0) {
if (Molecule.isAtomicNoElectronegative(atomicNo))
valence += localCharge;
else if (atomicNo == 6)
valence -= Math.abs(localCharge);
else
valence -= localCharge;
}
long flags = 0;
// we convert into pi-electron count using standard valence
if (valence-range.min <= 0 && valence-range.max >= 0)
flags |= Molecule.cAtomQFNot0PiElectrons;
if (valence-range.min <= 1 && valence-range.max >= 1)
flags |= Molecule.cAtomQFNot1PiElectron;
if (valence-range.min <= 2 && valence-range.max >= 2)
flags |= Molecule.cAtomQFNot2PiElectrons;
if (flags != 0) {
if (isNot)
atomQueryFeatures |= flags;
else if ((atomQueryFeatures & Molecule.cAtomQFPiElectrons) != 0)
atomQueryFeatures &= ~flags;
else {
flags = flags ^ Molecule.cAtomQFPiElectrons;
atomQueryFeatures |= flags;
}
}
continue;
}
if (smiles[position] == 'A' || smiles[position] == 'a') {
position++;
atomQueryFeatures |= (isNot ^ smiles[position] == 'A') ? Molecule.cAtomQFNotAromatic : Molecule.cAtomQFAromatic;
continue;
}
if (smiles[position] == 'R') {
position++;
position += range.parse(position, 1, 3);
long flags = 0;
if (range.min <= 0 && range.max >= 0)
flags |= Molecule.cAtomQFNotChain;
if (range.min <= 1 && range.max >= 1)
flags |= Molecule.cAtomQFNot2RingBonds;
if (range.min <= 2 && range.max >= 2)
flags |= Molecule.cAtomQFNot3RingBonds;
if (range.min <= 3 && range.max >= 3)
flags |= Molecule.cAtomQFNot4RingBonds;
if (range.max > 3)
smartsWarning((isNot?"!R":"R")+range.max);
if (flags != 0) {
if (isNot)
atomQueryFeatures |= flags;
else if ((atomQueryFeatures & Molecule.cAtomQFRingState) != 0)
atomQueryFeatures &= ~flags;
else {
flags = flags ^ Molecule.cAtomQFRingState;
atomQueryFeatures |= flags;
}
}
continue;
}
if (smiles[position] == 'r') {
position++;
position += range.parse(position, 1, 1);
if (range.isDefault) {
if (isNot)
atomQueryFeatures |= Molecule.cBondQFRingState & ~Molecule.cAtomQFNotChain;
else
atomQueryFeatures |= Molecule.cAtomQFNotChain;
continue;
}
int ringSize = range.min;
if (range.isRange())
smartsWarning((isNot ? "!r" : "r") + range.toString());
if (!isNot && ringSize >= 3 && ringSize <= 7)
atomQueryFeatures |= (ringSize << Molecule.cAtomQFSmallRingSizeShift);
else if (!range.isRange())
smartsWarning((isNot ? "!r" : "r") + ringSize);
continue;
}
if (smiles[position] == 'v') {
position++;
position += range.parse(position, 1, 1);
int valence = range.min;
if (range.isRange())
smartsWarning((isNot ? "!v" : "v") + range.toString());
if (!isNot && valence <= 14)
abnormalValence = valence;
else if (!range.isRange())
smartsWarning((isNot ? "!v" : "v") + valence);
continue;
}
if (smiles[position] == '$') { // recursive SMARTS
// if (!isNot)
// throw new Exception("SmilesParser: non-negated recursive SMARTS relating to preceding atom are not supported yet. Position:"+position);
position += parseRecursiveGroup(smiles, position, recursiveGroupList);
continue;
}
if (allowSmarts && (smiles[position] == ';' || smiles[position] == '&')) { // we interpret high and low precendence AND the same way
mSmartsFeatureFound = true;
position++;
continue;
}
if (allowSmarts && (smiles[position] == ',' && isRepeatedAllowedORFeature(smiles, position, skipCount))) { // we allow OR-logic for some query options if they have the same type
mSmartsFeatureFound = true;
position += skipCount[0] + 1;
continue;
}
throw new Exception("SmilesParser: unexpected character inside brackets: '"+(char)smiles[position]+"', position:"+position);
}
}
else if (theChar == '*') {
atomicNo = 6;
atomQueryFeatures |= Molecule.cAtomQFAny;
}
else if (theChar == '?') {
atomicNo = 0;
}
else if ((theChar == 'A' || theChar == 'a') && allowSmarts) {
atomicNo = 6;
atomQueryFeatures |= Molecule.cAtomQFAny;
atomQueryFeatures |= theChar == 'A' ? Molecule.cAtomQFNotAromatic : Molecule.cAtomQFAromatic;
mSmartsFeatureFound = true;
}
else {
switch (Character.toUpperCase(theChar)) {
case 'B':
if (position < endIndex && smiles[position] == 'r') {
atomicNo = 35;
position++;
}
else
atomicNo = 5;
break;
case 'C':
if (position < endIndex && smiles[position] == 'l') {
atomicNo = 17;
position++;
}
else
atomicNo = 6;
break;
case 'F':
atomicNo = 9;
break;
case 'I':
atomicNo = 53;
break;
case 'N':
atomicNo = 7;
break;
case 'O':
atomicNo = 8;
break;
case 'P':
atomicNo = 15;
break;
case 'S':
atomicNo = 16;
break;
}
}
///////////////////////////////////////////////////////////////////////////////
// At this position the atom is determined and the square bracket is closed! //
///////////////////////////////////////////////////////////////////////////////
if (atomicNo == -1 && theChar != '?')
throw new Exception("SmilesParser: unknown element label found. Position:"+(position-1));
int atom = mMol.addAtom(atomicNo); // this may be a hydrogen, if defined as [H]
mMol.setAtomCharge(atom, charge);
mMol.setAtomMapNo(atom, mapNo, false);
mMol.setAtomAbnormalValence(atom, abnormalValence);
if (atomQueryFeatures != 0) {
mSmartsFeatureFound = true;
if ((atomQueryFeatures & Molecule.cAtomQFAromatic) != 0) {
atomQueryFeatures &= ~Molecule.cAtomQFAromatic;
mMol.setAtomMarker(atom, true);
mAromaticAtoms++;
}
else {
mMol.setAtomMarker(atom, false);
}
mMol.setAtomQueryFeature(atom, atomQueryFeatures, true);
}
if (atomList.size() != 0) {
mSmartsFeatureFound = true;
int[] list = new int[atomList.size()];
// NOTE(review): the next line is damaged in this copy of the file. Everything between the start
// of the loop condition and the creation of the parity map is missing; fromAtom is used below
// without a visible declaration. Restore this section from the upstream source before compiling.
for (int i=0; i();
// using position as hydrogenPosition is close enough
int hydrogenCount = (explicitHydrogens == HYDROGEN_IMPLICIT_ZERO) ? 0 : explicitHydrogens;
parityMap.put(atom, new THParity(atom, position - 2, fromAtom, hydrogenCount, position - 1, isClockwise));
}
}
continue;
}
if (theChar == '$') { // recursive SMARTS
if (!recursiveGroupList.isEmpty())
throw new Exception("SmilesParser: multiple recursive SMARTS without preceding atom are not supported yet. Position:"+(position-1));
baseAtom[bracketLevel] = mol.getAllAtoms();
position += parseRecursiveGroup(smiles, position-1, recursiveGroupList);
mol.addMolecule(recursiveGroupList.get(0));
recursiveGroupList.clear();
continue;
}
if (theChar == '.') {
baseAtom[bracketLevel] = -1;
bondType = Molecule.cBondTypeDeleted;
continue;
}
if (isBondSymbol(theChar)) {
if (squareBracketOpen)
throw new Exception("SmilesParser: unexpected bond symbol inside square brackets: '"+theChar+"', position:"+(position-1));
int excludedBonds = 0;
while (isBondSymbol(theChar)) {
if (theChar == '!') {
theChar = (char)smiles[position++];
if (theChar == '@')
bondQueryFeatures |= Molecule.cBondQFNotRing;
else if ((theChar == '-' && smiles[position] == '>')
|| (theChar == '<' && smiles[position] == '-')) {
excludedBonds |= Molecule.cBondTypeMetalLigand;
position++;
}
else if (theChar == '-')
excludedBonds |= Molecule.cBondQFSingle;
else if (theChar == '=')
excludedBonds |= Molecule.cBondQFDouble;
else if (theChar == '#')
excludedBonds |= Molecule.cBondQFTriple;
else if (theChar == '$')
excludedBonds |= Molecule.cBondQFQuadruple;
else if (theChar == ':')
excludedBonds |= Molecule.cBondQFDelocalized;
else
throw new Exception("SmilesParser: bond symbol '"+theChar+"' not allowed after '!'. Position:"+(position-1));
}
else {
if (theChar == '@')
bondQueryFeatures |= Molecule.cBondQFRing;
else if (theChar == '=')
bondType = Molecule.cBondTypeDouble;
else if (theChar == '#')
bondType = Molecule.cBondTypeTriple;
else if (theChar == '$')
bondType = Molecule.cBondTypeQuadruple;
else if (theChar == ':')
bondType = Molecule.cBondTypeDelocalized;
else if (theChar == '~')
bondQueryFeatures |= Molecule.cBondQFSingle | Molecule.cBondQFDouble | Molecule.cBondQFTriple | Molecule.cBondQFDelocalized;
else if (theChar == '/') {
if (readStereoFeatures)
bondType = Molecule.cBondTypeUp; // encode slash temporarily in bondType
}
else if (theChar == '\\') {
if (readStereoFeatures)
bondType = Molecule.cBondTypeDown; // encode slash temporarily in bondType
}
// Smiles extention 'dative bond'
else if ((theChar == '-' && smiles[position] == '>')
|| (theChar == '<' && smiles[position] == '-')) {
bondType = Molecule.cBondTypeMetalLigand;
position++;
}
if (smiles[position] == ',') {
bondQueryFeatures |= bondSymbolToQueryFeature(bondType == Molecule.cBondTypeMetalLigand ? '>' : theChar);
while (smiles[position] == ',') {
if ((smiles[position+1] == '<' && smiles[position+2] == '-')
|| (smiles[position+1] == '-' && smiles[position+2] == '>')) {
bondQueryFeatures |= bondSymbolToQueryFeature('>');
position += 3;
}
else {
bondQueryFeatures |= bondSymbolToQueryFeature((char)smiles[position+1]);
position += 2;
}
}
}
}
if (smiles[position] == ';') {
position++;
theChar = (char)smiles[position++];
continue;
}
if (excludedBonds != 0)
bondQueryFeatures |= Molecule.cBondQFBondTypes & ~excludedBonds;
break;
}
continue;
}
if (theChar <= ' ') { // we stop reading at whitespace
position = endIndex;
continue;
}
if (Character.isDigit(theChar)) {
int number = theChar - '0';
if (squareBracketOpen) {
while (position < endIndex
&& Character.isDigit(smiles[position])) {
number = 10 * number + smiles[position] - '0';
position++;
}
atomMass = number;
}
else {
int bondTypePosition = isDoubleDigit ? position - 3 : position - 2;
boolean hasBondType = (smiles[bondTypePosition] == '-'
|| smiles[bondTypePosition] == '/'
|| smiles[bondTypePosition] == '\\'
|| smiles[bondTypePosition] == '='
|| smiles[bondTypePosition] == '#'
|| smiles[bondTypePosition] == '$'
|| smiles[bondTypePosition] == ':'
|| smiles[bondTypePosition] == '>'
|| smiles[bondTypePosition] == '~');
if (isDoubleDigit
&& position < endIndex
&& Character.isDigit(smiles[position])) {
number = 10 * number + smiles[position] - '0';
isDoubleDigit = false;
position++;
}
if (number >= ringClosureAtom.length) {
if (number >=MAX_CONNECTIONS)
throw new Exception("SmilesParser: ringClosureAtom number out of range: "+number);
int oldSize = ringClosureAtom.length;
int newSize = ringClosureAtom.length;
while (newSize <= number)
newSize = Math.min(MAX_CONNECTIONS, newSize + INITIAL_CONNECTIONS);
ringClosureAtom = Arrays.copyOf(ringClosureAtom, newSize);
ringClosurePosition = Arrays.copyOf(ringClosurePosition, newSize);
ringClosureBondType = Arrays.copyOf(ringClosureBondType, newSize);
ringClosureBondQueryFeatures = Arrays.copyOf(ringClosureBondQueryFeatures, newSize);
// NOTE(review): the next line is damaged in this copy of the file. The remainder of the main
// parsing loop and the beginning of the post-processing are missing; explicitHydrogen, atom and
// handleHydrogenAtomMap are used below without a visible declaration. Restore this section from
// the upstream source before compiling.
for (int i=oldSize; i= 1)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot0Hydrogen, true);
if (explicitHydrogen >= 2)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot1Hydrogen, true);
if (explicitHydrogen >= 3)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot2Hydrogen, true);
if (explicitHydrogen >= 4)
mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot3Hydrogen, true);
}
}
if (!mMakeHydrogenExplicit && (mSmartsFeatureFound || mSmartsMode == SMARTS_MODE_IS_SMARTS))
mMol.removeExplicitHydrogens();
mMol.ensureHelperArrays(Molecule.cHelperNeighbours);
correctValenceExceededNitrogen(); // convert pyridine oxides and nitro into polar structures with valid nitrogen valences
locateAromaticDoubleBonds(allowSmarts, mSmartsFeatureFound);
mMol.removeAtomCustomLabels();
mMol.setHydrogenProtection(false);
if (readStereoFeatures) {
assignKnownEZBondParities();
if (parityMap != null) {
for (THParity parity:parityMap.values())
mMol.setAtomParity(parity.mCentralAtom, parity.calculateParity(handleHydrogenAtomMap), false);
mMol.setParitiesValid(0);
}
}
// defines unknown EZ parities as such, i.e. prevent coordinate generation to create implicit EZ-parities
mMol.setParitiesValid(0);
if (createCoordinates) {
CoordinateInventor inventor = new CoordinateInventor(mCoordinateMode);
if (mRandomSeed != 0)
inventor.setRandomSeed(mRandomSeed);
inventor.invent(mMol);
if (readStereoFeatures)
mMol.setUnknownParitiesToExplicitlyUnknown();
}
if (mSmartsFeatureFound || mSmartsMode == SMARTS_MODE_IS_SMARTS) {
mMol.setFragment(true);
mMol.validateAtomQueryFeatures();
mMol.validateBondQueryFeatures();
}
}
/**
 * Reports the SMARTS-feature-found state of this parser. The state is reset to false at the
 * start of every parse() call and set to true whenever a SMARTS feature is encountered.
 * @return true if the previously parsed SMILES contained a SMARTS feature and was not parsed with SMARTS_MODE_IS_SMILES
 */
public boolean isSmarts() {
return mSmartsFeatureFound;
}
/**
 * Reads a charge definition that starts at the given position. Accepted forms are a run of
 * identical signs ('+', '++', '--', ...) or one sign followed by one digit ('+2', '-1', '+0').
 * @param smiles bytes containing the SMILES
 * @param position position of potential first charge symbol '+' or '-'
 * @param characterCount int[1] that receives the number of characters used by the charge encoding; 0 if there is none
 * @return extracted charge; 0: no charge defined or explicit charge=0 - distinguish by characterCount
 */
private int parseCharge(byte[] smiles, int position, int[] characterCount) {
    characterCount[0] = 0;

    final byte sign = smiles[position];
    if (sign != '+' && sign != '-')
        return 0;

    // count how often the sign is repeated; the run length is the charge magnitude
    int length = 1;
    while (smiles[position + length] == sign)
        length++;

    int magnitude = length;

    // a single sign may alternatively be followed by one digit giving the magnitude
    if (length == 1 && Character.isDigit(smiles[position + 1])) {
        magnitude = smiles[position + 1] - '0';
        length = 2;
    }

    characterCount[0] = length;
    return (sign == '+') ? magnitude : -magnitude;
}
/**
 * Tells whether the character may start or continue a bond definition. Besides the bond order
 * symbols this includes the stereo slashes, '<' as the start of a dative bond, and the SMARTS
 * symbols '~', '!' and '@'.
 * @param theChar character to be checked
 * @return true if theChar is one of - = # $ : / \ < ~ ! @
 */
private boolean isBondSymbol(char theChar) {
    switch (theChar) {
        case '-':
        case '=':
        case '#':
        case '$':
        case ':':
        case '/':
        case '\\':
        case '<':
        case '~':
        case '!':
        case '@':
            return true;
        default:
            return false;
    }
}
/**
 * Decides whether a comma (OR-logic) between two subsequent features can be accepted. This is the case
 * - if the feature before the comma is one of D, R, X or z followed by one digit and optionally one sign
 * - and if that feature's type (and atom label, if an atom label is preceding), e.g. 'NX' in NX3 and NX4+,
 *   is repeated directly after the comma.
 * @param smiles bytes containing the SMARTS
 * @param commaPosition index of the comma
 * @param skipCount int[1] to hold the number of characters to skip for atom label (0 if there is no atom label);
 *        only written if the feature before the comma has an accepted type
 * @return true, if comma (OR-logic) is an allowed delimiter here
 */
private boolean isRepeatedAllowedORFeature(byte[] smiles, int commaPosition, int[] skipCount) {
    if (commaPosition < 3)
        return false;

    // walk backwards from the comma: optional sign, mandatory digit, feature letter
    int cursor = commaPosition - 1;
    byte last = smiles[cursor];
    if (last == '-' || last == '+')
        cursor--;

    if (!Character.isDigit(smiles[cursor]))
        return false;

    cursor--;
    switch (smiles[cursor]) {
        case 'D':
        case 'R':
        case 'X':
        case 'z':
            break;
        default:
            return false;
    }

    // letters directly preceding the feature letter are considered the atom label
    int labelLength = 0;
    for (; cursor > 0 && Character.isLetter(smiles[cursor - 1]); cursor--)
        labelLength++;
    skipCount[0] = labelLength;

    // label and feature letter must be repeated literally behind the comma
    for (int other = commaPosition + 1; Character.isLetter(smiles[cursor]); cursor++, other++)
        if (other >= smiles.length || smiles[other] != smiles[cursor])
            return false;

    return true;
}
/**
 * Reads the atom specification that starts at 'position' and stores the result in 'info'.
 * Three forms are recognised:
 * - '#' followed by digits: an atomic number; sets mSmartsFeatureFound and leaves both
 *   mayBeAromatic and mayBeAliphatic true
 * - an upper case letter, optionally followed by one lower case letter: an atom label;
 *   mayBeAromatic is set to false
 * - a lower case letter, optionally followed by one more lower case letter: an atom label;
 *   mayBeAliphatic is set to false
 * info.labelLength receives the number of characters used ('#' included).
 * @param smiles
 * @param position index of the first character of the atom specification
 * @param endIndex exclusive limit for reading the digits of an atomic number
 * @param info receives atomicNo, labelLength, mayBeAromatic and mayBeAliphatic
 * @return true if one of the three forms was found; false if smiles[position] is neither '#' nor an ASCII letter
 * @throws Exception if the atomic number behind '#' is 0, missing, or not smaller than Molecule.cAtomLabel.length
 */
private boolean parseAtomInBrackets(byte[] smiles, int position, int endIndex, AtomInfo info) throws Exception {
    info.mayBeAromatic = true;
    info.mayBeAliphatic = true;

    final byte first = smiles[position];

    if (first == '#') {
        mSmartsFeatureFound = true;
        info.atomicNo = 0;
        info.labelLength = 1;

        int cursor = position + 1;
        for (; cursor < endIndex && Character.isDigit(smiles[cursor]); cursor++) {
            info.atomicNo = 10 * info.atomicNo + smiles[cursor] - '0';
            info.labelLength++;
        }

        if (info.atomicNo == 0 || info.atomicNo >= Molecule.cAtomLabel.length)
            throw new Exception("SmilesParser: Atomic number out of range. position:"+(cursor-1));

        return true;
    }

    final boolean startsUpperCase = first >= 'A' && first <= 'Z';
    final boolean startsLowerCase = first >= 'a' && first <= 'z';
    if (!startsUpperCase && !startsLowerCase)
        return false;

    // A lower case letter directly behind the first letter is taken as part of the label.
    final byte second = smiles[position+1];
    info.labelLength = (second >= 'a' && second <= 'z') ? 2 : 1;
    info.atomicNo = Molecule.getAtomicNoFromLabel(new String(smiles, position, info.labelLength, StandardCharsets.UTF_8));

    if (startsUpperCase)
        info.mayBeAromatic = false;
    else
        info.mayBeAliphatic = false;

    return true;
}
/**
 * Maps a bond symbol to the corresponding Molecule.cBondQF... constant.
 * Any symbol other than '=', '#', '$', ':', '>' and '~' yields Molecule.cBondQFSingle.
 * @param symbol bond symbol character
 * @return bond query feature constant
 */
private int bondSymbolToQueryFeature(char symbol) {
    switch (symbol) {
        case '=':
            return Molecule.cBondQFDouble;
        case '#':
            return Molecule.cBondQFTriple;
        case '$':
            return Molecule.cBondQFQuadruple;
        case ':':
            return Molecule.cBondQFDelocalized;
        case '>':
            return Molecule.cBondQFMetalLigand;
        case '~':
            return Molecule.cBondQFBondTypes;
        default:
            return Molecule.cBondQFSingle;
    }
}
/**
 * Appends a blank followed by the given feature text to the SMARTS warning buffer,
 * creating the buffer on first use. Does nothing unless mCreateSmartsWarnings is set.
 * @param feature text to append
 */
private void smartsWarning(String feature) {
    if (!mCreateSmartsWarnings)
        return;

    if (mSmartsWarningBuffer == null)
        mSmartsWarningBuffer = new StringBuilder();

    mSmartsWarningBuffer.append(" ").append(feature);
}
/**
 * Parses a recursive SMARTS group of the form '$(...)' whose '$' is located at dollarIndex.
 * The visible part verifies that '(' directly follows the '$', locates the matching ')' by counting
 * nested parentheses, parses the enclosed text into a new StereoMolecule using a new
 * SmilesParser(mMode | mSmartsMode), and appends that molecule to groupList.
 * NOTE(review): starting with the line that begins with "for (int atom=0; atom=0)" the text of this method
 * and of the definitions that follow it appears to be truncated (text between a less-than and the next
 * greater-than character seems to be missing). The remaining logic and the returned value cannot be
 * read from here and need to be confirmed against the complete source.
 * @param smiles SMILES/SMARTS bytes
 * @param dollarIndex index of the '$' character
 * @param groupList list that receives the parsed group
 * @throws Exception if '$' is not followed by '(' or if the closing ')' is missing
 */
private int parseRecursiveGroup(byte[] smiles, int dollarIndex, ArrayList groupList) throws Exception {
if (smiles[dollarIndex+1] != '(')
throw new Exception("SmilesParser: '$' for recursive SMARTS must be followed by '('. position:"+dollarIndex);
// Scan forward until the parenthesis opened behind the '$' is closed again.
int openBrackets = 1;
int endIndex = dollarIndex+2;
while (endIndex < smiles.length && openBrackets > 0) {
if (smiles[endIndex] == '(')
openBrackets++;
else if (smiles[endIndex] == ')')
openBrackets--;
endIndex++;
}
if (openBrackets > 0)
throw new Exception("SmilesParser: Missing closing ')' for recursive SMARTS. '('-position:"+(dollarIndex+1));
// endIndex now points behind the closing ')'; the group text spans dollarIndex+2 up to endIndex-1.
StereoMolecule group = new StereoMolecule(16, 16);
new SmilesParser(mMode | mSmartsMode).parse(group, smiles, dollarIndex+2, endIndex-1);
groupList.add(group);
if (smiles[dollarIndex-1] == '!')
for (int atom=0; atom=0) System.arraycopy(mIsAromaticBond, 0, isAromaticBond, 0, mMol.getBonds());
// Some Smiles contain 'aromatic' rings with atoms not being compatible
// with a PI-bond. These include: tertiary non-charged nitrogen, [nH],
// sulfur, non-charged oxygen, charged carbon, etc...
// All these atoms and attached bonds are marked as handled to avoid
// attached bonds to be promoted (changed to double bond) later.
for (int ring=0; ring=4; qualifyingNo--) {
do {
qualifyingBondFound = false;
for (int bond=0; bond= 2)
if (!connectConjugatedRadicalPairs(isAromaticBond))
break;
if (allowSmartsFeatures) {
if (mAromaticAtoms != 0) {
for (int atom=0; atom 0)
return false;
}
// NOTE(review): the statements from here to the closing brace look like the end of a separate boolean
// helper that operates on a single 'atom'; its header is not visible in this text - confirm.
int explicitHydrogens = (mMol.getAtomCustomLabel(atom) == null || mMol.getAtomCustomLabelBytes(atom)[0] == HYDROGEN_IMPLICIT_ZERO) ?
0 : mMol.getAtomCustomLabelBytes(atom)[0];
int freeValence = mMol.getFreeValence(atom) - explicitHydrogens;
if (freeValence < 1)
return false;
if (mMol.getAtomicNo(atom) == 16
|| mMol.getAtomicNo(atom) == 34
|| mMol.getAtomicNo(atom) == 52) {
if (mMol.getConnAtoms(atom) == 2 && mMol.getAtomCharge(atom) <= 0)
return false;
return freeValence != 2; // e.g. -S(=O)- correction to account for tetravalent S,Se
}
return true;
}
/**
 * Changes the given bond from single to double (if it is a single bond) and, for each of its
 * two atoms that is currently marked, removes the atom marker and decrements mAromaticAtoms.
 * NOTE(review): from the line that begins with "for (int j=0; j 3" onwards the text appears to be truncated.
 * The statements behind it (reducing a bond order and moving one charge unit from 'atom' to an
 * electronegative 'connAtom') presumably belong to a different method whose header is not visible - confirm
 * against the complete source.
 * @param bond index of the bond to promote
 */
private void promoteBond(int bond) {
if (mMol.getBondType(bond) == Molecule.cBondTypeSingle)
mMol.setBondType(bond, Molecule.cBondTypeDouble);
for (int i=0; i<2; i++) {
int bondAtom = mMol.getBondAtom(i, bond);
if (mMol.isMarkedAtom(bondAtom)) {
mMol.setAtomMarker(bondAtom, false);
mAromaticAtoms--;
}
for (int j=0; j 3
&& mMol.getAtomPi(atom) > 0) {
for (int i=0; i 1)
&& mMol.isElectronegative(connAtom)) {
// Lower the bond order by one and compensate with a charge pair on the two atoms.
if (mMol.getBondType(connBond) == Molecule.cBondTypeTriple)
mMol.setBondType(connBond, Molecule.cBondTypeDouble);
else
mMol.setBondType(connBond, Molecule.cBondTypeSingle);
mMol.setAtomCharge(atom, mMol.getAtomCharge(atom) + 1);
mMol.setAtomCharge(connAtom, mMol.getAtomCharge(connAtom) - 1);
mMol.setAtomAbnormalValence(atom, -1);
break;
}
}
}
}
}
/**
 * Visible part: ensures the molecule's helper arrays up to Molecule.cHelperRings and allocates
 * two-element work arrays (refAtom, refBond, otherAtom) plus a 'paritiesFound' flag.
 * NOTE(review): the bond loop is truncated in this text; the trailing "mNeighbourList;" on the same
 * line looks like the remainder of a field declaration of a class that follows - confirm against the
 * complete source. The return value cannot be read from here.
 */
private boolean assignKnownEZBondParities() {
mMol.ensureHelperArrays(Molecule.cHelperRings);
boolean paritiesFound = false;
int[] refAtom = new int[2];
int[] refBond = new int[2];
int[] otherAtom = new int[2];
for (int bond=0; bond mNeighbourList;
/**
 * Creates a parity object for a stereo center encountered during smiles traversal.
 * If explicitHydrogen is neither 0 nor 1, the object is only flagged as erroneous and
 * nothing else is initialized.
 * @param centralAtom index of atom processed
 * @param centralAtomPosition position in SMILES of central atom
 * @param fromAtom index of parent atom of centralAtom (-1 if centralAtom is first atom in smiles)
 * @param explicitHydrogen Daylight syntax: hydrogen atoms defined within square bracket of other atom
 * @param hydrogenPosition not referenced by this constructor; a bracket hydrogen is registered
 *                         at centralAtomPosition+1 instead
 * @param isClockwise true if central atom is marked with @@ rather than @
 */
public THParity(int centralAtom, int centralAtomPosition, int fromAtom, int explicitHydrogen, int hydrogenPosition, boolean isClockwise) {
    boolean supportedHydrogenCount = (explicitHydrogen == 0 || explicitHydrogen == 1);
    if (!supportedHydrogenCount) {
        mError = true;
        return;
    }

    mCentralAtom = centralAtom;
    mCentralAtomPosition = centralAtomPosition;
    mIsClockwise = isClockwise;
    mNeighbourList = new ArrayList<>();

    // Only a central atom with a parent registers neighbours right away: first the parent
    // (just before the central atom), then a bracket hydrogen (just after it), if there is one.
    if (fromAtom != -1) {
        addNeighbor(fromAtom, centralAtomPosition-1, false);
        if (explicitHydrogen == 1)
            addNeighbor(PSEUDO_ATOM_HYDROGEN, centralAtomPosition+1, false);
    }
}
/**
 * Registers a currently traversed neighbor or ring closure with this parity object,
 * which belongs to the neighbor's parent atom.
 * For a ring closure the position of the closure digit in the smiles, rather than the
 * position of the neighbor itself, is the one relevant for parity determination.
 * The atom index is stored as well, because neighbors are not necessarily added in atom
 * sequence (ring closure with connection back to stereo center).
 * Nothing happens if the object is already flagged as erroneous; an attempt to add a
 * fifth neighbor flags it as erroneous.
 * @param atom
 * @param position
 * @param unused not evaluated
 */
public void addNeighbor(int atom, int position, boolean unused) {
    if (mError)
        return;

    boolean alreadyComplete = (mNeighbourList.size() == 4);
    if (alreadyComplete)
        mError = true;
    else
        mNeighbourList.add(new ParityNeighbour(atom, position));
}
/**
 * Visible part: returns Molecule.cAtomParityUnknown if this object is flagged as erroneous or if the
 * number of registered neighbours is neither 3 nor 4. Otherwise the atom indexes of all real neighbours
 * (not the hydrogen or lone pair pseudo atoms) are translated through handleHydrogenAtomMap, and a lone pair
 * pseudo neighbour located at the central atom position is appended if there are exactly three neighbours.
 * NOTE(review): the text is truncated inside the commented-out debug output below; the actual parity
 * calculation is not visible, and the statements behind it (toggling 'inversion' and returning it)
 * presumably belong to a separate helper - confirm against the complete source.
 * @param handleHydrogenAtomMap maps smiles-parse-time atom indexes to the atom indexes used after handleHydrogens
 */
public int calculateParity(int[] handleHydrogenAtomMap) {
if (mError)
return Molecule.cAtomParityUnknown;
// We need to translate smiles-parse-time atom indexes to those that the molecule
// uses after calling handleHydrogens, which is called from ensureHelperArrays().
for (ParityNeighbour neighbour:mNeighbourList)
if (neighbour.mAtom != PSEUDO_ATOM_HYDROGEN && neighbour.mAtom != PSEUDO_ATOM_LONE_PAIR)
neighbour.mAtom = handleHydrogenAtomMap[neighbour.mAtom];
if (mNeighbourList.size() == 3)
// Within SMILES, all hydrogen atoms at stereo centers must be explicit (as explicit atoms or as H count in square brackets).
// Therefore, three neighbour atoms is a rare situation, e.g. CC[S@](=O)C or frozen out CC[N@H]C
// In these cases we add the electron pair as pseudo neighbour
mNeighbourList.add(new ParityNeighbour(PSEUDO_ATOM_LONE_PAIR, mCentralAtomPosition));
else if (mNeighbourList.size() != 4)
return Molecule.cAtomParityUnknown;
/*
System.out.println();
System.out.println("central:"+mCentralAtom+(mIsClockwise?" @@":" @")+" from:"
+((mFromAtom == -1)?"none":Integer.toString(mFromAtom))+" with "+mImplicitHydrogen+" hydrogens");
System.out.print("neighbors: "+mNeighborAtom[0]+"("+mNeighborPosition[0]+(mNeighborIsHydrogen[0]?",H":",non-H")+")");
for (int i=1; i mNeighbourList.get(i).mAtom)
inversion = !inversion;
if (mNeighbourList.get(j).mPosition > mNeighbourList.get(i).mPosition)
inversion = !inversion;
}
}
return inversion;
}
}
/**
 * Console self-test: parses every input SMILES of the table, re-creates an isomeric SMILES
 * from the parsed molecule and prints whether it equals the expected SMILES of the same row.
 * Exceptions are printed as stack traces and do not stop the run.
 */
private static void testStereo() {
    // Each row: { SMILES to parse, SMILES expected from IsomericSmilesCreator }
    final String[][] cases = {
            { "F/C=C/I", "F/C=C/I" },
            { "F/C=C\\I", "F/C=C\\I" },
            { "C(=C/I)/F", "F/C=C\\I" },
            { "[H]C(/F)=C/I", "F/C=C\\I" },
            { "C(=C\\1)/I.F1", "F/C=C/I" },
            { "C(=C1)/I.F/1", "F/C=C/I" },
            { "C(=C\\F)/1.I1", "F/C=C/I" },
            { "C(=C\\F)1.I\\1", "F/C=C/I" },
            { "C\\1=C/I.F1", "F/C=C/I" },
            { "C1=C/I.F/1", "F/C=C/I" },
            { "C(=C\\1)/2.F1.I2", "F/C=C/I" },
            { "C/2=C\\1.F1.I2", "F/C=C/I" },
            { "C/1=C/C=C/F.I1", "F/C=C/C=C\\I" },
            { "C1=C/C=C/F.I\\1", "F/C=C/C=C\\I" },
            { "C(/I)=C/C=C/1.F1", "F/C=C/C=C\\I" },
            { "C(/I)=C/C=C1.F\\1", "F/C=C/C=C\\I" },
            { "[C@](Cl)(F)(I)1.Br1", "F[C@](Cl)(Br)I" },
            { "Br[C@](Cl)(I)1.F1", "F[C@](Cl)(Br)I" },
            { "[C@H](F)(I)1.Br1", "F[C@H](Br)I" },
            { "Br[C@@H](F)1.I1", "F[C@H](Br)I" },
            { "C[S@@](CC)=O", "CC[S@](C)=O" },
            { "[S@](=O)(C)CC", "CC[S](C)=O" }
    };

    StereoMolecule molecule = new StereoMolecule();
    for (String[] row : cases) {
        String input = row[0];
        String expected = row[1];
        try {
            new SmilesParser().parse(molecule, input);
            String created = new IsomericSmilesCreator(molecule).getSmiles();
            String verdict = expected.equals(created) ? " OK" : " EXPECTED: "+expected+" ERROR!";
            System.out.print("IN:"+input+" OUT:"+created);
            System.out.println(verdict);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}
/**
 * Console self-test: runs testStereo() and then parses every SMILES of the table below,
 * creates the idcode of the parsed molecule and reports rows whose idcode differs from
 * the expected one. Rows with "error" as expected idcode are expected to fail with an
 * exception; they are reported if parsing succeeds.
 * @param args not evaluated
 */
public static void main(String[] args) {
    testStereo();

    System.out.println("ID-code equivalence test:");

    // Each row: { SMILES, name, expected idcode or "error" }
    final String[][] cases = {
            { "N[C@@]([H])(C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
            { "N[C@@H](C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
            { "N[C@H](C(=O)O)C", "S-alanine", "gGX`BDdwMUM@@" },
            { "[H][C@](N)(C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
            { "[C@H](N)(C)C(=O)O", "S-alanine", "gGX`BDdwMUM@@" },
            { "N[C@]([H])(C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
            { "N[C@H](C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
            { "N[C@@H](C(=O)O)C", "R-alanine", "gGX`BDdwMUL`@" },
            { "[H][C@@](N)(C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
            { "[C@@H](N)(C)C(=O)O", "R-alanine", "gGX`BDdwMUL`@" },
            { "C[C@H]1CCCCO1", "S-Methyl-pyran", "gOq@@eLm]UUH`@" },
            { "O1CCCC[C@@H]1C", "S-Methyl-pyran", "gOq@@eLm]UUH`@" },
            { "[C@H](F)(B)O", "S-Methyl-oxetan", "gCaDDICTBSURH@" },
            { "C1CO[C@H]1C", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "C1CO[C@@H](C)1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "[C@H]1(C)CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "[H][C@]1(C)CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "[H][C@@]1(CCO1)C", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "[C@@]1([H])(C)CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "[C@]1(C)([H])CCO1", "S-Methyl-oxetan", "gKQ@@eLmUTb@" },
            { "C1[C@@H]2COC2=N1", "oxetan-azetin", "gGy@LDimDvfja`@" },
            { "CC(C)[C@@]12C[C@@H]1[C@@H](C)C(=O)C2", "alpha-thujone", "dmLH@@RYe~IfyjjjkDaIh@" },
            { "CN1CCC[C@H]1c2cccnc2", "Nicotine", "dcm@@@{IDeCEDUSh@UUECP@" },
            { "CC[C@H](O1)CC[C@@]12CCCO2", "2S,5R-Chalcogran", "dmLD@@qJZY|fFZjjjdbH`@" },
            { "CCCC", "butane", "gC`@Dij@@" },
            { "C1C.CC1", "butane", "gC`@Dij@@" },
            { "[CH3][CH2][CH2][CH3]", "butane", "gC`@Dij@@" },
            { "C-C-C-C", "butane", "gC`@Dij@@" },
            { "C12.C1.CC2", "butane", "gC`@Dij@@" },
            { "[Na+].[Cl-]", "NaCl", "eDARHm@zd@@" },
            { "[Na+]-[Cl-]", "NaCl", "error" },
            { "[Na+]1.[Cl-]1", "NaCl", "error" },
            { "c1ccccc1", "benzene", "gFp@DiTt@@@" },
            { "C1=C-C=C-C=C1", "benzene", "gFp@DiTt@@@" },
            { "C1:C:C:C:C:C:1", "benzene", "gFp@DiTt@@@" },
            { "c1ccncc1", "pyridine", "gFx@@eJf`@@@" },
            { "[nH]1cccc1", "pyrrole", "gKX@@eKcRp@" },
            { "N1C=C-C=C1", "pyrrole", "gKX@@eKcRp@" },
            { "[H]n1cccc1", "pyrrole", "gKX@@eKcRp@" },
            { "[H]n1cccc1", "pyrrole", "gKX@@eKcRp@" },
            { "c1cncc1", "pyrrole no [nH]", "error" },
            { "[13CH4]", "C13-methane", "fH@FJp@" },
            { "[35ClH]", "35-chlorane", "fHdP@qX`" },
            { "[35Cl-]", "35-chloride", "fHtPxAbq@" },
            { "[Na+].[O-]c1ccccc1", "Na-phenolate", "daxHaHCPBXyAYUn`@@@" },
            { "c1cc([O-].[Na+])ccc1", "Na-phenolate", "daxHaHCPBXyAYUn`@@@" },
            { "C[C@@](C)(O1)C[C@@H](O)[C@@]1(O2)[C@@H](C)[C@@H]3CC=C4[C@]3(C2)C(=O)C[C@H]5[C@H]4CC[C@@H](C6)[C@]5(C)Cc(n7)c6nc(C[C@@]89(C))c7C[C@@H]8CC[C@@H]%10[C@@H]9C[C@@H](O)[C@@]%11(C)C%10=C[C@H](O%12)[C@]%11(O)[C@H](C)[C@]%12(O%13)[C@H](O)C[C@@]%13(C)CO",
              "Cephalostatin-1",
              "gdKe@h@@K`H@XjKHuYlnoP\\bbdRbbVTLbTrJbRaQRRRbTJTRTrfrfTTOBPHtFODPhLNSMdIERYJmShLfs]aqy|uUMUUUUUUE@UUUUMUUUUUUTQUUTPR`nDdQQKB|RIFbiQeARuQt`rSSMNtGS\\ct@@" },
    };

    StereoMolecule molecule = new StereoMolecule();
    for (String[] row : cases) {
        final String smiles = row[0];
        final String name = row[1];
        final String expected = row[2];
        final boolean failureExpected = expected.equals("error");

        try {
            new SmilesParser().parse(molecule, smiles);
            String idcode = new Canonizer(molecule).getIDCode();

            if (failureExpected) {
                System.out.println("Should create error! "+name+" smiles:"+smiles+" idcode:"+idcode);
                continue;
            }

            if (!expected.equals(idcode))
                System.out.println("ERROR! "+name+" smiles:"+smiles+" is:"+idcode+" must:"+expected);
        }
        catch (Exception e) {
            // An exception is the desired outcome for rows that expect an error.
            if (!failureExpected)
                System.out.println("ERROR! "+name+" smiles:"+smiles+" exception:"+e.getMessage());
        }
    }
}
}
/**
 * Reads a number or a number range from a byte array and exposes the result as min and max.
 * Two notations are handled by parse(): a plain number, e.g. 3 (optionally extended through
 * comma delimited repetitions), and a range in curly brackets, e.g. {2-5}.
 */
class SmilesRange {
private final byte[] smiles;
// read cursor used by parse() and parseInt()
private int pos;
// lower and upper limit found by the most recent parse() call
public int min,max;
// true if the most recent parse() call found neither a number nor a range and used the given defaults
public boolean isDefault;
/**
 * @param smiles bytes that later parse() calls read from; the array is referenced, not copied
 */
public SmilesRange(byte[] smiles) {
this.smiles = smiles;
}
/**
 * Tries to read a number or a range starting at the given position and updates min, max and isDefault.
 * - If a digit is found at position, min and max are both set to the parsed number. Comma delimited
 *   repetitions may then widen the range (see the note inside the loop).
 * - If '{' followed by a digit is found, the expected form is {min-max}.
 * - Otherwise min and max are set to the given defaults and isDefault is set to true.
 * @param position index of the first character to inspect
 * @param defaultMin value for min if neither a number nor a range is found
 * @param defaultMax value for max if neither a number nor a range is found
 * @return number of characters consumed; 0 if the defaults were used or if a '{' range is malformed
 *         (in the malformed case min, and possibly max, may already have been overwritten)
 */
public int parse(int position, int defaultMin, int defaultMax) {
isDefault = false;
pos = position;
if (Character.isDigit(smiles[position])) {
int val = parseInt();
min = max = val;
// If we have the same query feature, comma delimited and with different number, then we extend the range...
// firstLetter is moved back over the letters and digits that precede the number
int firstLetter = position-1;
while (firstLetter > 1 && Character.isLetterOrDigit(smiles[firstLetter-1]))
firstLetter--;
while (smiles[pos] == ',') {
boolean lettersMatch = true;
int letterCount = position-firstLetter;
// NOTE(review): the next line appears to be truncated in this text; the comparison of the letters behind
// the comma and the re-parsing of 'val' are not visible here - confirm against the complete source.
for (int i=0; i val)
min = val;
else if (max < val)
max = val;
}
return pos - position;
}
if (smiles[position] == '{'
&& Character.isDigit(smiles[position+1])) {
// skip the '{'
pos++;
min = parseInt();
if (smiles[pos++] != '-')
return 0; // unexpected
if (!Character.isDigit(smiles[pos]))
return 0; // unexpected
max = parseInt();
if (smiles[pos++] != '}')
return 0; // unexpected
return pos - position;
}
min = defaultMin;
max = defaultMax;
isDefault = true;
return 0;
}
/**
 * @return true if min and max are equal
 */
public boolean isSingle() {
return max == min;
}
/**
 * @return true if max is larger than min
 */
public boolean isRange() {
return max > min;
}
/**
 * @return the current limits formatted as {min-max}
 */
public String toString() {
return "{"+min+"-"+max+"}";
}
/**
 * Reads the digit at pos and, if the following character is a digit as well, that second digit too.
 * No more than two digits are consumed. pos is advanced behind the digits that were read.
 * The caller must make sure that the character at pos is a digit.
 * @return the parsed one or two digit number
 */
private int parseInt() {
int num = smiles[pos++] - '0';
if (Character.isDigit(smiles[pos]))
num = 10 * num + (smiles[pos++] - '0');
return num;
}
}