All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.actelion.research.chem.SmilesParser Maven / Gradle / Ivy

There is a newer version: 2024.11.2
Show newest version
/*
 * Copyright (c) 1997 - 2016
 * Actelion Pharmaceuticals Ltd.
 * Gewerbestrasse 16
 * CH-4123 Allschwil, Switzerland
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of the the copyright holder nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * @author Thomas Sander
 */

package com.actelion.research.chem;

import com.actelion.research.chem.coords.CoordinateInventor;
import com.actelion.research.chem.reaction.Reaction;
import com.actelion.research.util.ArrayUtils;
import com.actelion.research.util.SortedList;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.TreeMap;


public class SmilesParser {
	private static final int SMARTS_MODE_MASK = 3;
	public static final int SMARTS_MODE_IS_SMILES = 0;
	public static final int SMARTS_MODE_GUESS = 1;
	public static final int SMARTS_MODE_IS_SMARTS = 2;

	public static final int MODE_SKIP_COORDINATE_TEMPLATES = 4;
	public static final int MODE_MAKE_HYDROGEN_EXPLICIT = 8;
	public static final int MODE_NO_CACTUS_SYNTAX = 16;  // if not set, then some CACTVS SMARTS extensions will be recognized and translated as close as possible
	public static final int MODE_SINGLE_DOT_SEPARATOR = 32;  // CONSIDER single dots '.' (rather than '..') as moelcule separator when parsing reactions
	public static final int MODE_CREATE_SMARTS_WARNING = 64;

	private static final int INITIAL_CONNECTIONS = 16;
	private static final int MAX_CONNECTIONS = 100; // largest allowed one in SMILES is 99
	private static final int BRACKET_LEVELS = 32;
	private static final int MAX_AROMATIC_RING_SIZE = 15;

	private static final int HYDROGEN_ANY = -1;

	// An unspecified hydrogen count within brackets means 0 hydrogens for SMILES, but no hydrogen restriction for SMARTS.
	// Therefore, we have to distinguish it from an explicit H0, which defines a query feature in SMARTS.
	private static final int HYDROGEN_IMPLICIT_ZERO = 9;

	private StereoMolecule mMol;
	private boolean[] mIsAromaticBond;
	private int mAromaticAtoms,mAromaticBonds,mCoordinateMode;
	private final int mSmartsMode,mMode;
	private long mRandomSeed;
	private final boolean mCreateSmartsWarnings,mMakeHydrogenExplicit,mAllowCactvs,mSingleDotSeparator;
	private StringBuilder mSmartsWarningBuffer;
	private boolean mSmartsFeatureFound;

	/**
	 * Creates a new SmilesParser that doesn't allow SMARTS features to be present in
	 * parsed strings. SMARTS features cause an exception. The fragment flag of created
	 * molecules is never set.
	 * This is equivalent to calling {@code new SmilesParser(SMARTS_MODE_IS_SMILES)},
	 * i.e. no additional mode flags are set.
	 */
	public SmilesParser() {
		this(SMARTS_MODE_IS_SMILES);
		}

	/**
	 * Creates a new SmilesParser that may or may not allow SMARTS features to be present in
	 * parsed strings. If smartsMode is SMARTS_MODE_IS_SMILES, then any SMARTS features cause
	 * an exception. If smartsMode is SMARTS_MODE_IS_SMARTS, then the input string is considered
	 * a SMARTS, e.g. 'CC' is taken as fragment of two non-aromatic carbon atoms connected by a
	 * single bond and without any implicit hydrogen atoms. If smartsMode is SMARTS_MODE_IS_GUESS,
	 * then the molecule is considered a substructure if any SMARTS features are discovered.
	 * Depending on whether SMARTS features are found, created molecules have the fragment flag set
	 * or not set.
	 * @param mode one of SMARTS_MODE... and optionally other mode flags
	 */
	public SmilesParser(int mode) {
		mMode = mode & ~SMARTS_MODE_MASK;
		mSmartsMode = mode & SMARTS_MODE_MASK;
		mAllowCactvs = (mode & MODE_NO_CACTUS_SYNTAX) == 0;
		mSingleDotSeparator = (mode & MODE_SINGLE_DOT_SEPARATOR) != 0;
		mCreateSmartsWarnings = (mode & MODE_CREATE_SMARTS_WARNING) != 0;
		mMakeHydrogenExplicit = ((mode & MODE_MAKE_HYDROGEN_EXPLICIT) != 0);
		mCoordinateMode = CoordinateInventor.MODE_DEFAULT;
		if ((mode & MODE_SKIP_COORDINATE_TEMPLATES) != 0)
			mCoordinateMode |= CoordinateInventor.MODE_SKIP_DEFAULT_TEMPLATES;
		if (mMakeHydrogenExplicit)
			mCoordinateMode &= ~CoordinateInventor.MODE_REMOVE_HYDROGEN;
		}

	/**
	 * Depending on the parse() parameters, the SmilesParser may or may not generate new atom coordinates
	 * after parsing the SMILES. In difficult cases the employed CoordinateInventor uses random decisions
	 * when optimizing colliding coordinates. In strained and bridged ring systems, generated coordinates
	 * may not correctly represent all E/Z-bond configurations.
	 * Calling this method with a seed != 0 causes the creation of reproducible atom coordinates.
	 * The seed is only stored here; parse() passes it to the CoordinateInventor if it is not 0.
	 * @param seed value different from 0 in order to always create the same reproducible atom coordinates
	 */
	public void setRandomSeed(long seed) {
		mRandomSeed = seed;
		}

	/**
	 * Convenience method to obtain a StereoMolecule from a SMILES string.
	 * The string is converted into UTF-8 bytes and passed to parseMolecule(byte[]).
	 * @param smiles SMILES (or SMARTS) string; may be null
	 * @return null if smiles is null; otherwise the result of parseMolecule(byte[])
	 */
	public StereoMolecule parseMolecule(String smiles) {
		if (smiles == null)
			return null;

		byte[] smilesBytes = smiles.getBytes(StandardCharsets.UTF_8);
		return parseMolecule(smilesBytes);
		}

	/**
	 * Convenience method to quickly obtain a StereoMolecule from a SMILES given as byte array.
	 * If you process many SMILES, then the parse() methods are preferred, because
	 * they avoid the steady instantiation of new StereoMolecules.
	 * Any exception thrown while parsing is swallowed on purpose and reported as null.
	 * @param smiles bytes of the SMILES (or SMARTS) to be parsed
	 * @return the parsed molecule, or null if parsing failed with any exception
	 */
	public StereoMolecule parseMolecule(byte[] smiles) {
		StereoMolecule parsed = new StereoMolecule();
		try {
			parse(parsed, smiles);
			return parsed;
			}
		catch (Exception e) {
			// best effort by design: callers receive null instead of an exception
			return null;
			}
		}

	/**
	 * Checks whether the given bytes look like a reaction SMILES without reporting
	 * any catalyst count. Delegates to isReactionSmiles(byte[], int[]) with a null holder.
	 * @param smiles bytes of the potential reaction SMILES
	 * @return result of the delegate call
	 */
	public static boolean isReactionSmiles(byte[] smiles) {
		final int[] noCatalystCountHolder = null;
		return isReactionSmiles(smiles, noCatalystCountHolder);
		}

	/**
	 * Scans the given bytes for reaction separators '&gt;'. A '&gt;' that directly follows a '-'
	 * is skipped (elsewhere in this class "-&gt;" is handled as a dative bond symbol).
	 * The scan stops once three separators were counted or no further one is found.
	 * If catalystCountHolder is not null, its first element is reset to 0 when the first
	 * separator is encountered.
	 * NOTE(review): in this copy of the file the remainder of this method and the head of the
	 * following reaction parsing method are missing (text lost between "index+1" and "')"),
	 * so the return value and the catalyst counting cannot be documented from here.
	 * The visible tail locates two separators, throws an Exception if fewer or more than two
	 * are found, and returns a Reaction object.
	 * @param smiles bytes of the potential reaction SMILES
	 * @param catalystCountHolder null or an array whose first element receives a count
	 */
	public static boolean isReactionSmiles(byte[] smiles, int[] catalystCountHolder) {
		int count = 0;
		int index = -1;

		while (count < 3) {
			index = ArrayUtils.indexOf(smiles, (byte)'>', index + 1);
			// skip any '>' that belongs to a "->" symbol
			while (index>0 && smiles[index - 1] == (byte)'-')
				index = ArrayUtils.indexOf(smiles, (byte)'>', index + 1);

			if (index == -1)
				break;

			count++;

			if (catalystCountHolder != null && count == 1) {
				catalystCountHolder[0] = 0;
				// NOTE(review): source text is truncated within the following line
				if (index+1');
		while (index1 > 0 && smiles[index1-1] == (byte)'-')
			index1 = ArrayUtils.indexOf(smiles, (byte)'>', index1+1);

		int index2 = (index1 == -1) ? -1 : ArrayUtils.indexOf(smiles, (byte)'>', index1+1);
		while (index2 > 0 && smiles[index2-1] == (byte)'-')
			index2 = ArrayUtils.indexOf(smiles, (byte)'>', index2+1);

		if (index2 == -1)
			throw new Exception("Missing one or both separators ('>').");
		if (ArrayUtils.indexOf(smiles, (byte)'>', index2+1) != -1)
			throw new Exception("Found more than 2 separators ('>').");

		Reaction rxn = new Reaction();

		int part = 0;
		int index = 0;
		int closingGroupBracketIndex = -1;
		while (index < smiles.length) {
			// NOTE(review): source text is truncated within the following line
			while (index') {
				index++;
				part++;
				}
			}

		return rxn;
		}

	/**
	 * Returns the SMARTS features, which could not be interpreted in the most recently
	 * parsed SMILES/SMARTS pattern. Warnings are presumably only collected if
	 * MODE_CREATE_SMARTS_WARNING was passed to the constructor.
	 * Since parse() empties (but does not discard) the warning buffer before every run,
	 * an existing but empty buffer is treated like a missing one. Otherwise a parser that
	 * has warned once would report a bare header for every later, warning-free pattern.
	 * @return empty string if there are no warnings; otherwise "Unresolved SMARTS features:"
	 *         followed by the collected warning text
	 */
	public String getSmartsWarning() {
		if (mSmartsWarningBuffer == null || mSmartsWarningBuffer.length() == 0)
			return "";

		return "Unresolved SMARTS features:"+mSmartsWarningBuffer;
		}

	/**
	 * Parses the given smiles into the molecule, creates proper atom coordinates
	 * to reflect correct double bond geometries and translates tetrahedral and allene
	 * parities into up/down-bonds. Whether SMARTS features are accepted depends on the
	 * SMARTS mode that was passed to the constructor of this parser.
	 * The string is converted into UTF-8 bytes; coordinate creation and reading of
	 * stereo features are both switched on.
	 * @param mol molecule to receive the parsed structure
	 * @param smiles SMILES (or SMARTS) string; must not be null
	 * @throws Exception if the string cannot be parsed
	 */
	public void parse(StereoMolecule mol, String smiles) throws Exception {
		byte[] smilesBytes = smiles.getBytes(StandardCharsets.UTF_8);
		final boolean createCoordinates = true;
		final boolean readStereoFeatures = true;
		parse(mol, smilesBytes, createCoordinates, readStereoFeatures);
		}

	/**
	 * Parses the complete byte array into the molecule with coordinate creation
	 * and reading of stereo features both switched on.
	 * @param mol molecule to receive the parsed structure
	 * @param smiles bytes of the SMILES (or SMARTS) to be parsed
	 * @throws Exception if the input cannot be parsed
	 */
	public void parse(StereoMolecule mol, byte[] smiles) throws Exception {
		final boolean createCoordinates = true;
		final boolean readStereoFeatures = true;
		parse(mol, smiles, createCoordinates, readStereoFeatures);
		}

	/**
	 * Parses the section of the byte array from position up to (not including) endIndex
	 * into the molecule with coordinate creation and reading of stereo features both switched on.
	 * @param mol molecule to receive the parsed structure
	 * @param smiles byte array containing the SMILES (or SMARTS)
	 * @param position index of the first byte to be parsed
	 * @param endIndex index after the last byte to be parsed
	 * @throws Exception if the input cannot be parsed
	 */
	public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex) throws Exception {
		final boolean createCoordinates = true;
		final boolean readStereoFeatures = true;
		parse(mol, smiles, position, endIndex, createCoordinates, readStereoFeatures);
		}

	/**
	 * Parses the complete byte array, i.e. the range from index 0 to smiles.length, into the molecule.
	 * @param mol molecule to receive the parsed structure
	 * @param smiles bytes of the SMILES (or SMARTS) to be parsed; must not be null
	 * @param createCoordinates whether atom coordinates shall be generated after parsing
	 * @param readStereoFeatures whether stereo information of the input shall be interpreted
	 * @throws Exception if the input cannot be parsed
	 */
	public void parse(StereoMolecule mol, byte[] smiles, boolean createCoordinates, boolean readStereoFeatures) throws Exception {
		final int firstIndex = 0;
		final int endIndex = smiles.length;
		parse(mol, smiles, firstIndex, endIndex, createCoordinates, readStereoFeatures);
		}

	/**
	 * Main parsing routine: Clears the given molecule and fills it from the bytes
	 * smiles[position] ... smiles[endIndex-1]. Leading bytes with values <= 32 are skipped,
	 * and parsing stops at the first such byte found later on.
	 * The main loop distinguishes atom symbols (with or without square brackets, including
	 * SMARTS atom query features), recursive SMARTS ('$'), the fragment separator '.',
	 * bond symbols and digits (atom mass within brackets, ring closures outside).
	 * After the loop explicit hydrogens are removed if SMARTS features were found or the
	 * parser is in SMARTS mode (unless hydrogens shall be explicit), aromatic double bonds
	 * are located, stereo features are assigned if requested, coordinates are invented
	 * if requested, and for SMARTS the fragment flag is set.
	 * NOTE(review): in this copy of the file three lines of this method are truncated
	 * (marked below); declarations of several local variables used here (e.g. squareBracketOpen,
	 * bondType, bondQueryFeatures, bracketLevel, atomMass, isDoubleDigit) are not visible.
	 * @param mol molecule to receive the parsed structure; its previous content is cleared
	 * @param smiles byte array containing the SMILES (or SMARTS)
	 * @param position index of the first byte to be parsed
	 * @param endIndex index after the last byte to be parsed
	 * @param createCoordinates if true, a CoordinateInventor creates new atom coordinates
	 * @param readStereoFeatures if true, '/' and '\' bond symbols and atom parities are interpreted
	 * @throws Exception if unexpected characters or unsupported constructs are encountered
	 */
	public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex, boolean createCoordinates, boolean readStereoFeatures) throws Exception {
		mMol = mol;
		mMol.clear();

		// keep the buffer object, but drop warnings of a previous run
		if (mSmartsWarningBuffer != null)
			mSmartsWarningBuffer.setLength(0);

		mAromaticAtoms = 0;
		mSmartsFeatureFound = false;
		boolean allowSmarts = (mSmartsMode != SMARTS_MODE_IS_SMILES);

		TreeMap parityMap = null;

		int[] baseAtom = new int[BRACKET_LEVELS];
		baseAtom[0] = -1;

		int[] ringClosureAtom = new int[INITIAL_CONNECTIONS];
		int[] ringClosurePosition = new int[INITIAL_CONNECTIONS];
		int[] ringClosureBondType = new int[INITIAL_CONNECTIONS];
		int[] ringClosureBondQueryFeatures = new int[INITIAL_CONNECTIONS];
		// NOTE(review): source text is truncated within the following line
		for (int i = 0; i atomList = new SortedList<>();
		SmilesRange range = new SmilesRange(smiles);
		AtomInfo atomInfo = new AtomInfo();
		ArrayList recursiveGroupList = new ArrayList<>();
		int[] skipCount = new int[1];

		// skip leading whitespace and control characters
		// NOTE(review): position is not checked against endIndex or smiles.length here; an empty or all-whitespace input looks like it runs past the array end - confirm
		while (smiles[position] <= 32)
			position++;

		while (position < endIndex) {
			char theChar = (char)smiles[position++];

			// if there is an atom symbol,
			if (Character.isLetter(theChar)
			 || theChar == '*'
			 || theChar == '?'
			 || (theChar == '!' && allowSmarts && squareBracketOpen)
			 || (theChar == '#' && allowSmarts && squareBracketOpen)) {
				int atomicNo = -1;
				int charge = 0;
				int mapNo = 0;
				int abnormalValence = -1;
				int explicitHydrogens = HYDROGEN_ANY;
				boolean parityFound = false;
				boolean isClockwise = false;
				long atomQueryFeatures = 0;      // translated from obvious SMARTS features
				if (squareBracketOpen) {
					if (theChar == '*') {
						atomicNo = 6;
						atomQueryFeatures |= Molecule.cAtomQFAny;
						}
					else if (theChar == '?') {
						atomicNo = 0;
						}
					else {
						boolean isNotList = (theChar == '!');
						if (isNotList) {
							mSmartsFeatureFound = true;
							atomQueryFeatures |= Molecule.cAtomQFAny;
							position++;
							}

						// Handle this before checking for atom symbols, because R (ring count) takes precedence to R1 - R16 (substituent pseudo label)
						if (smiles[position-1] == 'R' && allowSmarts && (Character.isDigit(smiles[position]) || (mAllowCactvs && smiles[position] == '{'))) {
							atomicNo = 6;
							atomQueryFeatures |= Molecule.cAtomQFAny;
							// step back, such that the 'R' (and a preceding '!') is handled by the attribute loop below
							position--;
							if (isNotList)
								position--;
							}
						else {
							if (!parseAtomInBrackets(smiles, position-1, endIndex, atomInfo))
								throw new Exception("SmilesParser: Unexpected character in atom definition:'"+((char)smiles[position-1])+"' position:"+(position-1));

							atomicNo =  atomInfo.atomicNo;
							position += atomInfo.labelLength - 1;
							if (mSmartsMode != SMARTS_MODE_IS_SMARTS)
								explicitHydrogens = HYDROGEN_IMPLICIT_ZERO;  // in case we have SMILES; neglected, if we process a SMARTS, which we may learn later when hitting a query feature

							// If we have a comma after the first atom label, then we need to parse a (positive) atom list.
							// In this case we also have to set aromaticity query features from upper and lower case symbols.
							if (allowSmarts && (smiles[position] == ',' || isNotList)) {
								boolean mayBeAromatic = atomInfo.mayBeAromatic;
								boolean mayBeAliphatic = atomInfo.mayBeAliphatic;
								// restart at the first label, such that it is added to the atom list as well
								int start = position - atomInfo.labelLength;
								while (start < endIndex) {
									if (!parseAtomInBrackets(smiles, start, endIndex, atomInfo)) {
										if (!isNotList)
											throw new Exception("SmilesParser: Unexpected character in atom list:'"+((char)smiles[start])+"'. Position:"+start);
										// a not-list may be followed by ';' and another atom condition, while a positive list must not end with ','
										break;
										}

									if (atomInfo.atomicNo == 1) {
										if (!isNotList) // in not-lists we are allowed to remove hydrogens!
											throw new Exception("SmilesParser: Hydrogen is not supported in positive atom lists:'"+new String(Arrays.copyOfRange(smiles, start, endIndex))+"'. Position:"+start);
										}
									else {
										atomList.add(atomInfo.atomicNo);
										mayBeAromatic |= atomInfo.mayBeAromatic;
										mayBeAliphatic |= atomInfo.mayBeAliphatic;
										}
									start += atomInfo.labelLength;
									if (smiles[start] != (isNotList ? ';' : ','))   // positive list: ',' e.g. "N,O"; negative lists: ';' e.g. "!#7;!#8"
										break;
									if (isNotList && smiles[start+1] != '!')
										break;
									start++;
									if (smiles[start] == '!')
										start++;
									}

								if (atomList.size() > 1) {
									explicitHydrogens = HYDROGEN_ANY;   // don't use implicit zero with atom lists
									if (!mayBeAliphatic)
										atomQueryFeatures |= Molecule.cAtomQFAromatic;
									else if (!mayBeAromatic)
										atomQueryFeatures |= Molecule.cAtomQFNotAromatic;
									}

								position = start;
								}
							}
						}

					// read all remaining atom attributes within the brackets until the closing ']' is found
					while (squareBracketOpen) {
						if (smiles[position] == '@') {
							position++;
							if (smiles[position] == '@') {
								isClockwise = true;
								position++;
								}
							parityFound = true;
							continue;
							}

						if (smiles[position] == ':') {
							position++;
							while (Character.isDigit(smiles[position])) {
								mapNo = 10 * mapNo + smiles[position] - '0';
								position++;
								}
							continue;
							}

						if (smiles[position] == '[')
							throw new Exception("SmilesParser: nested square brackets found. Position:"+position);

						if (smiles[position] == ']') {
							position++;
							squareBracketOpen = false;
							continue;
							}

						// skipCount[0] receives the number of characters used by the charge; 0 means no charge found here
						charge = parseCharge(smiles, position, skipCount);
						if (skipCount[0] != 0) {
							position += skipCount[0];

							// explicit charge=0 is usually meant as query feature
							if (charge == 0)
								atomQueryFeatures |= Molecule.cAtomQFNotChargeNeg | Molecule.cAtomQFNotChargePos;
							continue;
							}

						boolean isNot = (smiles[position] == '!');
						if (isNot)
							position++;

						if (smiles[position] == 'H') {
							position++;
							position += range.parse(position, 1, 1);
							// collect one flag for every hydrogen count that lies within the parsed range
							long flags = 0;
							if (range.min <= 0 && range.max >= 0)
								flags |= Molecule.cAtomQFNot0Hydrogen;
							if (range.min <= 1 && range.max >= 1)
								flags |= Molecule.cAtomQFNot1Hydrogen;
							if (range.min <= 2 && range.max >= 2)
								flags |= Molecule.cAtomQFNot2Hydrogen;
							if (range.min <= 3 && range.max >= 3)
								flags |= Molecule.cAtomQFNot3Hydrogen;

							if (isNot) {
								atomQueryFeatures |= flags;
								explicitHydrogens = HYDROGEN_ANY;
								}
							else {
								if (range.isSingle()) {
									explicitHydrogens = range.min;
									}
								else {
									atomQueryFeatures |= (Molecule.cAtomQFHydrogen & ~flags);
									explicitHydrogens = HYDROGEN_ANY;
									}
								}
							continue;
							}

						if (smiles[position] == 'D') {   // non-H-neighbours
							position++;
							position += range.parse(position, 1, 1);
							long flags = 0;
							if (range.min <= 0 && range.max >= 0)
								flags |= Molecule.cAtomQFNot0Neighbours;
							if (range.min <= 1 && range.max >= 1)
								flags |= Molecule.cAtomQFNot1Neighbour;
							if (range.min <= 2 && range.max >= 2)
								flags |= Molecule.cAtomQFNot2Neighbours;
							if (range.min <= 3 && range.max >= 3)
								flags |= Molecule.cAtomQFNot3Neighbours;
							if (range.min <= 4 && range.max >= 4)
								flags |= Molecule.cAtomQFNot4Neighbours;

							if (flags != 0) {
								if (isNot)
									atomQueryFeatures |= flags;
								else if ((atomQueryFeatures & Molecule.cAtomQFNeighbours) != 0)
									atomQueryFeatures &= ~flags;
								else {
									flags = flags ^ Molecule.cAtomQFNeighbours;
									atomQueryFeatures |= flags;
									}
								}
							continue;
							}

						if (smiles[position] == 'z' && mAllowCactvs) {   // electro-negative neighbour count (CACTVS extension)
							position++;
							position += range.parse(position, 1, 4);
							long flags = 0;
							if (range.min <= 0 && range.max >= 0)
								flags |= Molecule.cAtomQFNot0ENeighbours;
							if (range.min <= 1 && range.max >= 1)
								flags |= Molecule.cAtomQFNot1ENeighbour;
							if (range.min <= 2 && range.max >= 2)
								flags |= Molecule.cAtomQFNot2ENeighbours;
							if (range.min <= 3 && range.max >= 3)
								flags |= Molecule.cAtomQFNot3ENeighbours;
							if (range.min <= 4 && range.max >= 4)
								flags |= Molecule.cAtomQFNot4ENeighbours;

							if (flags != 0) {
								if (isNot)
									atomQueryFeatures |= flags;
								else if ((atomQueryFeatures & Molecule.cAtomQFENeighbours) != 0)
									atomQueryFeatures &= ~flags;
								else {
									flags = flags ^ Molecule.cAtomQFENeighbours;
									atomQueryFeatures |= flags;
									}
								}
							continue;
							}

						if (smiles[position] == 'X') {   // neighbour count including implicit hydrogens
							position++;
							position += range.parse(position, 1, 1);
							byte[] valences = Molecule.cAtomValence[atomicNo];
							if (valences == null)
								continue;

							int valence = valences[0];

							// if we have a locally defined charge, we update the valance properly
							int localCharge = parseCharge(smiles, position, skipCount);
							if (skipCount[0] != 0) {
								if (Molecule.isAtomicNoElectronegative(atomicNo))
									valence += localCharge;
								else if (atomicNo == 6)
									valence -= Math.abs(localCharge);
								else
									valence -= localCharge;
								}

							long flags = 0;
							// we convert into pi-electron count using standard valence
							if (valence-range.min <= 0 && valence-range.max >= 0)
								flags |= Molecule.cAtomQFNot0PiElectrons;
							if (valence-range.min <= 1 && valence-range.max >= 1)
								flags |= Molecule.cAtomQFNot1PiElectron;
							if (valence-range.min <= 2 && valence-range.max >= 2)
								flags |= Molecule.cAtomQFNot2PiElectrons;

							if (flags != 0) {
								if (isNot)
									atomQueryFeatures |= flags;
								else if ((atomQueryFeatures & Molecule.cAtomQFPiElectrons) != 0)
									atomQueryFeatures &= ~flags;
								else {
									flags = flags ^ Molecule.cAtomQFPiElectrons;
									atomQueryFeatures |= flags;
									}
								}
							continue;
							}

						if (smiles[position] == 'A' || smiles[position] == 'a') {
							position++;
							// NOTE(review): position was already advanced, so the comparison below reads the character after the 'A'/'a' - confirm whether smiles[position-1] was intended
							atomQueryFeatures |= (isNot ^ smiles[position] == 'A') ? Molecule.cAtomQFNotAromatic : Molecule.cAtomQFAromatic;
							continue;
							}

						if (smiles[position] == 'R') {
							position++;
							position += range.parse(position, 1, 3);
							long flags = 0;
							if (range.min <= 0 && range.max >= 0)
								flags |= Molecule.cAtomQFNotChain;
							if (range.min <= 1 && range.max >= 1)
								flags |= Molecule.cAtomQFNot2RingBonds;
							if (range.min <= 2 && range.max >= 2)
								flags |= Molecule.cAtomQFNot3RingBonds;
							if (range.min <= 3 && range.max >= 3)
								flags |= Molecule.cAtomQFNot4RingBonds;
							if (range.max > 3)
								smartsWarning((isNot?"!R":"R")+range.max);

							if (flags != 0) {
								if (isNot)
									atomQueryFeatures |= flags;
								else if ((atomQueryFeatures & Molecule.cAtomQFRingState) != 0)
									atomQueryFeatures &= ~flags;
								else {
									flags = flags ^ Molecule.cAtomQFRingState;
									atomQueryFeatures |= flags;
									}
								}
							continue;
							}

						if (smiles[position] == 'r') {
							position++;
							position += range.parse(position, 1, 1);
							if (range.isDefault) {
								// NOTE(review): a bond constant (cBondQFRingState) is used in an atom feature mask here, while the 'R' branch above uses cAtomQFRingState - confirm
								if (isNot)
									atomQueryFeatures |= Molecule.cBondQFRingState & ~Molecule.cAtomQFNotChain;
								else
									atomQueryFeatures |= Molecule.cAtomQFNotChain;
								continue;
								}

							int ringSize = range.min;

							if (range.isRange())
								smartsWarning((isNot ? "!r" : "r") + range.toString());

							if (!isNot && ringSize >= 3 && ringSize <= 7)
								atomQueryFeatures |= (ringSize << Molecule.cAtomQFSmallRingSizeShift);
							else if (!range.isRange())
								smartsWarning((isNot ? "!r" : "r") + ringSize);
							continue;
							}

						if (smiles[position] == 'v') {
							position++;
							position += range.parse(position, 1, 1);

							int valence = range.min;

							if (range.isRange())
								smartsWarning((isNot ? "!v" : "v") + range.toString());

							if (!isNot && valence <= 14)
								abnormalValence = valence;
							else if (!range.isRange())
								smartsWarning((isNot ? "!v" : "v") + valence);
							continue;
							}

						if (smiles[position] == '$') {  // recursive SMARTS
//							if (!isNot)
//								throw new Exception("SmilesParser: non-negated recursive SMARTS relating to preceding atom are not supported yet. Position:"+position);

							position += parseRecursiveGroup(smiles, position, recursiveGroupList);
							continue;
							}

						if (allowSmarts && (smiles[position] == ';' || smiles[position] == '&')) { // we interpret high and low precendence AND the same way
							mSmartsFeatureFound = true;
							position++;
							continue;
							}

						if (allowSmarts && (smiles[position] == ',' && isRepeatedAllowedORFeature(smiles, position, skipCount))) {    // we allow OR-logic for some query options if they have the same type
							mSmartsFeatureFound = true;
							position += skipCount[0] + 1;
							continue;
							}

						throw new Exception("SmilesParser: unexpected character inside brackets: '"+(char)smiles[position]+"', position:"+position);
						}
					}
				else if (theChar == '*') {
					atomicNo = 6;
					atomQueryFeatures |= Molecule.cAtomQFAny;
					}
				else if (theChar == '?') {
					atomicNo = 0;
					}
				else if ((theChar == 'A' || theChar == 'a') && allowSmarts) {
					atomicNo = 6;
					atomQueryFeatures |= Molecule.cAtomQFAny;
					atomQueryFeatures |= theChar == 'A' ? Molecule.cAtomQFNotAromatic : Molecule.cAtomQFAromatic;
					mSmartsFeatureFound = true;
					}
				else {
					// atom symbols that are recognized outside of square brackets; case is ignored for the first letter
					switch (Character.toUpperCase(theChar)) {
					case 'B':
						if (position < endIndex && smiles[position] == 'r') {
							atomicNo = 35;
							position++;
							}
						else
							atomicNo = 5;
						break;
					case 'C':
						if (position < endIndex && smiles[position] == 'l') {
							atomicNo = 17;
							position++;
							}
						else
							atomicNo = 6;
						break;
					case 'F':
						atomicNo = 9;
						break;
					case 'I':
						atomicNo = 53;
						break;
					case 'N':
						atomicNo = 7;
						break;
					case 'O':
						atomicNo = 8;
						break;
					case 'P':
						atomicNo = 15;
						break;
					case 'S':
						atomicNo = 16;
						break;
						}
					}

				///////////////////////////////////////////////////////////////////////////////
				// At this position the atom is determined and the square bracket is closed! //
				///////////////////////////////////////////////////////////////////////////////

				if (atomicNo == -1 && theChar != '?')
					throw new Exception("SmilesParser: unknown element label found. Position:"+(position-1));

				int atom = mMol.addAtom(atomicNo);	// this may be a hydrogen, if defined as [H]
				mMol.setAtomCharge(atom, charge);
				mMol.setAtomMapNo(atom, mapNo, false);
				mMol.setAtomAbnormalValence(atom, abnormalValence);
				if (atomQueryFeatures != 0) {
					mSmartsFeatureFound = true;
					// aromaticity is not stored as query feature here, but as atom marker
					if ((atomQueryFeatures & Molecule.cAtomQFAromatic) != 0) {
						atomQueryFeatures &= ~Molecule.cAtomQFAromatic;
						mMol.setAtomMarker(atom, true);
						mAromaticAtoms++;
						}
					else {
						mMol.setAtomMarker(atom, false);
						}
					mMol.setAtomQueryFeature(atom, atomQueryFeatures, true);
					}
				if (atomList.size() != 0) {
					mSmartsFeatureFound = true;
					int[] list = new int[atomList.size()];
					// NOTE(review): source text is truncated within the following line
					for (int i=0; i();
	
						// using position as hydrogenPosition is close enough
						int hydrogenCount = (explicitHydrogens == HYDROGEN_IMPLICIT_ZERO) ? 0 : explicitHydrogens;
						parityMap.put(atom, new THParity(atom, position - 2, fromAtom, hydrogenCount, position - 1, isClockwise));
						}
					}

				continue;
				}

			if (theChar == '$') {  // recursive SMARTS
				if (!recursiveGroupList.isEmpty())
					throw new Exception("SmilesParser: multiple recursive SMARTS without preceding atom are not supported yet. Position:"+(position-1));

				baseAtom[bracketLevel] = mol.getAllAtoms();

				position += parseRecursiveGroup(smiles, position-1, recursiveGroupList);
				mol.addMolecule(recursiveGroupList.get(0));
				recursiveGroupList.clear();

				continue;
				}

			if (theChar == '.') {
				// a dot starts a new unconnected part: no base atom and no bond to the next atom
				baseAtom[bracketLevel] = -1;
				bondType = Molecule.cBondTypeDeleted;
				continue;
				}

			if (isBondSymbol(theChar)) {
				if (squareBracketOpen)
					throw new Exception("SmilesParser: unexpected bond symbol inside square brackets: '"+theChar+"', position:"+(position-1));

				int excludedBonds = 0;
				while (isBondSymbol(theChar)) {
					if (theChar == '!') {
						theChar = (char)smiles[position++];
						if (theChar == '@')
							bondQueryFeatures |= Molecule.cBondQFNotRing;
						else if ((theChar == '-' && smiles[position] == '>')
						 || (theChar == '<' && smiles[position] == '-')) {
							excludedBonds |= Molecule.cBondTypeMetalLigand;
							position++;
							}
						else if (theChar == '-')
							excludedBonds |= Molecule.cBondQFSingle;
						else if (theChar == '=')
							excludedBonds |= Molecule.cBondQFDouble;
						else if (theChar == '#')
							excludedBonds |= Molecule.cBondQFTriple;
						else if (theChar == '$')
							excludedBonds |= Molecule.cBondQFQuadruple;
						else if (theChar == ':')
							excludedBonds |= Molecule.cBondQFDelocalized;
						else
							throw new Exception("SmilesParser: bond symbol '"+theChar+"' not allowed after '!'. Position:"+(position-1));
						}
					else {
						if (theChar == '@')
							bondQueryFeatures |= Molecule.cBondQFRing;
						else if (theChar == '=')
							bondType = Molecule.cBondTypeDouble;
						else if (theChar == '#')
							bondType = Molecule.cBondTypeTriple;
						else if (theChar == '$')
							bondType = Molecule.cBondTypeQuadruple;
						else if (theChar == ':')
							bondType = Molecule.cBondTypeDelocalized;
						else if (theChar == '~')
							bondQueryFeatures |= Molecule.cBondQFSingle | Molecule.cBondQFDouble | Molecule.cBondQFTriple | Molecule.cBondQFDelocalized;
						else if (theChar == '/') {
							if (readStereoFeatures)
								bondType = Molecule.cBondTypeUp;    // encode slash temporarily in bondType
							}
						else if (theChar == '\\') {
							if (readStereoFeatures)
								bondType = Molecule.cBondTypeDown;  // encode slash temporarily in bondType
							}

						// Smiles extention 'dative bond'
						else if ((theChar == '-' && smiles[position] == '>')
						 || (theChar == '<' && smiles[position] == '-')) {
								bondType = Molecule.cBondTypeMetalLigand;
								position++;
							}

						// a comma separated list of bond symbols is translated into bond query features
						if (smiles[position] == ',') {
							bondQueryFeatures |= bondSymbolToQueryFeature(bondType == Molecule.cBondTypeMetalLigand ? '>' : theChar);
							while (smiles[position] == ',') {
								if ((smiles[position+1] == '<' && smiles[position+2] == '-')
								 || (smiles[position+1] == '-' && smiles[position+2] == '>')) {
									bondQueryFeatures |= bondSymbolToQueryFeature('>');
									position += 3;
									}
								else {
									bondQueryFeatures |= bondSymbolToQueryFeature((char)smiles[position+1]);
									position += 2;
									}
								}
							}
						}

					// a ';' chains another bond condition
					if (smiles[position] == ';') {
						position++;
						theChar = (char)smiles[position++];
						continue;
						}

					if (excludedBonds != 0)
						bondQueryFeatures |= Molecule.cBondQFBondTypes & ~excludedBonds;

					break;
					}

				continue;
				}

			if (theChar <= ' ') {	// we stop reading at whitespace
				position = endIndex;
				continue;
				}

			if (Character.isDigit(theChar)) {
				int number = theChar - '0';
				if (squareBracketOpen) {
					// digits at the start of a bracket expression define the atom mass
					while (position < endIndex
					 && Character.isDigit(smiles[position])) {
						number = 10 * number + smiles[position] - '0';
						position++;
						}
					atomMass = number;
					}
				else {
					// outside of brackets digits are ring closure numbers; check for a bond symbol directly in front of them
					int bondTypePosition = isDoubleDigit ? position - 3 : position - 2;
					boolean hasBondType = (smiles[bondTypePosition] == '-'
										|| smiles[bondTypePosition] == '/'
										|| smiles[bondTypePosition] == '\\'
										|| smiles[bondTypePosition] == '='
										|| smiles[bondTypePosition] == '#'
										|| smiles[bondTypePosition] == '$'
										|| smiles[bondTypePosition] == ':'
										|| smiles[bondTypePosition] == '>'
										|| smiles[bondTypePosition] == '~');
					if (isDoubleDigit
					 && position < endIndex
					 && Character.isDigit(smiles[position])) {
						number = 10 * number + smiles[position] - '0';
						isDoubleDigit = false;
						position++;
						}
					// grow the ring closure arrays in steps of INITIAL_CONNECTIONS up to MAX_CONNECTIONS
					if (number >= ringClosureAtom.length) {
						if (number >=MAX_CONNECTIONS)
							throw new Exception("SmilesParser: ringClosureAtom number out of range: "+number);

						int oldSize = ringClosureAtom.length;
						int newSize = ringClosureAtom.length;
						while (newSize <= number)
							newSize = Math.min(MAX_CONNECTIONS, newSize + INITIAL_CONNECTIONS);

						ringClosureAtom = Arrays.copyOf(ringClosureAtom, newSize);
						ringClosurePosition = Arrays.copyOf(ringClosurePosition, newSize);
						ringClosureBondType = Arrays.copyOf(ringClosureBondType, newSize);
						ringClosureBondQueryFeatures = Arrays.copyOf(ringClosureBondQueryFeatures, newSize);
						// NOTE(review): source text is truncated within the following line; the end of the main loop is not visible
						for (int i=oldSize; i= 1)
					mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot0Hydrogen, true);
				if (explicitHydrogen >= 2)
					mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot1Hydrogen, true);
				if (explicitHydrogen >= 3)
					mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot2Hydrogen, true);
				if (explicitHydrogen >= 4)
					mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot3Hydrogen, true);
				}
			}

		if (!mMakeHydrogenExplicit && (mSmartsFeatureFound || mSmartsMode == SMARTS_MODE_IS_SMARTS))
			mMol.removeExplicitHydrogens();

		mMol.ensureHelperArrays(Molecule.cHelperNeighbours);

		correctValenceExceededNitrogen();	// convert pyridine oxides and nitro into polar structures with valid nitrogen valences

		locateAromaticDoubleBonds(allowSmarts, mSmartsFeatureFound);

		mMol.removeAtomCustomLabels();
		mMol.setHydrogenProtection(false);

		if (readStereoFeatures) {
			assignKnownEZBondParities();

			if (parityMap != null) {
				for (THParity parity:parityMap.values())
					mMol.setAtomParity(parity.mCentralAtom, parity.calculateParity(handleHydrogenAtomMap), false);

				mMol.setParitiesValid(0);
				}
			}

		// defines unknown EZ parities as such, i.e. prevent coordinate generation to create implicit EZ-parities
		mMol.setParitiesValid(0);

		if (createCoordinates) {
			CoordinateInventor inventor = new CoordinateInventor(mCoordinateMode);
			// a seed of 0 means: no reproducible coordinates requested
			if (mRandomSeed != 0)
				inventor.setRandomSeed(mRandomSeed);
			inventor.invent(mMol);

			if (readStereoFeatures)
				mMol.setUnknownParitiesToExplicitlyUnknown();
			}

		if (mSmartsFeatureFound || mSmartsMode == SMARTS_MODE_IS_SMARTS) {
			mMol.setFragment(true);
			mMol.validateAtomQueryFeatures();
			mMol.validateBondQueryFeatures();
			}
		}

	/**
	 * Reports the SMARTS flag kept in mSmartsFeatureFound. This method only reads the flag;
	 * it does not parse or modify anything.
	 * @return true if the previously parsed SMILES contained a SMARTS feature and was not parsed with SMARTS_MODE_IS_SMILES
	 */
	public boolean isSmarts() {
		return mSmartsFeatureFound;
	}

	/**
	 * Extracts a charge definition that starts at the given position. Supported encodings are
	 * a run of equal symbols ('+', '++', '---', ...) or a single symbol followed by one digit
	 * ('+2', '-1', '+0'). Reads never go beyond the end of the array, i.e. a charge symbol
	 * at the very end of the input is evaluated instead of causing an index exception.
	 * @param smiles bytes of the SMILES being parsed
	 * @param position position of potential first charge symbol '+' or '-'
	 * @param characterCount receives number of characters needed for charge encoding
	 * @return extracted charge; 0: no charge defined or explicit charge=0 - distinguish by characterCount
	 */
	private int parseCharge(byte[] smiles, int position, int[] characterCount) {
		characterCount[0] = 0;
		if (position >= smiles.length)
			return 0;

		byte symbol = smiles[position];
		if (symbol != '+' && symbol != '-')
			return 0;

		int charge = 1;
		characterCount[0] = 1;

		// every repetition of the same symbol adds one unit of charge
		while (position + characterCount[0] < smiles.length
			&& smiles[position + characterCount[0]] == symbol) {
			charge++;
			characterCount[0]++;
			}

		// a single symbol may be followed by one digit that defines the absolute charge
		if (charge == 1
		 && position + 1 < smiles.length
		 && Character.isDigit(smiles[position+1])) {
			charge = smiles[position+1] - '0';
			characterCount[0]++;
			}

		return symbol == '+' ? charge : -charge;
		}

	/**
	 * Checks the given character against the fixed set of characters that this
	 * method accepts as bond symbols: - = # $ : / \ &lt; ~ ! @
	 * @param theChar character to test
	 * @return true if theChar is one of the listed characters
	 */
	private boolean isBondSymbol(char theChar) {
		switch (theChar) {
		case '-':
		case '=':
		case '#':
		case '$':
		case ':':
		case '/':
		case '\\':
		case '<':
		case '~':
		case '!':
		case '@':
			return true;
		default:
			return false;
			}
		}

	/**
	 * Decides whether a comma (OR-logic) between two subsequent features is acceptable.
	 * That is the case if
	 * - the feature in front of the comma is one of D,R,X,z followed by one digit and an optional '+' or '-', and
	 * - all letters of that feature (including directly preceding letters, e.g. an atom label as 'N' in NX3)
	 *   are repeated directly behind the comma, e.g. 'NX' in NX3,NX4+
	 * @param smiles
	 * @param commaPosition
	 * @param skipCount int[1] to hold the number of characters to skip for atom label (0 if there is no atom label);
	 *                  it is only written once the D,R,X,z feature was recognized
	 * @return true, if comma (OR-logic) is an allowed delimiter here
	 */
	private boolean isRepeatedAllowedORFeature(byte[] smiles, int commaPosition, int[] skipCount) {
		if (commaPosition < 3)
			return false;

		// walk backwards from the comma: optional charge sign, then the mandatory digit
		int cursor = commaPosition - 1;
		byte beforeComma = smiles[cursor];
		if (beforeComma == '+' || beforeComma == '-')
			cursor--;

		if (!Character.isDigit(smiles[cursor]))
			return false;

		// the digit must belong to one of the supported feature letters
		cursor--;
		switch (smiles[cursor]) {
		case 'D':
		case 'R':
		case 'X':
		case 'z':
			break;
		default:
			return false;
			}

		// letters directly in front of the feature letter are counted as label characters
		int labelLength = 0;
		for (; cursor > 0 && Character.isLetter(smiles[cursor-1]); cursor--)
			labelLength++;
		skipCount[0] = labelLength;

		// all letters in front of the comma must be repeated right behind it
		for (int behindComma = commaPosition + 1; Character.isLetter(smiles[cursor]); cursor++, behindComma++)
			if (behindComma >= smiles.length || smiles[cursor] != smiles[behindComma])
				return false;

		return true;
		}

	/**
	 * Tries to interpret the characters at position as an atom definition and writes the
	 * result into info. Three forms are recognized:
	 * - '#' followed by digits: atomic number; sets mSmartsFeatureFound
	 * - upper case letter, optionally followed by a lower case letter: label of an atom,
	 *   which is flagged as not possibly aromatic
	 * - lower case letter, optionally followed by a lower case letter: label of an atom,
	 *   which is flagged as not possibly aliphatic
	 * The optional second letter is only looked for in front of endIndex, such that a label
	 * at the very end of the parsed range does not cause a read beyond that range.
	 * @param smiles bytes of the SMILES being parsed
	 * @param position index of the first character to interpret
	 * @param endIndex first index that is not part of the parsed range anymore
	 * @param info receives atomicNo, labelLength, mayBeAromatic and mayBeAliphatic
	 * @return true if one of the three forms was found; false if info contains no atom
	 * @throws Exception if a '#' is not followed by a number within the range of known atom labels
	 */
	private boolean parseAtomInBrackets(byte[] smiles, int position, int endIndex, AtomInfo info) throws Exception {
		info.mayBeAromatic = true;
		info.mayBeAliphatic = true;

		if (smiles[position] == '#') {
			position++;
			mSmartsFeatureFound = true;
			info.atomicNo = 0;
			info.labelLength = 1;	// accounts for the '#'
			while (position < endIndex
			 && Character.isDigit(smiles[position])) {
				info.atomicNo = 10 * info.atomicNo + smiles[position] - '0';
				info.labelLength++;
				position++;
				}
			if (info.atomicNo == 0 || info.atomicNo >= Molecule.cAtomLabel.length)
				throw new Exception("SmilesParser: Atomic number out of range. position:"+(position-1));
			return true;
			}

		boolean isUpperCase = smiles[position] >= 'A' && smiles[position] <= 'Z';
		boolean isLowerCase = smiles[position] >= 'a' && smiles[position] <= 'z';
		if (!isUpperCase && !isLowerCase)
			return false;

		// a second label character must be lower case and must be located within the parsed range
		boolean hasSecondLetter = position+1 < endIndex
							   && smiles[position+1] >= 'a'
							   && smiles[position+1] <= 'z';
		info.labelLength = hasSecondLetter ? 2 : 1;
		info.atomicNo = Molecule.getAtomicNoFromLabel(new String(smiles, position, info.labelLength, StandardCharsets.UTF_8));

		// the case of the first letter decides, which of the two options is ruled out
		if (isUpperCase)
			info.mayBeAromatic = false;
		else
			info.mayBeAliphatic = false;

		return true;
		}

	/**
	 * Translates a bond symbol into the corresponding Molecule.cBondQF... constant.
	 * @param symbol one of '=', '#', '$', ':', '>', '~'; any other character is handled as single bond
	 * @return cBondQFDouble, cBondQFTriple, cBondQFQuadruple, cBondQFDelocalized, cBondQFMetalLigand,
	 *         cBondQFBondTypes (for '~'), or cBondQFSingle as fallback
	 */
	private int bondSymbolToQueryFeature(char symbol) {
		switch (symbol) {
		case '=':
			return Molecule.cBondQFDouble;
		case '#':
			return Molecule.cBondQFTriple;
		case '$':
			return Molecule.cBondQFQuadruple;
		case ':':
			return Molecule.cBondQFDelocalized;
		case '>':
			return Molecule.cBondQFMetalLigand;
		case '~':
			return Molecule.cBondQFBondTypes;
		default:
			return Molecule.cBondQFSingle;
			}
		}

	/**
	 * Appends a space and the given feature text to mSmartsWarningBuffer, provided that
	 * mCreateSmartsWarnings is set. The buffer is created on first use.
	 * @param feature text to be added to the warning buffer
	 */
	private void smartsWarning(String feature) {
		if (!mCreateSmartsWarnings)
			return;

		if (mSmartsWarningBuffer == null)
			mSmartsWarningBuffer = new StringBuilder();

		mSmartsWarningBuffer.append(" ").append(feature);
		}

	private int parseRecursiveGroup(byte[] smiles, int dollarIndex, ArrayList groupList) throws Exception {
		if (smiles[dollarIndex+1] != '(')
			throw new Exception("SmilesParser: '$' for recursive SMARTS must be followed by '('. position:"+dollarIndex);

		int openBrackets = 1;
		int endIndex = dollarIndex+2;
		while (endIndex < smiles.length && openBrackets > 0) {
			if (smiles[endIndex] == '(')
				openBrackets++;
			else if (smiles[endIndex] == ')')
				openBrackets--;
			endIndex++;
			}

		if (openBrackets > 0)
			throw new Exception("SmilesParser: Missing closing ')' for recursive SMARTS. '('-position:"+(dollarIndex+1));

		StereoMolecule group = new StereoMolecule(16, 16);
		new SmilesParser(mMode | mSmartsMode).parse(group, smiles, dollarIndex+2, endIndex-1);
		groupList.add(group);

		if (smiles[dollarIndex-1] == '!')
			for (int atom=0; atom=0) System.arraycopy(mIsAromaticBond, 0, isAromaticBond, 0, mMol.getBonds());

			// Some Smiles contain 'aromatic' rings with atoms not being compatible
			// with a PI-bond. These include: tertiary non-charged nitrogen, [nH],
			// sulfur, non-charged oxygen, charged carbon, etc...
			// All these atoms and attached bonds are marked as handled to avoid
			// attached bonds to be promoted (changed to double bond) later.
		for (int ring=0; ring=4; qualifyingNo--) {
			do {
				qualifyingBondFound = false;
				for (int bond=0; bond= 2)
			if (!connectConjugatedRadicalPairs(isAromaticBond))
				break;

		if (allowSmartsFeatures) {
			if (mAromaticAtoms != 0) {
				for (int atom=0; atom 0)
				return false;
			}

		int explicitHydrogens = (mMol.getAtomCustomLabel(atom) == null || mMol.getAtomCustomLabelBytes(atom)[0] == HYDROGEN_IMPLICIT_ZERO) ?
								0 : mMol.getAtomCustomLabelBytes(atom)[0];
		int freeValence = mMol.getFreeValence(atom) - explicitHydrogens;
		if (freeValence < 1)
			return false;

		if (mMol.getAtomicNo(atom) == 16
		 || mMol.getAtomicNo(atom) == 34
		 || mMol.getAtomicNo(atom) == 52) {
			if (mMol.getConnAtoms(atom) == 2 && mMol.getAtomCharge(atom) <= 0)
				return false;
			return freeValence != 2;	// e.g. -S(=O)- correction to account for tetravalent S,Se
			}

		return true;
		}


	private void promoteBond(int bond) {
		if (mMol.getBondType(bond) == Molecule.cBondTypeSingle)
			mMol.setBondType(bond, Molecule.cBondTypeDouble);

		for (int i=0; i<2; i++) {
			int bondAtom = mMol.getBondAtom(i, bond);
			if (mMol.isMarkedAtom(bondAtom)) {
				mMol.setAtomMarker(bondAtom, false);
				mAromaticAtoms--;
				}
			for (int j=0; j 3
			 && mMol.getAtomPi(atom) > 0) {
				for (int i=0; i 1)
					 && mMol.isElectronegative(connAtom)) {
						if (mMol.getBondType(connBond) == Molecule.cBondTypeTriple)
							mMol.setBondType(connBond, Molecule.cBondTypeDouble);
						else
							mMol.setBondType(connBond, Molecule.cBondTypeSingle);
	
						mMol.setAtomCharge(atom, mMol.getAtomCharge(atom) + 1);
						mMol.setAtomCharge(connAtom, mMol.getAtomCharge(connAtom) - 1);
						mMol.setAtomAbnormalValence(atom, -1);
						break;
						}
					}
				}
			}
		}

	private boolean assignKnownEZBondParities() {
		mMol.ensureHelperArrays(Molecule.cHelperRings);

		boolean paritiesFound = false;
		int[] refAtom = new int[2];
		int[] refBond = new int[2];
		int[] otherAtom = new int[2];
		for (int bond=0; bond mNeighbourList;

		/**
		 * Instantiates a new parity object during smiles traversal.
		 * If explicitHydrogen is neither 0 nor 1, then the object is just flagged as erroneous
		 * and nothing else is initialized. Otherwise, if there is a fromAtom, it is registered
		 * as first neighbour with position centralAtomPosition-1, followed by the explicit hydrogen
		 * (if there is one) as pseudo neighbour with position centralAtomPosition+1.
		 * @param centralAtom index of atom processed
		 * @param centralAtomPosition position in SMILES of central atom
		 * @param fromAtom index of parent atom of centralAtom (-1 if centralAtom is first atom in smiles)
		 * @param explicitHydrogen Daylight syntax: hydrogen atoms defined within square bracket of other atom
		 * @param hydrogenPosition not evaluated by this constructor
		 * @param isClockwise true if central atom is marked with @@ rather than @
		 */
		public THParity(int centralAtom, int centralAtomPosition, int fromAtom, int explicitHydrogen, int hydrogenPosition, boolean isClockwise) {
			boolean isSupportedHydrogenCount = (explicitHydrogen == 0 || explicitHydrogen == 1);
			if (!isSupportedHydrogenCount) {
				mError = true;
				return;
			}

			mCentralAtom = centralAtom;
			mCentralAtomPosition = centralAtomPosition;
			mIsClockwise = isClockwise;
			mNeighbourList = new ArrayList<>();

			// Without a fromAtom no neighbour is registered here, not even an explicit hydrogen.
			if (fromAtom == -1)
				return;

			addNeighbor(fromAtom, centralAtomPosition-1, false);

			if (explicitHydrogen == 1)
				addNeighbor(PSEUDO_ATOM_HYDROGEN, centralAtomPosition+1, false);
		}

		/**
		 * Registers a currently traversed neighbor or ring closure at this parity object,
		 * which belongs to the neighbor's parent atom.
		 * For a ring closure the position of the bond closure digit in the smiles
		 * rather than the neighbor's position is the one relevant for parity determination.
		 * The atom is stored together with the position, because neighbors are not
		 * necessarily added in atom sequence (ring closure with connection back to stereo center).
		 * Nothing happens if this object is already in error state. An attempt to add
		 * a fifth neighbour puts it into error state.
		 * @param atom
		 * @param position
		 * @param unused not evaluated
		 */
		public void addNeighbor(int atom, int position, boolean unused) {
			if (mError)
				return;

			boolean isComplete = (mNeighbourList.size() == 4);
			if (isComplete)
				mError = true;
			else
				mNeighbourList.add(new ParityNeighbour(atom, position));
		}

		public int calculateParity(int[] handleHydrogenAtomMap) {
			if (mError)
				return Molecule.cAtomParityUnknown;

			// We need to translate smiles-parse-time atom indexes to those that the molecule
			// uses after calling handleHydrogens, which is called from ensureHelperArrays().
			for (ParityNeighbour neighbour:mNeighbourList)
				if (neighbour.mAtom != PSEUDO_ATOM_HYDROGEN && neighbour.mAtom != PSEUDO_ATOM_LONE_PAIR)
					neighbour.mAtom = handleHydrogenAtomMap[neighbour.mAtom];

			if (mNeighbourList.size() == 3)
				// Within SMILES all hydrogen atoms at stereo centers must be explicit (as explicit atoms or as H count in square brackets).
				// Therefore, three neighbour atoms is a rare situation, e.g. CC[S@](=O)C or frozen out CC[N@H]C
				// In these cases we add the electron pair as pseudo neighbour
				mNeighbourList.add(new ParityNeighbour(PSEUDO_ATOM_LONE_PAIR, mCentralAtomPosition));
			else if (mNeighbourList.size() != 4)
				return Molecule.cAtomParityUnknown;

			/*
System.out.println();
System.out.println("central:"+mCentralAtom+(mIsClockwise?" @@":" @")+" from:"
				+((mFromAtom == -1)?"none":Integer.toString(mFromAtom))+" with "+mImplicitHydrogen+" hydrogens");
System.out.print("neighbors: "+mNeighborAtom[0]+"("+mNeighborPosition[0]+(mNeighborIsHydrogen[0]?",H":",non-H")+")");
for (int i=1; i mNeighbourList.get(i).mAtom)
						inversion = !inversion;
					if (mNeighbourList.get(j).mPosition > mNeighbourList.get(i).mPosition)
						inversion = !inversion;
				}
			}
			return inversion;
		}
	}

	/**
	 * Parses a list of SMILES with stereo features, creates an isomeric SMILES from every
	 * parsed molecule, and prints input, output and whether the output matches the expected
	 * SMILES to System.out. Exceptions are printed as stack trace and don't stop the loop.
	 */
	private static void testStereo() {
		// pairs of { input SMILES, expected SMILES }
		final String[][] data = { { "F/C=C/I", "F/C=C/I" },
								  { "F/C=C\\I", "F/C=C\\I" },
								  { "C(=C/I)/F", "F/C=C\\I" },
								  { "[H]C(/F)=C/I", "F/C=C\\I" },
								  { "C(=C\\1)/I.F1", "F/C=C/I" },
								  { "C(=C1)/I.F/1", "F/C=C/I" },
								  { "C(=C\\F)/1.I1", "F/C=C/I" },
								  { "C(=C\\F)1.I\\1", "F/C=C/I" },
								  { "C\\1=C/I.F1", "F/C=C/I" },
								  { "C1=C/I.F/1", "F/C=C/I" },
								  { "C(=C\\1)/2.F1.I2", "F/C=C/I" },
								  { "C/2=C\\1.F1.I2", "F/C=C/I" },
								  { "C/1=C/C=C/F.I1", "F/C=C/C=C\\I" },
								  { "C1=C/C=C/F.I\\1", "F/C=C/C=C\\I" },
								  { "C(/I)=C/C=C/1.F1", "F/C=C/C=C\\I" },
								  { "C(/I)=C/C=C1.F\\1", "F/C=C/C=C\\I" },

								  { "[C@](Cl)(F)(I)1.Br1", "F[C@](Cl)(Br)I" },
								  { "Br[C@](Cl)(I)1.F1", "F[C@](Cl)(Br)I" },
								  { "[C@H](F)(I)1.Br1", "F[C@H](Br)I" },
								  { "Br[C@@H](F)1.I1", "F[C@H](Br)I" },

								  { "C[S@@](CC)=O", "CC[S@](C)=O" },
								  { "[S@](=O)(C)CC", "CC[S](C)=O" } };

		// one molecule object is used for all test cases
		StereoMolecule mol = new StereoMolecule();
		for (int row=0; row<data.length; row++) {
			final String input = data[row][0];
			final String expected = data[row][1];
			try {
				new SmilesParser().parse(mol, input);
				String created = new IsomericSmilesCreator(mol).getSmiles();
				System.out.print("IN:"+input+" OUT:"+created);
				System.out.println(expected.equals(created) ? " OK" : " EXPECTED: "+expected+" ERROR!");
				}
			catch (Exception e) {
				e.printStackTrace();
				}
			}
		}

	/**
	 * Runs testStereo() and then an ID-code equivalence test: Every SMILES of a list is parsed,
	 * an idcode is created from the molecule and compared to the expected idcode. Nothing is
	 * printed for passed cases. A line is printed to System.out, if the idcode differs, if an
	 * unexpected exception occurs, or if a case, which is expected to fail, creates an idcode.
	 * @param args not used
	 */
	public static void main(String[] args) {
		testStereo();

		System.out.println("ID-code equivalence test:");

		// triples of { SMILES, name, expected idcode or "error" if parsing is supposed to fail }
		final String[][] data = { {	"N[C@@]([H])(C)C(=O)O",	"S-alanine",		"gGX`BDdwMUM@@" },
								  { "N[C@@H](C)C(=O)O",		"S-alanine",		"gGX`BDdwMUM@@" },
								  { "N[C@H](C(=O)O)C",		"S-alanine",		"gGX`BDdwMUM@@" },
								  { "[H][C@](N)(C)C(=O)O",	"S-alanine",		"gGX`BDdwMUM@@" },
								  { "[C@H](N)(C)C(=O)O",	"S-alanine",		"gGX`BDdwMUM@@" },
								  { "N[C@]([H])(C)C(=O)O",	"R-alanine",		"gGX`BDdwMUL`@" },
								  { "N[C@H](C)C(=O)O",		"R-alanine",		"gGX`BDdwMUL`@" },
								  { "N[C@@H](C(=O)O)C",		"R-alanine",		"gGX`BDdwMUL`@" },
								  { "[H][C@@](N)(C)C(=O)O",	"R-alanine",		"gGX`BDdwMUL`@" },
								  { "[C@@H](N)(C)C(=O)O",	"R-alanine",		"gGX`BDdwMUL`@" },
								  { "C[C@H]1CCCCO1",		"S-Methyl-pyran",	"gOq@@eLm]UUH`@" },
								  { "O1CCCC[C@@H]1C",		"S-Methyl-pyran",	"gOq@@eLm]UUH`@" },
								  { "[C@H](F)(B)O",			"S-Methyl-oxetan",	"gCaDDICTBSURH@" },
								  { "C1CO[C@H]1C",			"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "C1CO[C@@H](C)1",		"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "[C@H]1(C)CCO1",		"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "[H][C@]1(C)CCO1",		"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "[H][C@@]1(CCO1)C",		"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "[C@@]1([H])(C)CCO1",	"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "[C@]1(C)([H])CCO1",	"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "C1[C@@H]2COC2=N1",		"oxetan-azetin",	"gGy@LDimDvfja`@" },
								  { "CC(C)[C@@]12C[C@@H]1[C@@H](C)C(=O)C2", "alpha-thujone", "dmLH@@RYe~IfyjjjkDaIh@" },
								  { "CN1CCC[C@H]1c2cccnc2",	"Nicotine",			"dcm@@@{IDeCEDUSh@UUECP@" },
								  { "CC[C@H](O1)CC[C@@]12CCCO2", "2S,5R-Chalcogran", "dmLD@@qJZY|fFZjjjdbH`@" },
								  { "CCCC",					"butane",			"gC`@Dij@@" },
								  { "C1C.CC1",				"butane",			"gC`@Dij@@" },
								  { "[CH3][CH2][CH2][CH3]",	"butane",			"gC`@Dij@@" },
								  { "C-C-C-C",				"butane",			"gC`@Dij@@" },
								  { "C12.C1.CC2",			"butane",			"gC`@Dij@@" },
								  { "[Na+].[Cl-]",			"NaCl",				"eDARHm@zd@@" },
								  { "[Na+]-[Cl-]",			"NaCl",				"error" },
								  { "[Na+]1.[Cl-]1",		"NaCl",				"error" },
								  { "c1ccccc1",				"benzene",			"gFp@DiTt@@@" },
								  { "C1=C-C=C-C=C1",		"benzene",			"gFp@DiTt@@@" },
								  { "C1:C:C:C:C:C:1",		"benzene",			"gFp@DiTt@@@" },
								  { "c1ccncc1",				"pyridine",			"gFx@@eJf`@@@" },
								  { "[nH]1cccc1",			"pyrrole",			"gKX@@eKcRp@" },
								  { "N1C=C-C=C1",			"pyrrole",			"gKX@@eKcRp@" },
								  { "[H]n1cccc1",			"pyrrole",			"gKX@@eKcRp@" },
								  { "[H]n1cccc1",			"pyrrole",			"gKX@@eKcRp@" },
								  { "c1cncc1",				"pyrrole no [nH]",	"error" },
								  { "[13CH4]",				"C13-methane",		"fH@FJp@" },
								  { "[35ClH]",				"35-chlorane",		"fHdP@qX`" },
								  { "[35Cl-]",				"35-chloride",		"fHtPxAbq@" },
								  { "[Na+].[O-]c1ccccc1",	"Na-phenolate",		"daxHaHCPBXyAYUn`@@@" },
								  { "c1cc([O-].[Na+])ccc1",	"Na-phenolate",		"daxHaHCPBXyAYUn`@@@" },
								  { "C[C@@](C)(O1)C[C@@H](O)[C@@]1(O2)[C@@H](C)[C@@H]3CC=C4[C@]3(C2)C(=O)C[C@H]5[C@H]4CC[C@@H](C6)[C@]5(C)Cc(n7)c6nc(C[C@@]89(C))c7C[C@@H]8CC[C@@H]%10[C@@H]9C[C@@H](O)[C@@]%11(C)C%10=C[C@H](O%12)[C@]%11(O)[C@H](C)[C@]%12(O%13)[C@H](O)C[C@@]%13(C)CO",
									"Cephalostatin-1",
									"gdKe@h@@K`H@XjKHuYlnoP\\bbdRbbVTLbTrJbRaQRRRbTJTRTrfrfTTOBPHtFODPhLNSMdIERYJmShLfs]aqy|uUMUUUUUUE@UUUUMUUUUUUTQUUTPR`nDdQQKB|RIFbiQeARuQt`rSSMNtGS\\ct@@" },
									};

		// one molecule object is used for all test cases
		StereoMolecule mol = new StereoMolecule();
		for (int row=0; row<data.length; row++) {
			final String smiles = data[row][0];
			final String name = data[row][1];
			final String expected = data[row][2];
			final boolean isErrorExpected = expected.equals("error");
			try {
				new SmilesParser().parse(mol, smiles);
				String idcode = new Canonizer(mol).getIDCode();
				if (isErrorExpected)
					System.out.println("Should create error! "+name+" smiles:"+smiles+" idcode:"+idcode);
				else if (!expected.equals(idcode))
					System.out.println("ERROR! "+name+" smiles:"+smiles+" is:"+idcode+" must:"+expected);
				}
			catch (Exception e) {
				// an exception is the desired outcome for cases marked with "error"
				if (!isErrorExpected)
					System.out.println("ERROR! "+name+" smiles:"+smiles+" exception:"+e.getMessage());
				}
			}
		}
	}

class SmilesRange {
	private final byte[] smiles;
	private int pos;
	public int min,max;
	public boolean isDefault;

	/**
	 * Creates a range parser working on the given SMILES bytes.
	 * The array reference is kept as is; no copy is made.
	 * @param smiles bytes from which parse() reads
	 */
	public SmilesRange(byte[] smiles) {
		this.smiles = smiles;
		}

	public int parse(int position, int defaultMin, int defaultMax) {
		isDefault = false;
		pos = position;

		if (Character.isDigit(smiles[position])) {
			int val = parseInt();
			min = max = val;

			// If we have the same query feature, comma delimited and with different number, then we extend the range...
			int firstLetter = position-1;
			while (firstLetter > 1 && Character.isLetterOrDigit(smiles[firstLetter-1]))
				firstLetter--;
			while (smiles[pos] == ',') {
				boolean lettersMatch = true;
				int letterCount = position-firstLetter;
				for (int i=0; i val)
					min = val;
				else if (max < val)
					max = val;
				}

			return pos - position;
			}

		if (smiles[position] == '{'
			&& Character.isDigit(smiles[position+1])) {
			pos++;
			min = parseInt();
			if (smiles[pos++] != '-')
				return 0;   // unexpected
			if (!Character.isDigit(smiles[pos]))
				return 0;   // unexpected
			max = parseInt();
			if (smiles[pos++] != '}')
				return 0;   // unexpected
			return pos - position;
			}

		min = defaultMin;
		max = defaultMax;
		isDefault = true;
		return 0;
		}

	/**
	 * @return true if min and max currently hold the same value
	 */
	public boolean isSingle() {
		return min == max;
	}

	/**
	 * @return true if max is currently larger than min
	 */
	public boolean isRange() {
		return min < max;
	}

	/**
	 * @return current min and max formatted as "{min-max}"
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder();
		text.append('{').append(min).append('-').append(max).append('}');
		return text.toString();
	}

	/**
	 * Reads a number of one or two digits starting at pos and moves pos behind the
	 * characters read. The character at pos is taken as digit without being checked.
	 * @return value of the one or two digits read
	 */
	private int parseInt() {
		int value = smiles[pos++] - '0';
		boolean hasSecondDigit = Character.isDigit(smiles[pos]);
		if (hasSecondDigit) {
			value = value * 10 + (smiles[pos] - '0');
			pos++;
		}
		return value;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy