All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.actelion.research.chem.SmilesParser Maven / Gradle / Ivy

There is a newer version: 2024.11.2
Show newest version
/*
 * Copyright (c) 1997 - 2016
 * Actelion Pharmaceuticals Ltd.
 * Gewerbestrasse 16
 * CH-4123 Allschwil, Switzerland
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of the the copyright holder nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * @author Thomas Sander
 */

package com.actelion.research.chem;

import com.actelion.research.chem.coords.CoordinateInventor;
import com.actelion.research.chem.reaction.Reaction;
import com.actelion.research.util.ArrayUtils;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.TreeMap;


public class SmilesParser {
	protected static final int SMARTS_MODE_MASK = 3;
	public static final int SMARTS_MODE_IS_SMILES = 0;
	public static final int SMARTS_MODE_GUESS = 1;
	public static final int SMARTS_MODE_IS_SMARTS = 2;

	public static final int MODE_SKIP_COORDINATE_TEMPLATES = 4;
	public static final int MODE_MAKE_HYDROGEN_EXPLICIT = 8;
	public static final int MODE_NO_CACTUS_SYNTAX = 16;  // if not set, then some CACTVS SMARTS extensions will be recognized and translated as close as possible
	public static final int MODE_SINGLE_DOT_SEPARATOR = 32;  // consider single dots '.' (rather than '..') as molecule separators when parsing reactions
	public static final int MODE_CREATE_SMARTS_WARNING = 64;
	public static final int MODE_ENUMERATE_SMARTS = 128;

	private static final int INITIAL_CONNECTIONS = 16;
	private static final int MAX_CONNECTIONS = 100; // largest allowed one in SMILES is 99
	private static final int BRACKET_LEVELS = 32;
	private static final int MAX_AROMATIC_RING_SIZE = 15;

	// An unspecified hydrogen count within brackets means :=0 for SMILES and no-H-restriction for SMARTS.
	// Therefore, we have to distinguish it from an explicit H0, which defines a query feature in SMARTS.
	protected static final int HYDROGEN_IMPLICIT_ZERO = 9;

	private StereoMolecule mMol;
	private boolean[] mIsAromaticBond;
	private int mMode,mSmartsMode,mAromaticAtoms,mAromaticBonds,mCoordinateMode;
	private long mRandomSeed;
	private final boolean mCreateSmartsWarnings,mMakeHydrogenExplicit,mSingleDotSeparator;
	private StringBuilder mSmartsWarningBuffer;
	private boolean mSmartsFeatureFound;
	private ArrayList mEnumerationPositionList;

	/**
	 * Creates a new SmilesParser that doesn't allow SMARTS features to be present in
	 * parsed strings. SMARTS features cause an exception. The fragment flag of created
	 * molecules is never set.
	 */
	public SmilesParser() {
		// delegate to the mode based constructor: pure SMILES parsing, no additional mode flags
		this(SMARTS_MODE_IS_SMILES);
		}

	/**
	 * Creates a new SmilesParser that may or may not allow SMARTS features to be present in
	 * parsed strings. If smartsMode is SMARTS_MODE_IS_SMILES, then any SMARTS features cause
	 * an exception. If smartsMode is SMARTS_MODE_IS_SMARTS, then the input string is considered
	 * a SMARTS, e.g. 'CC' is taken as fragment of two non-aromatic carbon atoms connected by a
	 * single bond and without any implicit hydrogen atoms. If smartsMode is SMARTS_MODE_GUESS,
	 * then the molecule is considered a substructure if any SMARTS features are discovered.
	 * Depending on whether SMARTS features are found, created molecules have the fragment flag set
	 * or not set.
	 * @param mode one of SMARTS_MODE... and optionally other mode flags
	 */
	public SmilesParser(int mode) {
		mMode = mode & ~SMARTS_MODE_MASK;
		mSmartsMode = mode & SMARTS_MODE_MASK;
		mSingleDotSeparator = (mode & MODE_SINGLE_DOT_SEPARATOR) != 0;
		mCreateSmartsWarnings = (mode & MODE_CREATE_SMARTS_WARNING) != 0;
		mMakeHydrogenExplicit = ((mode & MODE_MAKE_HYDROGEN_EXPLICIT) != 0);
		mCoordinateMode = CoordinateInventor.MODE_DEFAULT;
		if ((mode & MODE_SKIP_COORDINATE_TEMPLATES) != 0)
			mCoordinateMode |= CoordinateInventor.MODE_SKIP_DEFAULT_TEMPLATES;
		if (mMakeHydrogenExplicit)
			mCoordinateMode &= ~CoordinateInventor.MODE_REMOVE_HYDROGEN;
		}

	/**
	 * Depending on the parse() parameters, the SmilesParser may or may not generate new atom coordinates
	 * after parsing the SMILES. In difficult cases the employed CoordinateInventor uses random decisions
	 * when optimizing colliding coordinates. In strained and bridged ring systems, generated coordinates
	 * may not correctly represent all E/Z-bond configurations.
	 * Calling this method with a seed != 0 causes the creation of reproducible atom coordinates.
	 * @param seed value different from 0 in order to always create the same reproducible atom coordinates
	 */
	public void setRandomSeed(long seed) {
		// only stored here; parse() passes it to the CoordinateInventor if it is not 0
		mRandomSeed = seed;
		}

	/**
	 * Convenience variant of parseMolecule(byte[]) for a SMILES given as String.
	 * @param smiles SMILES string; may be null
	 * @return null if smiles is null; otherwise the result of parseMolecule(byte[])
	 *         applied to the UTF-8 encoded string
	 */
	public StereoMolecule parseMolecule(String smiles) {
		if (smiles == null)
			return null;

		byte[] smilesBytes = smiles.getBytes(StandardCharsets.UTF_8);
		return parseMolecule(smilesBytes);
		}

	/**
	 * Convenience method to quickly obtain a StereoMolecule from a SMILES string.
	 * If you process many SMILES, then the parse() methods are preferred, because
	 * they avoid the repeated instantiation of new StereoMolecules.
	 * @param smiles SMILES encoded as bytes
	 * @return the parsed molecule, or null if parsing caused an exception
	 */
	public StereoMolecule parseMolecule(byte[] smiles) {
		StereoMolecule parsed = new StereoMolecule();
		try {
			parse(parsed, smiles);
			return parsed;
			}
		catch (Exception ignored) {
			// any problem while parsing is reported to the caller as null
			return null;
			}
		}

	/**
	 * Variant of isReactionSmiles(byte[], int[]) that does not ask for a catalyst count,
	 * i.e. it passes null as catalystCountHolder.
	 * @param smiles SMILES encoded as bytes
	 * @return the result of isReactionSmiles(smiles, null)
	 */
	public static boolean isReactionSmiles(byte[] smiles) {
		return isReactionSmiles(smiles, null);
		}

	public static boolean isReactionSmiles(byte[] smiles, int[] catalystCountHolder) {
		int count = 0;
		int index = -1;

		while (count < 3) {
			index = ArrayUtils.indexOf(smiles, (byte)'>', index + 1);
			while (index>0 && smiles[index - 1] == (byte)'-')
				index = ArrayUtils.indexOf(smiles, (byte)'>', index + 1);

			if (index == -1)
				break;

			count++;

			if (catalystCountHolder != null && count == 1) {
				catalystCountHolder[0] = 0;
				if (index+1');
		while (index1 > 0 && smiles[index1-1] == (byte)'-')
			index1 = ArrayUtils.indexOf(smiles, (byte)'>', index1+1);

		int index2 = (index1 == -1) ? -1 : ArrayUtils.indexOf(smiles, (byte)'>', index1+1);
		while (index2 > 0 && smiles[index2-1] == (byte)'-')
			index2 = ArrayUtils.indexOf(smiles, (byte)'>', index2+1);

		if (index2 == -1)
			throw new Exception("Missing one or both separators ('>').");
		if (ArrayUtils.indexOf(smiles, (byte)'>', index2+1) != -1)
			throw new Exception("Found more than 2 separators ('>').");

		Reaction rxn = new Reaction();

		int part = 0;
		int index = 0;
		int closingGroupBracketIndex = -1;
		while (index < smiles.length) {
			while (index') {
				index++;
				part++;
				}
			}

		return rxn;
		}

	/**
	 * @return the current enumeration position list as assigned by enumerateSmarts()
	 *         or setEnumerationPositionList(); the list itself is returned, not a copy
	 */
	protected ArrayList getEnumerationPositionList() {
		return mEnumerationPositionList;
	}

	/**
	 * Replaces the current enumeration position list.
	 * @param l list to be used from now on; stored by reference
	 */
	protected void setEnumerationPositionList(ArrayList l) {
		mEnumerationPositionList = l;
	}

	/**
	 * Expands the given SMARTS into a list of SMARTS strings by applying every
	 * EnumerationPosition found in mEnumerationPositionList after parsing.
	 * The input is first parsed into a throw-away molecule with MODE_ENUMERATE_SMARTS set
	 * (presumably this is what fills the position list; the collecting code is not part of this method).
	 * Then the positions are sorted and applied one after another: each position replaces
	 * the current list of strings by the strings that its enumerate() method produces from them.<br>
	 * Note: This method permanently switches this parser to SMARTS_MODE_IS_SMARTS and leaves the
	 * MODE_ENUMERATE_SMARTS flag set. An exception thrown while parsing is not propagated; its
	 * message is printed to System.out and enumeration continues with the positions found so far.
	 * @param smarts SMARTS string to be enumerated
	 * @return all enumerated strings; just the input string if no enumeration position exists
	 * @throws Exception if EnumerationPosition.enumerate() fails
	 */
	public String[] enumerateSmarts(String smarts) throws Exception {
		mEnumerationPositionList = new ArrayList<>();
		mSmartsMode = SMARTS_MODE_IS_SMARTS;
		mMode |= MODE_ENUMERATE_SMARTS;

		ArrayList<String> smartsList = new ArrayList<>();
		smartsList.add(smarts);

		try {
			parse(new StereoMolecule(), smarts);
		}
		catch (Exception e) {
			System.out.println(e.getMessage());
		}

		// Copy into a typed array of exactly matching size, which toArray() then fills in place.
		EnumerationPosition[] options = new EnumerationPosition[mEnumerationPositionList.size()];
		mEnumerationPositionList.toArray(options);
		Arrays.sort(options);

		for (EnumerationPosition option : options) {
			ArrayList<String> enumeration = new ArrayList<>();
			for (String s : smartsList)
				option.enumerate(this, s.getBytes(StandardCharsets.UTF_8), enumeration);

			smartsList = enumeration;
		}

		return smartsList.toArray(new String[0]);
	}

	/**
	 * If MODE_CREATE_SMARTS_WARNING was part of the mode passed to the constructor, then this method
	 * returns a list of all SMARTS features, which could not be interpreted in the most recently
	 * parsed SMILES/SMARTS pattern.
	 * @return message listing the unresolved features; an empty String if no warning was ever recorded
	 */
	public String getSmartsWarning() {
		// parse() only empties the buffer instead of discarding it. Therefore, an existing
		// but empty buffer also means that the most recent parse produced no warning,
		// and we must not return the bare header in that case.
		if (mSmartsWarningBuffer == null || mSmartsWarningBuffer.length() == 0)
			return "";

		return "Unresolved SMARTS features:"+mSmartsWarningBuffer;
		}

	/**
	 * Parses the given smiles into the molecule, creates proper atom coordinates
	 * to reflect correct double bond geometries and translates tetrahedral and allene
	 * parities into up/down-bonds. Whether SMARTS features are accepted depends on the
	 * SMARTS mode that was passed to the constructor.
	 * @param mol molecule that receives the parsed structure; it is cleared before parsing
	 * @param smiles SMILES (or SMARTS) string to be parsed
	 * @throws Exception if the string cannot be parsed
	 */
	public void parse(StereoMolecule mol, String smiles) throws Exception {
		// encode as UTF-8 and parse with coordinate creation and stereo feature reading both enabled
		parse(mol, smiles.getBytes(StandardCharsets.UTF_8), true, true);
		}

	/**
	 * Parses the complete byte array with coordinate creation and stereo feature reading both enabled.
	 * @param mol molecule that receives the parsed structure
	 * @param smiles SMILES encoded as bytes
	 * @throws Exception if parsing fails
	 */
	public void parse(StereoMolecule mol, byte[] smiles) throws Exception {
		parse(mol, smiles, true, true);
		}

	/**
	 * Parses a section of the byte array with coordinate creation and stereo feature reading both enabled.
	 * @param mol molecule that receives the parsed structure
	 * @param smiles byte array containing the SMILES
	 * @param position index of the first byte to be parsed
	 * @param endIndex index at which parsing stops (the overloads without range pass smiles.length)
	 * @throws Exception if parsing fails
	 */
	public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex) throws Exception {
		parse(mol, smiles, position, endIndex, true, true);
		}

	/**
	 * Parses the complete byte array, i.e. the range from 0 to smiles.length.
	 * @param mol molecule that receives the parsed structure
	 * @param smiles SMILES encoded as bytes
	 * @param createCoordinates if true, then a CoordinateInventor creates atom coordinates after parsing
	 * @param readStereoFeatures if true, then stereo information of the SMILES is evaluated
	 * @throws Exception if parsing fails
	 */
	public void parse(StereoMolecule mol, byte[] smiles, boolean createCoordinates, boolean readStereoFeatures) throws Exception {
		parse(mol, smiles, 0, smiles.length, createCoordinates, readStereoFeatures);
		}

	public void parse(StereoMolecule mol, byte[] smiles, int position, int endIndex, boolean createCoordinates, boolean readStereoFeatures) throws Exception {
		mMol = mol;
		mMol.clear();

		if (mSmartsWarningBuffer != null)
			mSmartsWarningBuffer.setLength(0);

		mAromaticAtoms = 0;
		mSmartsFeatureFound = false;
		boolean allowSmarts = (mSmartsMode != SMARTS_MODE_IS_SMILES);

		TreeMap parityMap = null;

		int[] baseAtom = new int[BRACKET_LEVELS];
		baseAtom[0] = -1;

		int[] ringClosureAtom = new int[INITIAL_CONNECTIONS];
		int[] ringClosurePosition = new int[INITIAL_CONNECTIONS];
		int[] ringClosureBondType = new int[INITIAL_CONNECTIONS];
		int[] ringClosureBondQueryFeatures = new int[INITIAL_CONNECTIONS];
		for (int i = 0; i();
	
						// using position as hydrogenPosition is close enough
						int hydrogenCount = (atomParser.explicitHydrogens == HYDROGEN_IMPLICIT_ZERO) ? 0 : atomParser.explicitHydrogens;
						parityMap.put(atom, new THParity(atom, position - 2, fromAtom, hydrogenCount, position - 1, atomParser.isClockwise));
						}
					}

				continue;
				}

			if (theChar == '.') {
				baseAtom[bracketLevel] = -1;
				bondType = Molecule.cBondTypeDeleted;
				continue;
				}

			if (isBondSymbol(theChar)) {
				if (squareBracketOpen)
					throw new Exception("SmilesParser: unexpected bond symbol inside square brackets: '"+theChar+"', position:"+(position-1));

				int excludedBonds = 0;
				while (isBondSymbol(theChar)) {
					if (theChar == '!') {
						theChar = (char)smiles[position++];
						if (theChar == '@')
							bondQueryFeatures |= Molecule.cBondQFNotRing;
						else if ((theChar == '-' && smiles[position] == '>')
						 || (theChar == '<' && smiles[position] == '-')) {
							excludedBonds |= Molecule.cBondTypeMetalLigand;
							position++;
							}
						else if (theChar == '-')
							excludedBonds |= Molecule.cBondQFSingle;
						else if (theChar == '=')
							excludedBonds |= Molecule.cBondQFDouble;
						else if (theChar == '#')
							excludedBonds |= Molecule.cBondQFTriple;
						else if (theChar == '$')
							excludedBonds |= Molecule.cBondQFQuadruple;
						else if (theChar == ':')
							excludedBonds |= Molecule.cBondQFDelocalized;
						else
							throw new Exception("SmilesParser: bond symbol '"+theChar+"' not allowed after '!'. Position:"+(position-1));
						}
					else {
						if (theChar == '@')
							bondQueryFeatures |= Molecule.cBondQFRing;
						else if (theChar == '=')
							bondType = Molecule.cBondTypeDouble;
						else if (theChar == '#')
							bondType = Molecule.cBondTypeTriple;
						else if (theChar == '$')
							bondType = Molecule.cBondTypeQuadruple;
						else if (theChar == ':')
							bondType = Molecule.cBondTypeDelocalized;
						else if (theChar == '~')
							bondQueryFeatures |= Molecule.cBondQFSingle | Molecule.cBondQFDouble | Molecule.cBondQFTriple | Molecule.cBondQFDelocalized;
						else if (theChar == '/') {
							if (readStereoFeatures)
								bondType = Molecule.cBondTypeUp;    // encode slash temporarily in bondType
							}
						else if (theChar == '\\') {
							if (readStereoFeatures)
								bondType = Molecule.cBondTypeDown;  // encode slash temporarily in bondType
							}

						// Smiles extention 'dative bond'
						else if ((theChar == '-' && smiles[position] == '>')
						 || (theChar == '<' && smiles[position] == '-')) {
								bondType = Molecule.cBondTypeMetalLigand;
								position++;
							}

						if (smiles[position] == ',') {
							bondQueryFeatures |= bondSymbolToQueryFeature(bondType == Molecule.cBondTypeMetalLigand ? '>' : theChar);
							while (smiles[position] == ',') {
								if ((smiles[position+1] == '<' && smiles[position+2] == '-')
								 || (smiles[position+1] == '-' && smiles[position+2] == '>')) {
									bondQueryFeatures |= bondSymbolToQueryFeature('>');
									position += 3;
									}
								else {
									bondQueryFeatures |= bondSymbolToQueryFeature((char)smiles[position+1]);
									position += 2;
									}
								}
							}
						}

					if (smiles[position] == ';') {
						position++;
						theChar = (char)smiles[position++];
						continue;
						}

					if (excludedBonds != 0)
						bondQueryFeatures |= Molecule.cBondQFBondTypes & ~excludedBonds;

					break;
					}

				continue;
				}

			if (theChar <= ' ') {	// we stop reading at whitespace
				position = endIndex;
				continue;
				}

			if (Character.isDigit(theChar)) {
				int number = theChar - '0';
				if (squareBracketOpen) {
					while (position < endIndex
					 && Character.isDigit(smiles[position])) {
						number = 10 * number + smiles[position] - '0';
						position++;
						}
					atomMass = number;
					}
				else {
					int bondTypePosition = isDoubleDigit ? position - 3 : position - 2;
					boolean hasBondType = (smiles[bondTypePosition] == '-'
										|| smiles[bondTypePosition] == '/'
										|| smiles[bondTypePosition] == '\\'
										|| smiles[bondTypePosition] == '='
										|| smiles[bondTypePosition] == '#'
										|| smiles[bondTypePosition] == '$'
										|| smiles[bondTypePosition] == ':'
										|| smiles[bondTypePosition] == '>'
										|| smiles[bondTypePosition] == '~');
					if (isDoubleDigit
					 && position < endIndex
					 && Character.isDigit(smiles[position])) {
						number = 10 * number + smiles[position] - '0';
						isDoubleDigit = false;
						position++;
						}
					if (number >= ringClosureAtom.length) {
						if (number >=MAX_CONNECTIONS)
							throw new Exception("SmilesParser: ringClosureAtom number out of range: "+number);

						int oldSize = ringClosureAtom.length;
						int newSize = ringClosureAtom.length;
						while (newSize <= number)
							newSize = Math.min(MAX_CONNECTIONS, newSize + INITIAL_CONNECTIONS);

						ringClosureAtom = Arrays.copyOf(ringClosureAtom, newSize);
						ringClosurePosition = Arrays.copyOf(ringClosurePosition, newSize);
						ringClosureBondType = Arrays.copyOf(ringClosureBondType, newSize);
						ringClosureBondQueryFeatures = Arrays.copyOf(ringClosureBondQueryFeatures, newSize);
						for (int i=oldSize; i= 1)
					mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot0Hydrogen, true);
				if (explicitHydrogen >= 2)
					mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot1Hydrogen, true);
				if (explicitHydrogen >= 3)
					mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot2Hydrogen, true);
				if (explicitHydrogen >= 4)
					mMol.setAtomQueryFeature(atom, Molecule.cAtomQFNot3Hydrogen, true);
				}
			}

		if (!mMakeHydrogenExplicit && (mSmartsFeatureFound || mSmartsMode == SMARTS_MODE_IS_SMARTS))
			mMol.removeExplicitHydrogens();

		mMol.ensureHelperArrays(Molecule.cHelperNeighbours);

		correctValenceExceededNitrogen();	// convert pyridine oxides and nitro into polar structures with valid nitrogen valences

		locateAromaticDoubleBonds(allowSmarts, mSmartsFeatureFound);

		mMol.removeAtomCustomLabels();
		mMol.setHydrogenProtection(false);

		if (readStereoFeatures) {
			assignKnownEZBondParities();

			if (parityMap != null) {
				for (THParity parity:parityMap.values())
					mMol.setAtomParity(handleHydrogenAtomMap[parity.mCentralAtom], parity.calculateParity(handleHydrogenAtomMap), false);

				mMol.setParitiesValid(0);
				}
			}

		// defines unknown EZ parities as such, i.e. prevent coordinate generation to create implicit EZ-parities
		mMol.setParitiesValid(0);

		if (createCoordinates) {
			CoordinateInventor inventor = new CoordinateInventor(mCoordinateMode);
			if (mRandomSeed != 0)
				inventor.setRandomSeed(mRandomSeed);
			inventor.invent(mMol);

			if (readStereoFeatures)
				mMol.setUnknownParitiesToExplicitlyUnknown();
			}

		if (mSmartsFeatureFound || mSmartsMode == SMARTS_MODE_IS_SMARTS) {
			mMol.setFragment(true);
			mMol.validateAtomQueryFeatures();
			mMol.validateBondQueryFeatures();
			}
		}

	/**
	 * @return true if the previously parsed SMILES contained a SMARTS feature and was not parsed with SMARTS_MODE_IS_SMILES
	 */
	public boolean isSmarts() {
		// the flag is reset to false at the beginning of every parse() call
		return mSmartsFeatureFound;
	}

	/**
	 * Checks whether a character is one of the symbols that start or continue a bond description.
	 * Note that '&gt;' is not in the list; the callers handle it as second character of '-&gt;'.
	 * @param theChar character to be checked
	 * @return true for - = # $ : / \ &lt; ~ ! and @
	 */
	private boolean isBondSymbol(char theChar) {
		switch (theChar) {
		case '-':
		case '=':
		case '#':
		case '$':
		case ':':
		case '/':
		case '\\':
		case '<':
		case '~':
		case '!':
		case '@':
			return true;
		default:
			return false;
			}
		}

	/**
	 * Translates a bond symbol into the corresponding bond query feature flag(s).
	 * @param symbol bond symbol; '&gt;' stands for a metal ligand bond
	 * @return the matching Molecule.cBondQF... value; cBondQFSingle for any symbol not listed
	 */
	private int bondSymbolToQueryFeature(char symbol) {
		switch (symbol) {
		case '=':
			return Molecule.cBondQFDouble;
		case '#':
			return Molecule.cBondQFTriple;
		case '$':
			return Molecule.cBondQFQuadruple;
		case ':':
			return Molecule.cBondQFDelocalized;
		case '>':
			return Molecule.cBondQFMetalLigand;
		case '~':
			return Molecule.cBondQFBondTypes;
		default:	// every other symbol is treated as single bond
			return Molecule.cBondQFSingle;
			}
		}

	/**
	 * Records a SMARTS feature in the warning buffer, provided that this parser
	 * was created with warnings enabled. Otherwise, the call has no effect.
	 * @param feature description of the feature; appended with a leading space
	 */
	protected void smartsWarning(String feature) {
		if (!mCreateSmartsWarnings)
			return;

		// the buffer is created lazily with the first warning
		if (mSmartsWarningBuffer == null)
			mSmartsWarningBuffer = new StringBuilder();

		mSmartsWarningBuffer.append(" ").append(feature);
		}

	private void locateAromaticDoubleBonds(boolean allowSmartsFeatures, boolean smartsFeatureFound) throws Exception {
		mMol.ensureHelperArrays(Molecule.cHelperNeighbours);
		mIsAromaticBond = new boolean[mMol.getBonds()];
		mAromaticBonds = 0;

		// all explicitly defined aromatic bonds are taken
		for (int bond=0; bond=0) System.arraycopy(mIsAromaticBond, 0, isAromaticBond, 0, mMol.getBonds());

			// Some Smiles contain 'aromatic' rings with atoms not being compatible
			// with a PI-bond. These include: tertiary non-charged nitrogen, [nH],
			// sulfur, non-charged oxygen, charged carbon, etc...
			// All these atoms and attached bonds are marked as handled to avoid
			// attached bonds to be promoted (changed to double bond) later.
		for (int ring=0; ring=4; qualifyingNo--) {
			do {
				qualifyingBondFound = false;
				for (int bond=0; bond= 2)
			if (!connectConjugatedRadicalPairs(isAromaticBond))
				break;

		if (allowSmartsFeatures) {
			if (mAromaticAtoms != 0) {
				for (int atom=0; atom 0)
				return false;
			}

		int explicitHydrogens = (mMol.getAtomCustomLabel(atom) == null || mMol.getAtomCustomLabelBytes(atom)[0] == HYDROGEN_IMPLICIT_ZERO) ?
								0 : mMol.getAtomCustomLabelBytes(atom)[0];
		int freeValence = mMol.getFreeValence(atom) - explicitHydrogens;
		if (freeValence < 1)
			return false;

		if (mMol.getAtomicNo(atom) == 16
		 || mMol.getAtomicNo(atom) == 34
		 || mMol.getAtomicNo(atom) == 52) {
			if (mMol.getConnAtoms(atom) == 2 && mMol.getAtomCharge(atom) <= 0)
				return false;
			return freeValence != 2;	// e.g. -S(=O)- correction to account for tetravalent S,Se
			}

		return true;
		}


	private void promoteBond(int bond) {
		if (mMol.getBondType(bond) == Molecule.cBondTypeSingle)
			mMol.setBondType(bond, Molecule.cBondTypeDouble);

		for (int i=0; i<2; i++) {
			int bondAtom = mMol.getBondAtom(i, bond);
			if (mMol.isMarkedAtom(bondAtom)) {
				mMol.setAtomMarker(bondAtom, false);
				mAromaticAtoms--;
				}
			for (int j=0; j 3
			 && mMol.getAtomPi(atom) > 0) {
				for (int i=0; i 1)
					 && mMol.isElectronegative(connAtom)) {
						if (mMol.getBondType(connBond) == Molecule.cBondTypeTriple)
							mMol.setBondType(connBond, Molecule.cBondTypeDouble);
						else
							mMol.setBondType(connBond, Molecule.cBondTypeSingle);
	
						mMol.setAtomCharge(atom, mMol.getAtomCharge(atom) + 1);
						mMol.setAtomCharge(connAtom, mMol.getAtomCharge(connAtom) - 1);
						mMol.setAtomAbnormalValence(atom, -1);
						break;
						}
					}
				}
			}
		}

	private boolean assignKnownEZBondParities() {
		mMol.ensureHelperArrays(Molecule.cHelperRings);

		boolean paritiesFound = false;
		int[] refAtom = new int[2];
		int[] refBond = new int[2];
		int[] otherAtom = new int[2];
		for (int bond=0; bond {
		int mPosition,mCount;

		/**
		 * @param position position of first option in original smarts
		 */
		public EnumerationPosition(int position) {
			mPosition = position;
			mCount = 1;	// counting starts at one; increase() adds to it
			}

		/**
		 * Increments the counter of this position by one.
		 */
		public void increase() {
			mCount++;
			}

		/**
		 * Creates one new SMARTS string for every option found at this position and adds
		 * these strings to enumeration. Options are located by repeatedly calling
		 * SmilesAtomParser.parseAtomInsideBrackets() until a closing ']' is reached.
		 * If the first call already ends at ']', then there is only one option and nothing is added.
		 * Every new string consists of the bytes before mPosition, one option, and all
		 * bytes from the closing ']' to the end.
		 * The bytes are decoded as UTF-8, which matches the encoding used by enumerateSmarts().
		 * @param parser the parent parser passed to the SmilesAtomParser
		 * @param smarts SMARTS encoded as bytes
		 * @param enumeration list that receives the created strings
		 * @throws Exception if parsing the atom options fails
		 */
		public void enumerate(SmilesParser parser, byte[] smarts, ArrayList enumeration) throws Exception {
			ArrayList<String> optionList = new ArrayList<>();

			int start = mPosition;
			SmilesAtomParser atomParser = new SmilesAtomParser(parser, mMode | mSmartsMode);
			int end = atomParser.parseAtomInsideBrackets(smarts, start+1, smarts.length, true, true)-1;
			if (smarts[end] != ']') {  // we have multiple options and create an option list
				optionList.add(new String(smarts, start, end-start, StandardCharsets.UTF_8));
				while (smarts[end] != ']') {
					start = end+1;
					end = atomParser.parseAtomInsideBrackets(smarts, start+1, smarts.length, true, true)-1;
					optionList.add(new String(smarts, start, end-start, StandardCharsets.UTF_8));
				}
			}

			// unchanged parts before and behind the options are the same for all created strings
			String prefix = new String(smarts, 0, mPosition, StandardCharsets.UTF_8);
			String suffix = new String(smarts, end, smarts.length-end, StandardCharsets.UTF_8);
			for (String option : optionList)
				enumeration.add(prefix + option + suffix);
			}

		@Override
		public int compareTo(EnumerationPosition o) {
			// Arguments are swapped, so sorting puts higher positions first.
			// NOTE(review): presumably done so that replacements at later positions don't shift earlier ones - confirm
			return Integer.compare(o.mPosition, mPosition);
		}
	}

	/**
	 * Simple pair of a neighbour atom and the SMILES position associated with it,
	 * as collected by THParity.
	 */
	private static class ParityNeighbour {
		int mAtom;		// atom index or one of THParity's pseudo atom constants
		int mPosition;	// position within the SMILES that is used for this neighbour

		public ParityNeighbour(int atom, int position) {
			this.mAtom = atom;
			this.mPosition = position;
			}
		}

	private static class THParity {
		private static final int PSEUDO_ATOM_HYDROGEN = Integer.MAX_VALUE - 1;
		private static final int PSEUDO_ATOM_LONE_PAIR = Integer.MAX_VALUE;

		int mCentralAtom,mCentralAtomPosition;
		boolean mIsClockwise,mError;
		ArrayList mNeighbourList;

		/**
		 * Instantiates a new parity object during smiles traversal.
		 * @param centralAtom index of atom processed
		 * @param centralAtomPosition position in SMILES of central atom
		 * @param fromAtom index of parent atom of centralAtom (-1 if centralAtom is first atom in smiles)
		 * @param explicitHydrogen Daylight syntax: hydrogen atoms defined within square bracket of other atom
		 * @param hydrogenPosition position in SMILES of central atom
		 * @param isClockwise true if central atom is marked with @@ rather than @
		 */
		public THParity(int centralAtom, int centralAtomPosition, int fromAtom, int explicitHydrogen, int hydrogenPosition, boolean isClockwise) {
			// Only none or one hydrogen within the brackets is supported; anything else marks this parity as erroneous.
			if (explicitHydrogen != 0 && explicitHydrogen != 1) {
				mError = true;
				return;
				}

			mCentralAtom = centralAtom;
			mCentralAtomPosition = centralAtomPosition;
			mIsClockwise = isClockwise;
			mNeighbourList = new ArrayList<>();

			// With a parent atom present, the parent becomes the first neighbour (positioned directly
			// before the central atom), followed by the bracket hydrogen, if there is one
			// (positioned directly behind the central atom).
			if (fromAtom != -1) {
				addNeighbor(fromAtom, centralAtomPosition-1, false);

				if (explicitHydrogen == 1)
					addNeighbor(PSEUDO_ATOM_HYDROGEN, centralAtomPosition+1, false);
				}
			}

		/**
		 * Adds a currently traversed neighbor or ring closure to parity object,
		 * which belongs to the neighbor's parent atom.
		 * In case of a ring closure the bond closure digit's position in the smiles
		 * rather than the neighbor's position is the relevant position used for parity
		 * determination.
		 * We need to track the atom, because neighbors are not necessarily added in atom
		 * sequence (ring closure with connection back to stereo center).
		 * @param atom
		 * @param position
		 */
		public void addNeighbor(int atom, int position, boolean unused) {
			if (mError)
				return;	// parity is already known to be invalid

			// a fifth neighbour cannot be handled and invalidates the parity
			if (mNeighbourList.size() == 4) {
				mError = true;
				return;
				}

			ParityNeighbour neighbour = new ParityNeighbour(atom, position);
			mNeighbourList.add(neighbour);
			}

		public int calculateParity(int[] handleHydrogenAtomMap) {
			if (mError)
				return Molecule.cAtomParityUnknown;

			// We need to translate smiles-parse-time atom indexes to those that the molecule
			// uses after calling handleHydrogens, which is called from ensureHelperArrays().
			for (ParityNeighbour neighbour:mNeighbourList)
				if (neighbour.mAtom != PSEUDO_ATOM_HYDROGEN && neighbour.mAtom != PSEUDO_ATOM_LONE_PAIR)
					neighbour.mAtom = handleHydrogenAtomMap[neighbour.mAtom];

			if (mNeighbourList.size() == 3)
				// Within SMILES, all hydrogen atoms at stereo centers must be explicit (as explicit atoms or as H count in square brackets).
				// Therefore, three neighbour atoms is a rare situation, e.g. CC[S@](=O)C or frozen out CC[N@H]C
				// In these cases we add the electron pair as pseudo neighbour
				mNeighbourList.add(new ParityNeighbour(PSEUDO_ATOM_LONE_PAIR, mCentralAtomPosition));
			else if (mNeighbourList.size() != 4)
				return Molecule.cAtomParityUnknown;

			/*
System.out.println();
System.out.println("central:"+mCentralAtom+(mIsClockwise?" @@":" @")+" from:"
				+((mFromAtom == -1)?"none":Integer.toString(mFromAtom))+" with "+mImplicitHydrogen+" hydrogens");
System.out.print("neighbors: "+mNeighborAtom[0]+"("+mNeighborPosition[0]+(mNeighborIsHydrogen[0]?",H":",non-H")+")");
for (int i=1; i mNeighbourList.get(i).mAtom)
						inversion = !inversion;
					if (mNeighbourList.get(j).mPosition > mNeighbourList.get(i).mPosition)
						inversion = !inversion;
				}
			}
			return inversion;
		}
	}

	/**
	 * Manual round-trip check of stereo handling: Every input SMILES (first column) is parsed
	 * with a default SmilesParser, converted back into a SMILES with an IsomericSmilesCreator
	 * and compared to the expected SMILES (second column). Results are printed to System.out;
	 * nothing is returned or thrown.
	 */
	private static void testStereo() {
		// first block: E/Z double bonds; second block: tetrahedral carbon centers; third block: sulfoxides
		final String[][] data = { { "F/C=C/I", "F/C=C/I" },
								  { "F/C=C\\I", "F/C=C\\I" },
								  { "C(=C/I)/F", "F/C=C\\I" },
								  { "[H]C(/F)=C/I", "F/C=C\\I" },
								  { "C(=C\\1)/I.F1", "F/C=C/I" },
								  { "C(=C1)/I.F/1", "F/C=C/I" },
								  { "C(=C\\F)/1.I1", "F/C=C/I" },
								  { "C(=C\\F)1.I\\1", "F/C=C/I" },
								  { "C\\1=C/I.F1", "F/C=C/I" },
								  { "C1=C/I.F/1", "F/C=C/I" },
								  { "C(=C\\1)/2.F1.I2", "F/C=C/I" },
								  { "C/2=C\\1.F1.I2", "F/C=C/I" },
								  { "C/1=C/C=C/F.I1", "F/C=C/C=C\\I" },
								  { "C1=C/C=C/F.I\\1", "F/C=C/C=C\\I" },
								  { "C(/I)=C/C=C/1.F1", "F/C=C/C=C\\I" },
								  { "C(/I)=C/C=C1.F\\1", "F/C=C/C=C\\I" },

								  { "[C@](Cl)(F)(I)1.Br1", "F[C@](Cl)(Br)I" },
								  { "Br[C@](Cl)(I)1.F1", "F[C@](Cl)(Br)I" },
								  { "[C@H](F)(I)1.Br1", "F[C@H](Br)I" },
								  { "Br[C@@H](F)1.I1", "F[C@H](Br)I" },

								  { "C[S@@](CC)=O", "CC[S@](C)=O" },
								  { "[S@](=O)(C)CC", "CC[S](C)=O" } };
		// one molecule object is reused for all tests; parse() clears it first
		StereoMolecule mol = new StereoMolecule();
		for (String[] test:data) {
			try {
				new SmilesParser().parse(mol, test[0]);
				String smiles = new IsomericSmilesCreator(mol).getSmiles();
				System.out.print("IN:"+test[0]+" OUT:"+smiles);
				if (!test[1].equals(smiles))
					System.out.println(" EXPECTED: "+test[1]+" ERROR!");
				else
					System.out.println(" OK");
				}
			catch (Exception e) {
				// a failing test case is reported, but does not stop the remaining ones
				e.printStackTrace();
				}
			}
		}

	/**
	 * Manual self test. Runs testStereo() first and then parses a list of SMILES, creates the
	 * idcode of every parsed molecule with a Canonizer and compares it to the expected idcode.
	 * Only deviations are printed to System.out.
	 * @param args not used
	 */
	public static void main(String[] args) {
		testStereo();

		System.out.println("ID-code equivalence test:");
		// columns: SMILES, name used in messages, expected idcode ("error" if parsing is expected to throw)
		final String[][] data = { {	"N[C@@]([H])(C)C(=O)O",	"S-alanine",		"gGX`BDdwMUM@@" },
								  { "N[C@@H](C)C(=O)O",		"S-alanine",		"gGX`BDdwMUM@@" },
								  { "N[C@H](C(=O)O)C",		"S-alanine",		"gGX`BDdwMUM@@" },
								  { "[H][C@](N)(C)C(=O)O",	"S-alanine",		"gGX`BDdwMUM@@" },
								  { "[C@H](N)(C)C(=O)O",	"S-alanine",		"gGX`BDdwMUM@@" },
								  { "N[C@]([H])(C)C(=O)O",	"R-alanine",		"gGX`BDdwMUL`@" },
								  { "N[C@H](C)C(=O)O",		"R-alanine",		"gGX`BDdwMUL`@" },
								  { "N[C@@H](C(=O)O)C",		"R-alanine",		"gGX`BDdwMUL`@" },
								  { "[H][C@@](N)(C)C(=O)O",	"R-alanine",		"gGX`BDdwMUL`@" },
								  { "[C@@H](N)(C)C(=O)O",	"R-alanine",		"gGX`BDdwMUL`@" },
								  { "C[C@H]1CCCCO1",		"S-Methyl-pyran",	"gOq@@eLm]UUH`@" },
								  { "O1CCCC[C@@H]1C",		"S-Methyl-pyran",	"gOq@@eLm]UUH`@" },
								  { "[C@H](F)(B)O",			"S-Methyl-oxetan",	"gCaDDICTBSURH@" },
								  { "C1CO[C@H]1C",			"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "C1CO[C@@H](C)1",		"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "[C@H]1(C)CCO1",		"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "[H][C@]1(C)CCO1",		"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "[H][C@@]1(CCO1)C",		"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "[C@@]1([H])(C)CCO1",	"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "[C@]1(C)([H])CCO1",	"S-Methyl-oxetan",	"gKQ@@eLmUTb@" },
								  { "C1[C@@H]2COC2=N1",		"oxetan-azetin",	"gGy@LDimDvfja`@" },
								  { "CC(C)[C@@]12C[C@@H]1[C@@H](C)C(=O)C2", "alpha-thujone", "dmLH@@RYe~IfyjjjkDaIh@" },
								  { "CN1CCC[C@H]1c2cccnc2",	"Nicotine",			"dcm@@@{IDeCEDUSh@UUECP@" },
								  { "CC[C@H](O1)CC[C@@]12CCCO2", "2S,5R-Chalcogran", "dmLD@@qJZY|fFZjjjdbH`@" },
								  { "CCCC",					"butane",			"gC`@Dij@@" },
								  { "C1C.CC1",				"butane",			"gC`@Dij@@" },
								  { "[CH3][CH2][CH2][CH3]",	"butane",			"gC`@Dij@@" },
								  { "C-C-C-C",				"butane",			"gC`@Dij@@" },
								  { "C12.C1.CC2",			"butane",			"gC`@Dij@@" },
								  { "[Na+].[Cl-]",			"NaCl",				"eDARHm@zd@@" },
								  { "[Na+]-[Cl-]",			"NaCl",				"error" },
								  { "[Na+]1.[Cl-]1",		"NaCl",				"error" },
								  { "c1ccccc1",				"benzene",			"gFp@DiTt@@@" },
								  { "C1=C-C=C-C=C1",		"benzene",			"gFp@DiTt@@@" },
								  { "C1:C:C:C:C:C:1",		"benzene",			"gFp@DiTt@@@" },
								  { "c1ccncc1",				"pyridine",			"gFx@@eJf`@@@" },
								  { "[nH]1cccc1",			"pyrrole",			"gKX@@eKcRp@" },
								  { "N1C=C-C=C1",			"pyrrole",			"gKX@@eKcRp@" },
								  { "[H]n1cccc1",			"pyrrole",			"gKX@@eKcRp@" },
								  { "[H]n1cccc1",			"pyrrole",			"gKX@@eKcRp@" },
								  { "c1cncc1",				"pyrrole no [nH]",	"error" },
								  { "[13CH4]",				"C13-methane",		"fH@FJp@" },
								  { "[35ClH]",				"35-chlorane",		"fHdP@qX`" },
								  { "[35Cl-]",				"35-chloride",		"fHtPxAbq@" },
								  { "[Na+].[O-]c1ccccc1",	"Na-phenolate",		"daxHaHCPBXyAYUn`@@@" },
								  { "c1cc([O-].[Na+])ccc1",	"Na-phenolate",		"daxHaHCPBXyAYUn`@@@" },
								  { "C[C@@](C)(O1)C[C@@H](O)[C@@]1(O2)[C@@H](C)[C@@H]3CC=C4[C@]3(C2)C(=O)C[C@H]5[C@H]4CC[C@@H](C6)[C@]5(C)Cc(n7)c6nc(C[C@@]89(C))c7C[C@@H]8CC[C@@H]%10[C@@H]9C[C@@H](O)[C@@]%11(C)C%10=C[C@H](O%12)[C@]%11(O)[C@H](C)[C@]%12(O%13)[C@H](O)C[C@@]%13(C)CO",
									"Cephalostatin-1",
									"gdKe@h@@K`H@XjKHuYlnoP\\bbdRbbVTLbTrJbRaQRRRbTJTRTrfrfTTOBPHtFODPhLNSMdIERYJmShLfs]aqy|uUMUUUUUUE@UUUUMUUUUUUTQUUTPR`nDdQQKB|RIFbiQeARuQt`rSSMNtGS\\ct@@" },
									};

		// one molecule object is reused for all tests; parse() clears it first
		StereoMolecule mol = new StereoMolecule();
		for (String[] test:data) {
			try {
				new SmilesParser().parse(mol, test[0]);
				String idcode = new Canonizer(mol).getIDCode();
				// reaching this point with an expected "error" means that the parser accepted invalid input
				if (test[2].equals("error"))
					System.out.println("Should create error! "+test[1]+" smiles:"+test[0]+" idcode:"+idcode);
				else if (!test[2].equals(idcode))
					System.out.println("ERROR! "+test[1]+" smiles:"+test[0]+" is:"+idcode+" must:"+test[2]);
				}
			catch (Exception e) {
				// an exception is only a failure if the test case was not expected to throw
				if (!test[2].equals("error"))
					System.out.println("ERROR! "+test[1]+" smiles:"+test[0]+" exception:"+e.getMessage());
				}
			}
		}
	}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy