All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.chem.format.SMILES_Parser Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.chem.format;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.hfg.chem.Atom;
import com.hfg.chem.CovalentBond;
import com.hfg.chem.Element;
import com.hfg.chem.Isotope;
import com.hfg.chem.MolecularStructureException;
import com.hfg.chem.Molecule;
import com.hfg.chem.ValenceModel;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;


//------------------------------------------------------------------------------
/**
 Class for parsing a Simplified Molecular-Input Line-Entry System (SMILES)
 string into a Molecule.

 @see <a href="http://opensmiles.org">opensmiles.org</a>
 
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------

// SOME PARSING RULES:
// - A SMILES string is terminated by a whitespace terminator character (space, tab,
//   newline, carriage-return), or by the end of the string.
// - A SMILES parser should accept at least four digits for the atom class, and the values 0 to 9999.
// - Ring-closure numbers can be reused.
public class SMILES_Parser { private String mSMILES; private int mSMILES_Length; private int mCurrentIndex; private boolean mEND; private Map mRingClosureMap; private ValenceModel mValenceModel = ValenceModel.MDL_2017; // TODO: Switch to SMILES-specific valence model //########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public synchronized Molecule parse(String inValue) { Molecule molecule = null; if (StringUtil.isSet(inValue)) { init(); mSMILES = inValue.trim(); mSMILES_Length = mSMILES.length(); List atoms = new ArrayList<>(10); try { Atom atom = parseAtom(); atoms.add(atom); Integer hCount = atom.getHCount(); if (hCount != null) { for (int i = 0; i < hCount; i++) { Atom hAtom = new Atom(Element.HYDROGEN); atoms.add(hAtom); CovalentBond hBond = new CovalentBond(atom, hAtom); atom.addBond(hBond); hAtom.addBond(hBond); } } List atomList = parseSection(atom, (char) -1); if (CollectionUtil.hasValues(atomList)) { atoms.addAll(atomList); } // Check to see if any rings were left open if (CollectionUtil.hasValues(mRingClosureMap)) { throw new SMILES_ParseException("Ring " + mRingClosureMap.keySet().iterator().next() + " left unclosed in " + mSMILES + "!"); } // We've finished parsing the string. Now build the molecule. 
molecule = new Molecule().addAtoms(atoms); // Set implicit hydrogen counts for (Atom molAtom : new ArrayList<>(molecule.getAtoms())) { int implicitHCount = mValenceModel.calculateImplicitHCount(molAtom); if (implicitHCount > 0) { for (int i = 0; i < implicitHCount; i++) { Atom hAtom = new Atom(Element.HYDROGEN); molecule.addAtom(hAtom); CovalentBond bond = new CovalentBond(molAtom, hAtom); molAtom.addBond(bond); hAtom.addBond(bond); } } } } catch (MolecularStructureException e) { throw new SMILES_ParseException("Problem parsing " + StringUtil.singleQuote(mSMILES) + "!", e); } } return molecule; } //########################################################################### // PRIVATE METHODS //########################################################################### //--------------------------------------------------------------------------- private void init() { mCurrentIndex = 0; mEND = false; mRingClosureMap = new HashMap<>(4); } //--------------------------------------------------------------------------- private char peek() { char theChar = (char) -1; if (mCurrentIndex < mSMILES_Length) { theChar = mSMILES.charAt(mCurrentIndex); } else { mEND = true; } return theChar; } //--------------------------------------------------------------------------- private char nextChar() { char theChar; if (mCurrentIndex >= mSMILES_Length) { mEND = true; theChar = (char) -1; } else { theChar = mSMILES.charAt(mCurrentIndex++); if (mCurrentIndex >= mSMILES_Length) { mEND = true; } } return theChar; } //--------------------------------------------------------------------------- private List parseSection(Atom inPrevAtom, char inEndingChar) { List atoms = new ArrayList<>(5); CovalentBond bond = null; Atom atom; Atom prevAtom = inPrevAtom; Integer hCount; boolean ionicBond = false; char theChar = peek(); while (! 
mEND && theChar != inEndingChar) { if ('[' == theChar || Character.isLetter(theChar)) { // Letter indicates an atom (and a default single bond between them) atom = parseAtom(); atoms.add(atom); hCount = atom.getHCount(); if (hCount != null) { for (int i = 0; i < hCount; i++) { Atom hAtom = new Atom(Element.HYDROGEN); atoms.add(hAtom); CovalentBond hBond = new CovalentBond(atom, hAtom); atom.addBond(hBond); hAtom.addBond(hBond); } } if (ionicBond) { ionicBond = false; } else { if (null == bond) { bond = new CovalentBond(prevAtom, atom); // A bond order of 1 is assumed prevAtom.addBond(bond); } else { bond.setSecondAtom(atom); } atom.addBond(bond); } prevAtom = atom; bond = null; } else if ('(' == theChar) { atoms.addAll(parseBranch(prevAtom)); } else if (Character.isDigit(theChar) || '%' == theChar) // '%' preceeds the use of a 2-digit ring-closure number { String ringClosureNumString = ""; if ('%' == theChar) { nextChar(); // Consume the '%' theChar = peek(); ringClosureNumString += theChar; nextChar(); // Consume the firstDigit theChar = peek(); } ringClosureNumString += theChar; if (! StringUtil.isNumber(ringClosureNumString)) { throw new SMILES_ParseException("The '%' at position " + (mCurrentIndex) + " of " + StringUtil.singleQuote(mSMILES) + " should precede a 2-digit ring-closure number!"); } // Ring-closure number int ringClosureNum = Integer.parseInt(ringClosureNumString); CovalentBond ringClosureBond = mRingClosureMap.remove(ringClosureNum); if (ringClosureBond != null) { // Don't leave a half-processed bond (we didn't know yet that it was a ring bond) Integer specifiedBondOrder = null; if (bond != null) { specifiedBondOrder = bond.getSpecifiedBondOrder(); prevAtom.removeBond(bond); if (specifiedBondOrder != null && ringClosureBond.getSpecifiedBondOrder() != null && ! 
ringClosureBond.getSpecifiedBondOrder().equals(specifiedBondOrder)) { throw new SMILES_ParseException("Ring-closure bond mismatch at position " + (mCurrentIndex + 1) + " of " + mSMILES + "!"); } ringClosureBond.setBondOrder(specifiedBondOrder); } // Close the ring ringClosureBond.setSecondAtom(prevAtom); prevAtom.addBond(ringClosureBond); } else { // Start the ring if (null == bond) { bond = new CovalentBond(prevAtom, null); prevAtom.addBond(bond); mRingClosureMap.put(ringClosureNum, bond); } mRingClosureMap.put(ringClosureNum, bond); bond = null; } nextChar(); // Consume the peeked char } else { switch (theChar) { case '-': // Single bond bond = new CovalentBond(prevAtom, 1); prevAtom.addBond(bond); break; case '=': // Double bond bond = new CovalentBond(prevAtom, 2); prevAtom.addBond(bond); break; case '#': // Triple bond bond = new CovalentBond(prevAtom, 3); prevAtom.addBond(bond); break; case '$': // Quadruple bond bond = new CovalentBond(prevAtom, 4); prevAtom.addBond(bond); break; case ':': // aromatic bond bond = new CovalentBond(prevAtom, 1).setIsAromatic(); prevAtom.addBond(bond); break; case '.': // Ionic bond (non-covalent) bond = null; ionicBond = true; break; case '/': // Cis/Trans "up" bond bond = new CovalentBond(prevAtom, 1).setIsUp(); prevAtom.addBond(bond); break; case '\\': // Cis/Trans "down" bond bond = new CovalentBond(prevAtom, 1).setIsDown(); prevAtom.addBond(bond); break; case ' ': case '\t': case '\r': case '\n': case (char) -1: mEND = true; break; default: throw new SMILES_ParseException("Unexpected character '" + theChar + "' at position " + (mCurrentIndex + 1) + " in " + StringUtil.singleQuote(mSMILES) + "!"); } nextChar(); // Consume the peeked char } theChar = peek(); } if (theChar == inEndingChar) { nextChar(); // Consume it } return atoms; } //--------------------------------------------------------------------------- private Atom parseAtom() { char theChar = nextChar(); boolean inBrackets = (theChar == '['); if (inBrackets) { 
theChar = nextChar(); } String isotopeString = ""; while (! mEND && Character.isDigit(theChar)) { isotopeString += theChar; theChar = nextChar(); } String symbol = ""; boolean aromatic = false; if (Character.isLowerCase(theChar)) { // Lowercase symbols indicate aromatic atoms if ("bcnospat".indexOf(theChar) < 0) // s for S or Se; a for As, t for Te { throw new SMILES_ParseException("Unexpected aromatic symbol '" + theChar + "' at position " + (mCurrentIndex + 1) + " of " + StringUtil.singleQuote(mSMILES) + "!"); } symbol += Character.toUpperCase(theChar); aromatic = true; } else { symbol += theChar; } theChar = peek(); if (inBrackets || ! aromatic) { // Two char element symbols are possible if (Character.isLetter(theChar)) { // It could be a second letter of an element symbol or it could be the next element if (Element.valueOf(symbol + theChar) != null) { symbol += theChar; nextChar(); // Consume the second symbol letter theChar = peek(); } } } Element element = Element.valueOf(symbol); if (isotopeString.length() > 0) { element = Isotope.valueOf(element, Integer.parseInt(isotopeString)); } Atom atom = new Atom(element); if (aromatic) { atom.setIsAromatic(true); } // Chirality (optional) if ('@' == theChar) { Atom.ChiralityOrder chiralityOrder = Atom.ChiralityOrder.anticlockwise; nextChar(); // Consume the character theChar = peek(); if ('@' == theChar) { chiralityOrder = Atom.ChiralityOrder.clockwise; nextChar(); // Consume the character theChar = peek(); } atom.setChiralityOrder(chiralityOrder); } // Explicit hydrogen count (optional) if ('H' == theChar) { int hCount = 1; nextChar(); // Consume the character theChar = peek(); if (Character.isDigit(theChar)) { hCount = Integer.parseInt(theChar + ""); nextChar(); // Consume the character theChar = peek(); } atom.setHCount(hCount); } else if (inBrackets) { atom.setHCount(0); } // Charge (optional) if (inBrackets) { if ('+' == theChar) { int charge = 1; nextChar(); // Consume the character theChar = peek(); if ('+' == 
theChar) // ++ { charge = 2; nextChar(); // Consume the character theChar = peek(); } else { String chargeString = ""; while (Character.isDigit(theChar)) { chargeString += theChar; nextChar(); // Consume the character theChar = peek(); } if (chargeString.length() > 0) { charge = Integer.parseInt(chargeString); } } atom.setCharge(charge); } else if ('-' == theChar) { int charge = -1; nextChar(); // Consume the character theChar = peek(); if ('-' == theChar) // ++ { charge = -2; nextChar(); // Consume the character theChar = peek(); } else { String chargeString = ""; while (Character.isDigit(theChar)) { chargeString += theChar; nextChar(); // Consume the character theChar = peek(); } if (chargeString.length() > 0) { charge = -1 * Integer.parseInt(chargeString); } } atom.setCharge(charge); } // Atom class if (':' == theChar) { nextChar(); // Consume the character theChar = peek(); String classString = ""; while (Character.isDigit(theChar)) { classString += theChar; nextChar(); // Consume the character theChar = peek(); } atom.setAtomClass(Integer.parseInt(classString)); } } if (']' == theChar) { if (inBrackets) { // We were expecting this ending bracket ++mCurrentIndex; } else { throw new SMILES_ParseException("Unexpected ending atom bracket at position " + (mCurrentIndex + 1) + "!"); } } else if (inBrackets) { throw new SMILES_ParseException("Missing ending atom bracket at position " + (mCurrentIndex + 1) + "!"); } return atom; } //--------------------------------------------------------------------------- private List parseBranch(Atom inPrevAtom) { char theChar = nextChar(); if ('(' != theChar) { throw new SMILES_ParseException("SMILES branch missing starting parenthesis at position " + (mCurrentIndex + 1) + "!"); } return parseSection(inPrevAtom, ')'); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy