// com.hfg.chem.format.SMILES_Parser Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of com_hfg Show documentation
// Show all versions of com_hfg Show documentation
// com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.chem.format;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.hfg.chem.Atom;
import com.hfg.chem.CovalentBond;
import com.hfg.chem.Element;
import com.hfg.chem.Isotope;
import com.hfg.chem.MolecularStructureException;
import com.hfg.chem.Molecule;
import com.hfg.chem.ValenceModel;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
//------------------------------------------------------------------------------
/**
Class for parsing a Simplified Molecular-Input Line-Entry System (SMILES)
string into a Molecule.
@see <a href="http://opensmiles.org/">opensmiles.org</a>
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
// SOME PARSING RULES:
// - A SMILES string is terminated by a whitespace terminator character (space, tab,
// newline, carriage-return), or by the end of the string.
// - A SMILES parser should accept at least four digits for the atom class, and the values 0 to 9999.
// - Ring-closure numbers can be reused.
public class SMILES_Parser
{
// The (trimmed) SMILES string currently being parsed.
private String mSMILES;
// Cached length of mSMILES.
private int mSMILES_Length;
// Index of the next unread character in mSMILES.
private int mCurrentIndex;
// Set once the last character has been consumed or a whitespace terminator has been seen.
private boolean mEND;
// Ring bonds that have been opened but not yet closed, keyed by their ring-closure number.
// The type parameters are required: parseSection() assigns the result of remove() directly
// to a CovalentBond, which does not compile against a raw Map.
private Map<Integer, CovalentBond> mRingClosureMap;
private ValenceModel mValenceModel = ValenceModel.MDL_2017; // TODO: Switch to SMILES-specific valence model
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Parses a SMILES string into a Molecule.
 <p>
 A hydrogen count given explicitly on an atom is expanded into hydrogen Atoms bonded to
 that atom. Once the whole string has been read, the valence model is asked for the
 implicit hydrogen count of every atom in the molecule and those hydrogens are added
 in the same way.
 </p>
 @param inValue the SMILES string to parse; leading and trailing whitespace is trimmed
 @return the parsed Molecule, or null if StringUtil.isSet(inValue) is false or the
         trimmed value is empty
 @throws SMILES_ParseException if a ring is left unclosed or if a
         MolecularStructureException is raised while parsing
 */
public synchronized Molecule parse(String inValue)
{
    Molecule molecule = null;

    if (StringUtil.isSet(inValue))
    {
        init();

        mSMILES = inValue.trim();
        mSMILES_Length = mSMILES.length();

        // Guard in case a whitespace-only value got past the isSet() check:
        // there is nothing to parse, so treat it like an unset value.
        if (0 == mSMILES_Length)
        {
            return null;
        }

        List<Atom> atoms = new ArrayList<>(10);

        try
        {
            // The first atom has no predecessor so it is parsed here rather than in parseSection()
            Atom atom = parseAtom();
            atoms.add(atom);

            // Expand an explicitly specified hydrogen count into real hydrogen atoms
            Integer hCount = atom.getHCount();
            if (hCount != null)
            {
                for (int i = 0; i < hCount; i++)
                {
                    Atom hAtom = new Atom(Element.HYDROGEN);
                    atoms.add(hAtom);

                    CovalentBond hBond = new CovalentBond(atom, hAtom);
                    atom.addBond(hBond);
                    hAtom.addBond(hBond);
                }
            }

            // (char) -1 is the end-of-input sentinel returned by peek() / nextChar()
            List<Atom> atomList = parseSection(atom, (char) -1);
            if (CollectionUtil.hasValues(atomList))
            {
                atoms.addAll(atomList);
            }

            // Check to see if any rings were left open
            if (CollectionUtil.hasValues(mRingClosureMap))
            {
                throw new SMILES_ParseException("Ring " + mRingClosureMap.keySet().iterator().next() + " left unclosed in " + mSMILES + "!");
            }

            // We've finished parsing the string. Now build the molecule.
            molecule = new Molecule().addAtoms(atoms);

            // Set implicit hydrogen counts.
            // Iterate over a copy since atoms are added to the molecule inside the loop.
            for (Atom molAtom : new ArrayList<>(molecule.getAtoms()))
            {
                int implicitHCount = mValenceModel.calculateImplicitHCount(molAtom);
                for (int i = 0; i < implicitHCount; i++)
                {
                    Atom hAtom = new Atom(Element.HYDROGEN);
                    molecule.addAtom(hAtom);

                    CovalentBond bond = new CovalentBond(molAtom, hAtom);
                    molAtom.addBond(bond);
                    hAtom.addBond(bond);
                }
            }
        }
        catch (MolecularStructureException e)
        {
            throw new SMILES_ParseException("Problem parsing " + StringUtil.singleQuote(mSMILES) + "!", e);
        }
    }

    return molecule;
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Resets the per-parse state: read position, end-of-input flag, and the map
 of currently open ring closures.
 */
private void init()
{
    // Start with a fresh map (initial capacity 4) so nothing from a previous parse is kept
    mRingClosureMap = new HashMap<>(4);
    mEND = false;
    mCurrentIndex = 0;
}
//---------------------------------------------------------------------------
/**
 Returns the character at the current position without consuming it.
 @return the next unread character, or (char) -1 if the read position is at or past
         the end of the string (in which case the end-of-input flag is also set)
 */
private char peek()
{
    if (mCurrentIndex >= mSMILES_Length)
    {
        // Nothing left to look at
        mEND = true;
        return (char) -1;
    }

    return mSMILES.charAt(mCurrentIndex);
}
//---------------------------------------------------------------------------
/**
 Consumes and returns the character at the current position.
 The end-of-input flag is set as soon as the last character has been handed out,
 as well as on any call made when nothing is left.
 @return the consumed character, or (char) -1 if there was nothing left to consume
 */
private char nextChar()
{
    char consumed = (char) -1;

    if (mCurrentIndex < mSMILES_Length)
    {
        consumed = mSMILES.charAt(mCurrentIndex);
        mCurrentIndex++;
    }

    // True both when we were already at the end and when we just consumed the last character
    if (mCurrentIndex >= mSMILES_Length)
    {
        mEND = true;
    }

    return consumed;
}
//---------------------------------------------------------------------------
/**
 Parses a run of atoms, bonds, ring-closure numbers and nested branches, starting at the
 current position and continuing until the ending character, a whitespace terminator,
 or the end of the string is reached.
 @param inPrevAtom the atom that the first atom of this section bonds to
 @param inEndingChar the character that terminates the section: ')' for a branch or
        (char) -1 (the end-of-input sentinel) for the top-level section. If found, it is consumed.
 @return the atoms created while parsing the section (including hydrogens expanded from
         explicit hydrogen counts and the atoms of nested branches), in the order encountered
 @throws SMILES_ParseException on an unexpected character, a malformed '%' ring-closure number,
         conflicting bond orders on the two ends of a ring closure, or a branch that is never closed
 */
private List<Atom> parseSection(Atom inPrevAtom, char inEndingChar)
{
    List<Atom> atoms = new ArrayList<>(5);

    // A bond whose symbol has been read but whose second atom has not been seen yet
    CovalentBond bond = null;
    Atom prevAtom = inPrevAtom;
    // Set by '.' so that the next atom is not bonded to the previous one
    boolean ionicBond = false;

    char theChar = peek();
    while (! mEND
           && theChar != inEndingChar)
    {
        if ('[' == theChar
            || Character.isLetter(theChar))
        {
            // Letter indicates an atom (and a default single bond between them)
            Atom atom = parseAtom();
            atoms.add(atom);

            // Expand an explicitly specified hydrogen count into real hydrogen atoms
            Integer hCount = atom.getHCount();
            if (hCount != null)
            {
                for (int i = 0; i < hCount; i++)
                {
                    Atom hAtom = new Atom(Element.HYDROGEN);
                    atoms.add(hAtom);

                    CovalentBond hBond = new CovalentBond(atom, hAtom);
                    atom.addBond(hBond);
                    hAtom.addBond(hBond);
                }
            }

            if (ionicBond)
            {
                ionicBond = false;
            }
            else
            {
                if (null == bond)
                {
                    bond = new CovalentBond(prevAtom, atom); // A bond order of 1 is assumed
                    prevAtom.addBond(bond);
                }
                else
                {
                    // The bond was already attached to prevAtom when its symbol was read
                    bond.setSecondAtom(atom);
                }

                atom.addBond(bond);
            }

            prevAtom = atom;
            bond = null;
        }
        else if ('(' == theChar)
        {
            atoms.addAll(parseBranch(prevAtom));
        }
        else if (Character.isDigit(theChar)
                 || '%' == theChar) // '%' precedes the use of a 2-digit ring-closure number
        {
            String ringClosureNumString = "";
            if ('%' == theChar)
            {
                nextChar(); // Consume the '%'
                theChar = peek();
                ringClosureNumString += theChar;
                nextChar(); // Consume the first digit
                theChar = peek();
            }

            ringClosureNumString += theChar;

            if (! StringUtil.isNumber(ringClosureNumString))
            {
                throw new SMILES_ParseException("The '%' at position " + (mCurrentIndex)
                                                + " of " + StringUtil.singleQuote(mSMILES)
                                                + " should precede a 2-digit ring-closure number!");
            }

            // Ring-closure number
            int ringClosureNum = Integer.parseInt(ringClosureNumString);

            // Removing the entry on close is what allows ring-closure numbers to be reused
            CovalentBond ringClosureBond = mRingClosureMap.remove(ringClosureNum);
            if (ringClosureBond != null)
            {
                // Don't leave a half-processed bond (we didn't know yet that it was a ring bond)
                if (bond != null)
                {
                    Integer specifiedBondOrder = bond.getSpecifiedBondOrder();
                    prevAtom.removeBond(bond);

                    if (specifiedBondOrder != null
                        && ringClosureBond.getSpecifiedBondOrder() != null
                        && ! ringClosureBond.getSpecifiedBondOrder().equals(specifiedBondOrder))
                    {
                        throw new SMILES_ParseException("Ring-closure bond mismatch at position " + (mCurrentIndex + 1) + " of " + mSMILES + "!");
                    }

                    ringClosureBond.setBondOrder(specifiedBondOrder);

                    // The pending bond has been merged into the ring bond and detached from prevAtom.
                    // Forget it so that the next atom gets its own bond to prevAtom instead of
                    // being attached to this discarded one.
                    bond = null;
                }

                // Close the ring
                ringClosureBond.setSecondAtom(prevAtom);
                prevAtom.addBond(ringClosureBond);
            }
            else
            {
                // Start the ring
                if (null == bond)
                {
                    bond = new CovalentBond(prevAtom, null);
                    prevAtom.addBond(bond);
                }

                mRingClosureMap.put(ringClosureNum, bond);
                bond = null;
            }

            nextChar(); // Consume the peeked char
        }
        else
        {
            switch (theChar)
            {
                case '-': // Single bond
                    bond = new CovalentBond(prevAtom, 1);
                    prevAtom.addBond(bond);
                    break;

                case '=': // Double bond
                    bond = new CovalentBond(prevAtom, 2);
                    prevAtom.addBond(bond);
                    break;

                case '#': // Triple bond
                    bond = new CovalentBond(prevAtom, 3);
                    prevAtom.addBond(bond);
                    break;

                case '$': // Quadruple bond
                    bond = new CovalentBond(prevAtom, 4);
                    prevAtom.addBond(bond);
                    break;

                case ':': // aromatic bond
                    bond = new CovalentBond(prevAtom, 1).setIsAromatic();
                    prevAtom.addBond(bond);
                    break;

                case '.': // Ionic bond (non-covalent)
                    bond = null;
                    ionicBond = true;
                    break;

                case '/': // Cis/Trans "up" bond
                    bond = new CovalentBond(prevAtom, 1).setIsUp();
                    prevAtom.addBond(bond);
                    break;

                case '\\': // Cis/Trans "down" bond
                    bond = new CovalentBond(prevAtom, 1).setIsDown();
                    prevAtom.addBond(bond);
                    break;

                // A whitespace terminator ends the SMILES string
                case ' ':
                case '\t':
                case '\r':
                case '\n':
                case (char) -1:
                    mEND = true;
                    break;

                default:
                    throw new SMILES_ParseException("Unexpected character '" + theChar + "' at position " + (mCurrentIndex + 1) + " in " + StringUtil.singleQuote(mSMILES) + "!");
            }

            nextChar(); // Consume the peeked char
        }

        theChar = peek();
    }

    if (theChar == inEndingChar)
    {
        nextChar(); // Consume it
    }
    else if (inEndingChar != (char) -1)
    {
        // We ran out of input inside a branch without seeing its ending character.
        // Reject it the same way an unclosed ring is rejected.
        throw new SMILES_ParseException("Branch left unclosed in " + StringUtil.singleQuote(mSMILES) + "!");
    }

    return atoms;
}
//---------------------------------------------------------------------------
/**
 Parses a single atom, either bare (e.g. "C", "Cl", "c") or bracketed
 (e.g. "[13CH4]", "[NH4+]", "[C@@H]", "[CH3:2]"), starting at the current position.
 <p>
 The pieces are read in this order: optional isotope digits, the element symbol
 (a lowercase first letter marks the atom as aromatic), optional chirality ('@' or '@@'),
 an optional explicit hydrogen count ('H' plus an optional single digit), and - inside
 brackets only - an optional charge and an optional atom class. A bracketed atom without
 an 'H' gets an explicit hydrogen count of 0.
 </p>
 @return the parsed Atom
 @throws SMILES_ParseException for an unsupported aromatic symbol, an atom class without digits,
         or a missing or unexpected closing bracket
 */
private Atom parseAtom()
{
    char theChar = nextChar();

    boolean inBrackets = (theChar == '[');
    if (inBrackets)
    {
        theChar = nextChar();
    }

    // Isotope (optional)
    StringBuilder isotopeDigits = new StringBuilder();
    while (! mEND
           && Character.isDigit(theChar))
    {
        isotopeDigits.append(theChar);
        theChar = nextChar();
    }

    String symbol = "";
    boolean aromatic = false;
    if (Character.isLowerCase(theChar))
    {
        // Lowercase symbols indicate aromatic atoms
        if ("bcnospat".indexOf(theChar) < 0) // s for S or Se; a for As, t for Te
        {
            throw new SMILES_ParseException("Unexpected aromatic symbol '" + theChar + "' at position " + (mCurrentIndex + 1) + " of " + StringUtil.singleQuote(mSMILES) + "!");
        }

        symbol += Character.toUpperCase(theChar);
        aromatic = true;
    }
    else
    {
        symbol += theChar;
    }

    theChar = peek();

    if (inBrackets || ! aromatic)
    {
        // Two char element symbols are possible
        if (Character.isLetter(theChar))
        {
            // Outside of brackets, a lowercase b/c/n/o/p/s that follows an organic-subset atom
            // is the next (aromatic) atom and not the second letter of a symbol. Without this
            // check "CSc1ccccc1" would be read as C + Sc (scandium) and, where the element
            // table has copernicium, "Cn1cccc1" as Cn.
            boolean startsNextAromaticAtom = ! inBrackets
                                             && "BCNOPSFI".indexOf(symbol) >= 0
                                             && "bcnops".indexOf(theChar) >= 0;

            // It could be a second letter of an element symbol or it could be the next element
            if (! startsNextAromaticAtom
                && Element.valueOf(symbol + theChar) != null)
            {
                symbol += theChar;
                nextChar(); // Consume the second symbol letter
                theChar = peek();
            }
        }
    }

    Element element = Element.valueOf(symbol);
    if (isotopeDigits.length() > 0)
    {
        element = Isotope.valueOf(element, Integer.parseInt(isotopeDigits.toString()));
    }

    Atom atom = new Atom(element);
    if (aromatic)
    {
        atom.setIsAromatic(true);
    }

    // Chirality (optional)
    if ('@' == theChar)
    {
        Atom.ChiralityOrder chiralityOrder = Atom.ChiralityOrder.anticlockwise;

        nextChar(); // Consume the character
        theChar = peek();
        if ('@' == theChar)
        {
            chiralityOrder = Atom.ChiralityOrder.clockwise;
            nextChar(); // Consume the character
            theChar = peek();
        }

        atom.setChiralityOrder(chiralityOrder);
    }

    // Explicit hydrogen count (optional)
    if ('H' == theChar)
    {
        int hCount = 1;
        nextChar(); // Consume the character
        theChar = peek();
        if (Character.isDigit(theChar))
        {
            hCount = Character.digit(theChar, 10);
            nextChar(); // Consume the character
            theChar = peek();
        }

        atom.setHCount(hCount);
    }
    else if (inBrackets)
    {
        atom.setHCount(0);
    }

    if (inBrackets)
    {
        // Charge (optional): '+', '-', '++', '--', or a sign followed by digits
        if ('+' == theChar
            || '-' == theChar)
        {
            char signChar = theChar;
            int sign = ('+' == signChar ? 1 : -1);
            int magnitude = 1;

            nextChar(); // Consume the sign
            theChar = peek();
            if (signChar == theChar) // ++ or --
            {
                magnitude = 2;
                nextChar(); // Consume the character
                theChar = peek();
            }
            else
            {
                StringBuilder chargeDigits = new StringBuilder();
                while (Character.isDigit(theChar))
                {
                    chargeDigits.append(theChar);
                    nextChar(); // Consume the character
                    theChar = peek();
                }

                if (chargeDigits.length() > 0)
                {
                    magnitude = Integer.parseInt(chargeDigits.toString());
                }
            }

            atom.setCharge(sign * magnitude);
        }

        // Atom class (optional)
        if (':' == theChar)
        {
            nextChar(); // Consume the character
            theChar = peek();

            StringBuilder classDigits = new StringBuilder();
            while (Character.isDigit(theChar))
            {
                classDigits.append(theChar);
                nextChar(); // Consume the character
                theChar = peek();
            }

            // Report a ':' without digits as a parse problem instead of letting
            // Integer.parseInt("") escape as a NumberFormatException
            if (0 == classDigits.length())
            {
                throw new SMILES_ParseException("Atom class digits expected after ':' at position " + (mCurrentIndex + 1) + " of " + StringUtil.singleQuote(mSMILES) + "!");
            }

            atom.setAtomClass(Integer.parseInt(classDigits.toString()));
        }
    }

    if (']' == theChar)
    {
        if (inBrackets)
        {
            // We were expecting this ending bracket
            nextChar(); // Consume it
        }
        else
        {
            throw new SMILES_ParseException("Unexpected ending atom bracket at position " + (mCurrentIndex + 1) + "!");
        }
    }
    else if (inBrackets)
    {
        throw new SMILES_ParseException("Missing ending atom bracket at position " + (mCurrentIndex + 1) + "!");
    }

    return atom;
}
//---------------------------------------------------------------------------
/**
 Parses a parenthesized branch. The current position must be at the opening '(';
 the branch contents up to and including the matching ')' are handled by parseSection().
 @param inPrevAtom the atom that the branch is attached to
 @return the atoms created while parsing the branch
 @throws SMILES_ParseException if the character at the current position is not '('
 */
private List<Atom> parseBranch(Atom inPrevAtom)
{
    char theChar = nextChar(); // Consume the opening parenthesis
    if ('(' != theChar)
    {
        throw new SMILES_ParseException("SMILES branch missing starting parenthesis at position " + (mCurrentIndex + 1) + "!");
    }

    return parseSection(inPrevAtom, ')');
}
}