// uk.ac.ebi.beam.Parser Maven / Gradle / Ivy
/*
* Copyright (c) 2013, European Bioinformatics Institute (EMBL-EBI)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the FreeBSD Project.
*/
package uk.ac.ebi.beam;
import java.util.*;
import static java.util.Map.Entry;
/**
 * Parse a SMILES string and create a {@link Graph}. A new parser should be
 * created for each invocation; for convenience, {@link #parse(String)} is
 * provided.
 *
 * <blockquote><pre>
 * Graph g = Parser.parse("CCO");
 * </pre></blockquote>
 *
 * @author John May
 */
final class Parser {
/**
 * Keep track of branching.
 */
private final IntStack stack = new IntStack(10);

/**
 * Molecule being loaded.
 */
private final Graph g;

/**
 * Keep track of ring information, indexed by ring number.
 */
private RingBond[] rings = new RingBond[10];

/**
 * Local arrangement for ring openings, keyed by vertex.
 */
private final Map<Integer, LocalArrangement> arrangement = new HashMap<>(5);

/**
 * Configurations read for atoms, keyed by vertex. These are turned into
 * topologies once the whole input has been read.
 */
private final Map<Integer, Configuration> configurations = new HashMap<>(5);

/**
 * Current bond.
 */
private Bond bond = Bond.IMPLICIT;

/**
 * Current configuration.
 */
private Configuration configuration = Configuration.UNKNOWN;

/**
 * Which vertices start a new run of tokens. This includes the first vertex
 * and all vertices which immediately follow a 'dot' bond. These are
 * required to correctly store atom topologies.
 */
private final Set<Integer> start = new TreeSet<>();

/**
 * Number of open rings - all rings should be closed.
 */
private int openRings = 0;

/**
 * Strict parsing.
 */
private final boolean strict;

/**
 * Vertices that are an end point of a directional (up/down) bond and whose
 * cis/trans specification is checked after parsing.
 */
private final BitSet checkDirectionalBonds = new BitSet();

/**
 * Buffer position recorded after the most recently read bond symbol.
 */
private int lastBondPos = -1;

/**
 * Recorded string position of each directional edge, used when reporting
 * invalid cis/trans specifications.
 */
private final Map<Edge, Integer> bondStrPos = new HashMap<>();

/**
 * Warning messages collected while parsing.
 */
private final List<String> warnings = new ArrayList<>();

/**
 * Set when a '*' atom, an unknown element or an arbitrary bracket label
 * was read; triggers the aromaticity fix-up in the constructor.
 */
private boolean hasAstrix = false;
/**
* Create a new parser for the specified buffer.
*
* @param buffer character buffer holding a SMILES string
* @throws InvalidSmilesException thrown if the SMILES could not be parsed
*/
Parser(CharBuffer buffer, boolean strict) throws InvalidSmilesException {
this.strict = strict;
g = new Graph(1 + (2 * (buffer.length() / 3)));
readSmiles(buffer);
if (openRings > 0)
throw new InvalidSmilesException("Unclosed ring detected, SMILES may be truncated:", buffer);
if (stack.size() > 1)
throw new InvalidSmilesException("Unclosed branch detected, SMILES may be truncated:", buffer);
start.add(0); // always include first vertex as start
if (g.getFlags(Graph.HAS_STRO) != 0) {
createTopologies(buffer);
}
if (hasAstrix) {
for (int i = 0; i < g.order(); i++) {
Atom atom = g.atom(i);
if (atom.element() == Element.Unknown) {
int nArom = 0;
for (Edge e : g.edges(i)) {
if (e.bond() == Bond.AROMATIC ||
e.bond() == Bond.IMPLICIT && g.atom(e.other(i)).aromatic())
nArom++;
}
if (nArom >= 2) {
if (atom == AtomImpl.AliphaticSubset.Any)
g.setAtom(i, AtomImpl.AromaticSubset.Any);
else
g.setAtom(i,
new AtomImpl.BracketAtom(-1,
Element.Unknown,
atom.label(),
atom.hydrogens(),
atom.charge(),
atom.atomClass(),
true));
}
}
}
}
}
/**
 * Create a new (loose) parser for the specified string. The string is
 * wrapped in a {@link CharBuffer} and handed to
 * {@link #Parser(CharBuffer, boolean)} with strict parsing disabled.
 *
 * @param str SMILES string
 * @throws InvalidSmilesException thrown if the SMILES could not be parsed
 */
Parser(String str) throws InvalidSmilesException {
this(CharBuffer.fromString(str), false);
}
/**
 * Strict parsing of the provided SMILES string. The strict parser will
 * throw more exceptions for unusual input.
 *
 * @param str the SMILES string to process
 * @return a graph created with the strict parser
 * @throws InvalidSmilesException thrown if the SMILES could not be parsed
 */
static Graph strict(String str) throws InvalidSmilesException {
    final CharBuffer buffer = CharBuffer.fromString(str);
    final Parser strictParser = new Parser(buffer, true);
    return strictParser.molecule();
}
/**
 * Loose parsing of the provided SMILES string. The loose parser is more
 * relaxed and will allow abnormal aromatic elements (e.g. 'te') as well as
 * bare 'H', 'D' and 'T' for hydrogen and its isotopes. Note the hydrogen
 * and isotopes are replaced with their correct bracket equivalent.
 *
 * @param str the SMILES string to process
 * @return a graph created with the loose parser
 * @throws InvalidSmilesException thrown if the SMILES could not be parsed
 */
static Graph losse(String str) throws InvalidSmilesException {
    final CharBuffer buffer = CharBuffer.fromString(str);
    final Parser looseParser = new Parser(buffer, false);
    return looseParser.molecule();
}
/**
 * Access the molecule created by the parser. The graph is populated by the
 * constructor, so this simply returns the stored instance.
 *
 * @return the chemical graph for the parsed smiles string
 */
Graph molecule() {
return g;
}
/**
 * Create the topologies (stereo configurations) for the chemical graph. The
 * topologies define spatial arrangement around atoms. After the atom
 * topologies are added, every vertex flagged in
 * {@code checkDirectionalBonds} is inspected together with its double
 * bonded neighbour; if either end carries more than one 'up' or more than
 * one 'down' bond the cis/trans specification is reported as invalid.
 *
 * @param buffer the buffer that was parsed (used for error display)
 * @throws InvalidSmilesException a topology could not be created, or (in
 *                                strict mode) a cis/trans specification
 *                                was invalid
 */
private void createTopologies(CharBuffer buffer) throws InvalidSmilesException {
    // create topologies (stereo configurations)
    for (Entry<Integer, Configuration> entry : configurations.entrySet()) {
        addTopology(entry.getKey(),
                    entry.getValue(),
                    Topology.toExplicit(g, entry.getKey(), entry.getValue()));
    }

    for (int v = checkDirectionalBonds.nextSetBit(0); v >= 0; v = checkDirectionalBonds.nextSetBit(v + 1)) {
        int nUpV = 0;
        int nDownV = 0;
        int w = -1;
        final int degV = g.degree(v);
        for (int j = 0; j < degV; ++j) {
            final Edge e = g.edgeAt(v, j);
            final Bond label = e.bond(v);
            if (label == Bond.UP)
                nUpV++;
            else if (label == Bond.DOWN)
                nDownV++;
            else if (label == Bond.DOUBLE)
                w = e.other(v);
        }

        // no double bond at 'v', nothing to validate
        if (w < 0)
            continue;

        // the other end is handled here, no need to visit it again
        checkDirectionalBonds.clear(w);

        int nUpW = 0;
        int nDownW = 0;
        final int degW = g.degree(w);
        for (int j = 0; j < degW; ++j) {
            final Bond label = g.edgeAt(w, j).bond(w);
            if (label == Bond.UP)
                nUpW++;
            else if (label == Bond.DOWN)
                nDownW++;
        }

        // both ends need at least one directional bond
        if (nUpV + nDownV == 0 || nUpW + nDownW == 0)
            continue;

        if (nUpV > 1 || nDownV > 1)
            reportInvalidCisTrans(v, buffer);
        if (nUpW > 1 || nDownW > 1)
            reportInvalidCisTrans(w, buffer);
    }
}

/**
 * Report an invalid cis/trans specification around vertex 'v': throws in
 * strict mode, otherwise records a warning. The positions of the first and
 * last directional bonds at 'v' are included in the message.
 *
 * @param v      vertex with conflicting directional bonds
 * @param buffer the buffer that was parsed (used for error display)
 * @throws InvalidSmilesException always thrown when parsing strictly
 */
private void reportInvalidCisTrans(int v, CharBuffer buffer) throws InvalidSmilesException {
    int offset1 = -1;
    int offset2 = -1;
    for (Edge e : g.edges(v)) {
        if (e.bond().directional()) {
            if (offset1 < 0)
                offset1 = bondStrPos.get(e);
            else
                offset2 = bondStrPos.get(e);
        }
    }
    final String errorPos = InvalidSmilesException.display(buffer,
                                                           offset1 - buffer.length(),
                                                           offset2 - buffer.length());
    if (strict)
        throw new InvalidSmilesException("Ignored invalid Cis/Trans specification: " + errorPos);
    warnings.add("Ignored invalid Cis/Trans specification: " + errorPos);
}
/**
 * Access the edges of vertex 'u' in the order given by a local
 * arrangement. When no arrangement is provided the edges are returned in
 * the order stored on the graph.
 *
 * @param localArrangement the arrangement around 'u', may be null
 * @param u                a vertex
 * @return the edges incident to 'u'
 */
public List<Edge> getEdges(LocalArrangement localArrangement, int u) {
    if (localArrangement == null)
        return g.edges(u);
    final int[] vs = localArrangement.toArray();
    final List<Edge> edges = new ArrayList<>(vs.length);
    for (int v : vs)
        edges.add(g.edge(u, v));
    return edges;
}
/**
 * Find a neighbour of 'u', other than 'v', that is connected to 'u' by a
 * double bond. Edges are visited in local order and the first match wins.
 *
 * @param u vertex to search around
 * @param v neighbour to exclude
 * @return the double bonded neighbour, or -1 if there is none
 */
private int getOtherDb(int u, int v) {
    for (Edge edge : getLocalEdges(u)) {
        if (edge.bond() == Bond.DOUBLE) {
            final int neighbour = edge.other(u);
            if (neighbour != v) {
                return neighbour;
            }
        }
    }
    return -1;
}
/**
 * Find the two terminal atoms of the chain of double bonds running through
 * 'focus'. Starting from the first two local neighbours of the focus, both
 * ends are advanced in lock-step along further double bonds (see
 * {@link #getOtherDb(int, int)}) until either side has no further double
 * bond.
 *
 * @param focus the central atom
 * @return two element array holding the last vertex reached on each side
 */
private int[] findExtendedTetrahedralEnds(int focus) {
    final List<Edge> es = getLocalEdges(focus);
    int prevEnd1 = focus;
    int prevEnd2 = focus;
    int end1 = es.get(0).other(focus);
    int end2 = es.get(1).other(focus);
    while (end1 >= 0 && end2 >= 0) {
        final int next1 = getOtherDb(end1, prevEnd1);
        prevEnd1 = end1;
        end1 = next1;
        final int next2 = getOtherDb(end2, prevEnd2);
        prevEnd2 = end2;
        end2 = next2;
    }
    return new int[]{prevEnd1, prevEnd2};
}
/**
 * Access the local edges in order. If a local arrangement has been stored
 * for the vertex it determines the order, otherwise the graph order is
 * used.
 *
 * @param end a vertex
 * @return the edges incident to the vertex
 */
private List<Edge> getLocalEdges(int end) {
    return getEdges(arrangement.get(end), end);
}
/**
 * Complicated process to get correct Allene neighbors. The carriers are the
 * (up to two) non-double-bonded neighbours of each terminal atom; where a
 * terminal atom has exactly one implicit hydrogen the terminal atom itself
 * is used as the placeholder.
 *
 * @param focus the focus (central cumulated atom)
 * @return the carrier list (length 4), or null if exactly four carriers
 *         could not be found
 */
public int[] getAlleneCarriers(int focus) {
    final int[] carriers = new int[4];
    int i = 0;
    final int[] ends = findExtendedTetrahedralEnds(focus);
    final int beg = ends[0];
    final int end = ends[1];
    boolean begh = g.implHCount(beg) == 1;
    final boolean endh = g.implHCount(end) == 1;

    for (Edge bEdge : getLocalEdges(beg)) {
        if (bEdge == null) {
            // defensive: treat a missing edge as the implicit reference
            if (i == carriers.length)
                return null;
            carriers[i++] = beg;
            continue;
        }
        final int bnbr = bEdge.other(beg);
        // implicit hydrogen placeholder goes before the first neighbour
        // with a larger index
        if (beg < bnbr && begh) {
            if (i == carriers.length)
                return null;
            carriers[i++] = beg;
            begh = false;
        }
        if (bEdge.bond() == Bond.DOUBLE) {
            // neighbors next to end
            final List<Edge> endEdges = new ArrayList<>(getLocalEdges(end));
            if (endh)
                endEdges.add(1, null);
            for (Edge eEdge : endEdges) {
                if (eEdge != null && eEdge.bond() == Bond.DOUBLE)
                    continue;
                // more than four carriers means the stereo is invalid
                if (i == carriers.length)
                    return null;
                carriers[i++] = eEdge == null ? end : eEdge.other(end);
            }
        } else {
            if (i == carriers.length)
                return null;
            carriers[i++] = bnbr;
        }
    }
    if (i != 4)
        return null;
    return carriers;
}
/**
 * Add a topology for vertex 'u' with configuration 'c'. If the atom 'u' was
 * involved in a ring closure the local arrangement is used instead of the
 * order in the graph. The configuration should be explicit '@TH1' or '@TH2'
 * instead of '@' or '@@'.
 *
 * @param u     a vertex
 * @param input the configuration as it was read for the vertex
 * @param c     explicit configuration of that vertex
 * @throws InvalidSmilesException the neighbours of 'u' are not compatible
 *                                with the configuration
 * @see Topology#toExplicit(Graph, int, Configuration)
 */
private void addTopology(int u, Configuration input, Configuration c) throws
                                                                     InvalidSmilesException {
    // stereo on ring closure - use local arrangement
    final boolean ringClosure = arrangement.containsKey(u);
    int[] us;
    final List<Edge> es;
    if (ringClosure) {
        us = arrangement.get(u).toArray();
        es = getLocalEdges(u);
    } else {
        es = g.edges(u);
        us = new int[g.degree(u)];
        for (int i = 0; i < us.length; i++)
            us[i] = es.get(i).other(u);
    }

    if (c.type() == Configuration.Type.Tetrahedral) {
        us = insertThImplicitRef(u, us); // XXX: temp fix
    } else if (c.type() == Configuration.Type.DoubleBond) {
        us = insertDbImplicitRef(u, us); // XXX: temp fix
    } else if (c.type() == Configuration.Type.ExtendedTetrahedral) {
        g.addFlags(Graph.HAS_EXT_STRO);
        us = getAlleneCarriers(u);
        if (us == null) {
            // invalid allene stereo is only reported for ring closure
            // atoms; otherwise it is silently skipped (existing behaviour)
            if (ringClosure) {
                if (strict)
                    throw new InvalidSmilesException("Invalid Allene stereo");
                warnings.add("Ignored invalid Allene stereochemistry");
            }
            return;
        }
    } else if (input.type() == Configuration.Type.SquarePlanar) {
        us = insertMultipleImplicitRefs(u, us, 4);
    } else if (input.type() == Configuration.Type.TrigonalBipyramidal) {
        us = insertMultipleImplicitRefs(u, us, 5);
    } else if (input.type() == Configuration.Type.Octahedral) {
        us = insertMultipleImplicitRefs(u, us, 6);
    } else if (c.type() == Configuration.Type.SquarePlanar && us.length != 4) {
        rejectTopology("SquarePlanar without 4 explicit neighbours");
        return;
    } else if (c.type() == Configuration.Type.TrigonalBipyramidal && us.length != 5) {
        rejectTopology("TrigonalBipyramidal without 5 explicit neighbours");
        return;
    } else if (c.type() == Configuration.Type.Octahedral && us.length != 6) {
        rejectTopology("Octahedral without 6 explicit neighbours");
        return;
    }

    g.addTopology(Topology.create(u, us, es, c));
}

/**
 * Reject a topology: throws in strict mode, otherwise records the message
 * as a warning so the caller can skip the topology.
 *
 * @param message description of the problem
 * @throws InvalidSmilesException always thrown when parsing strictly
 */
private void rejectTopology(String message) throws InvalidSmilesException {
    if (strict)
        throw new InvalidSmilesException(message);
    warnings.add(message);
}
/**
 * Ensure a tetrahedral neighbour list has four entries. A list of three is
 * padded with the focus 'u' itself as the implicit reference: in first
 * position when 'u' starts a run of tokens, otherwise in second position.
 *
 * @param u  the focus vertex
 * @param vs neighbours of the focus (3 or 4)
 * @return neighbour list of length 4
 * @throws InvalidSmilesException 'vs' did not have 3 or 4 entries
 */
private int[] insertThImplicitRef(int u, int[] vs) throws
                                                   InvalidSmilesException {
    switch (vs.length) {
        case 4:
            return vs;
        case 3:
            break;
        default:
            throw new InvalidSmilesException("Invalid number of vertices for TH1/TH2 stereo chemistry");
    }
    final int refIdx = start.contains(u) ? 0 : 1;
    final int[] padded = new int[4];
    padded[refIdx] = u;
    padded[1 - refIdx] = vs[0];
    padded[2] = vs[1];
    padded[3] = vs[2];
    return padded;
}
/**
 * Pad a neighbour list to 'n' entries for higher-order stereo chemistry by
 * inserting the focus 'u' as the implicit reference(s). The references are
 * inserted at the front when 'u' starts a run of tokens, otherwise directly
 * after the first neighbour.
 *
 * @param u  the focus vertex
 * @param vs neighbours of the focus
 * @param n  the required number of neighbours
 * @return neighbour list of length 'n'
 * @throws InvalidSmilesException 'vs' has fewer than two or more than 'n'
 *                                entries
 */
private int[] insertMultipleImplicitRefs(int u, int[] vs, int n) throws
                                                                 InvalidSmilesException {
    if (vs.length == n)
        return vs;
    if (vs.length <= 1)
        throw new InvalidSmilesException("Cannot have <= 1 vertices for high-order stereo chemistry");
    // previously this overflowed the padded array with an unchecked exception
    if (vs.length > n)
        throw new InvalidSmilesException("Cannot have > " + n + " vertices for high-order stereo chemistry");

    final int missing = n - vs.length;
    final int lead = start.contains(u) ? 0 : 1;
    final int[] padded = new int[n];
    System.arraycopy(vs, 0, padded, 0, lead);
    Arrays.fill(padded, lead, lead + missing, u);
    System.arraycopy(vs, lead, padded, lead + missing, vs.length - lead);
    return padded;
}
/**
 * Ensure a double-bond neighbour list has three entries. A list of two is
 * padded with the focus 'u' itself as the implicit reference: in first
 * position when 'u' starts a run of tokens, otherwise in second position.
 *
 * @param u  the focus vertex
 * @param vs neighbours of the focus (2 or 3)
 * @return neighbour list of length 3
 * @throws InvalidSmilesException 'vs' did not have 2 or 3 entries
 */
// XXX: temporary fix for correcting configurations
private int[] insertDbImplicitRef(int u, int[] vs) throws
                                                   InvalidSmilesException {
    if (vs.length == 3)
        return vs;
    if (vs.length != 2)
        throw new InvalidSmilesException("Invalid number of vertices for DB1/DB2 stereo chemistry");
    if (start.contains(u))
        return new int[]{u, vs[0], vs[1]};
    else
        return new int[]{vs[0], u, vs[1]};
}
/**
 * Add an atom to the graph and, when there is a previous atom on the stack
 * and the pending bond is not a 'dot', connect the two. After a 'dot' bond
 * the new vertex is recorded as the start of a new run. The pending bond is
 * reset and any pending configuration is stored against the new vertex.
 *
 * @param a      an atom to add
 * @param buffer the buffer being parsed
 * @throws InvalidSmilesException declared for callers; see the visible body
 */
private void addAtom(Atom a, CharBuffer buffer) throws InvalidSmilesException {
    final int v = g.addAtom(a);

    if (!stack.empty()) {
        final int u = stack.pop();
        if (bond == Bond.DOT) {
            start.add(v); // start of a new run
        } else {
            final Edge edge = new Edge(u, v, bond);
            if (bond.directional()) {
                bondStrPos.put(edge, lastBondPos);
                checkDirectionalBonds.set(u);
                checkDirectionalBonds.set(v);
            }
            g.addEdge(edge);
            if (arrangement.containsKey(u)) {
                arrangement.get(u).add(v);
            }
        }
    }

    stack.push(v);
    bond = Bond.IMPLICIT;

    // configurations used to create topologies after parsing
    if (configuration == Configuration.UNKNOWN) {
        return;
    }
    g.addFlags(Graph.HAS_ATM_STRO);
    configurations.put(v, configuration);
    configuration = Configuration.UNKNOWN;
}
/**
 * Read a molecule from the character buffer. Characters are dispatched one
 * at a time: atoms are added to the graph, bond symbols set the pending
 * bond, digits and '%' handle ring bonds and parentheses handle branching.
 * A space or tab ends the SMILES and the rest of the line becomes the
 * title; a line break ends parsing.
 *
 * @param buffer a character buffer
 * @throws InvalidSmilesException invalid grammar
 */
private void readSmiles(final CharBuffer buffer) throws
                                                 InvalidSmilesException {
    // primary dispatch
    while (buffer.hasRemaining()) {
        final char c = buffer.get();
        switch (c) {

            // aliphatic subset
            case '*':
                hasAstrix = true;
                addAtom(AtomImpl.AliphaticSubset.Any, buffer);
                break;
            case 'B':
                if (buffer.getIf('r'))
                    addAtom(AtomImpl.AliphaticSubset.Bromine, buffer);
                else
                    addAtom(AtomImpl.AliphaticSubset.Boron, buffer);
                break;
            case 'C':
                if (buffer.getIf('l'))
                    addAtom(AtomImpl.AliphaticSubset.Chlorine, buffer);
                else
                    addAtom(AtomImpl.AliphaticSubset.Carbon, buffer);
                break;
            case 'N':
                addAtom(AtomImpl.AliphaticSubset.Nitrogen, buffer);
                break;
            case 'O':
                addAtom(AtomImpl.AliphaticSubset.Oxygen, buffer);
                break;
            case 'P':
                addAtom(AtomImpl.AliphaticSubset.Phosphorus, buffer);
                break;
            case 'S':
                addAtom(AtomImpl.AliphaticSubset.Sulfur, buffer);
                break;
            case 'F':
                addAtom(AtomImpl.AliphaticSubset.Fluorine, buffer);
                break;
            case 'I':
                addAtom(AtomImpl.AliphaticSubset.Iodine, buffer);
                break;

            // aromatic subset
            case 'b':
                addAromaticAtom(AtomImpl.AromaticSubset.Boron, buffer);
                break;
            case 'c':
                addAromaticAtom(AtomImpl.AromaticSubset.Carbon, buffer);
                break;
            case 'n':
                addAromaticAtom(AtomImpl.AromaticSubset.Nitrogen, buffer);
                break;
            case 'o':
                addAromaticAtom(AtomImpl.AromaticSubset.Oxygen, buffer);
                break;
            case 'p':
                addAromaticAtom(AtomImpl.AromaticSubset.Phosphorus, buffer);
                break;
            case 's':
                addAromaticAtom(AtomImpl.AromaticSubset.Sulfur, buffer);
                break;

            // D/T for hydrogen isotopes - non-standard but OpenSMILES spec
            // says it's possible. The D and T here are automatic converted
            // to [2H] and [3H].
            case 'H':
                addBareHydrogen(AtomImpl.EXPLICIT_HYDROGEN,
                                "hydrogens should be specified in square brackets - '[H]'",
                                buffer);
                break;
            case 'D':
                addBareHydrogen(AtomImpl.DEUTERIUM,
                                "deuterium should be specified as a hydrogen isotope - '[2H]'",
                                buffer);
                break;
            case 'T':
                addBareHydrogen(AtomImpl.TRITIUM,
                                "tritium should be specified as a hydrogen isotope - '[3H]'",
                                buffer);
                break;

            // bracket atom
            case '[':
                addAtom(readBracketAtom(buffer), buffer);
                break;

            // ring bonds
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                ring(c - '0', buffer);
                break;
            case '%':
                final int num = buffer.getNumber(2);
                if (num < 0)
                    throw new InvalidSmilesException("a number (<digit>+) must follow '%':", buffer);
                if (strict && num < 10)
                    throw new InvalidSmilesException("two digits must follow '%'", buffer);
                ring(num, buffer);
                lastBondPos = buffer.position();
                break;

            // bond/dot
            case '-':
                beginBond(Bond.SINGLE, buffer);
                break;
            case '=':
                beginBond(Bond.DOUBLE, buffer);
                break;
            case '#':
                beginBond(Bond.TRIPLE, buffer);
                break;
            case '$':
                beginBond(Bond.QUADRUPLE, buffer);
                break;
            case ':':
                beginBond(Bond.AROMATIC, buffer);
                g.addFlags(Graph.HAS_AROM);
                break;
            case '/':
                beginBond(Bond.UP, buffer);
                g.addFlags(Graph.HAS_BND_STRO);
                break;
            case '\\':
                // we allow C\\C=C/C since it could be an escaping error
                if (bond != Bond.IMPLICIT && bond != Bond.DOWN)
                    throw new InvalidSmilesException("Multiple bonds specified:", buffer);
                bond = Bond.DOWN;
                lastBondPos = buffer.position();
                g.addFlags(Graph.HAS_BND_STRO);
                break;
            case '.':
                if (bond != Bond.IMPLICIT)
                    throw new InvalidSmilesException("Bond specified before disconnection:", buffer);
                bond = Bond.DOT;
                break;

            // branching
            case '(':
                if (stack.empty())
                    throw new InvalidSmilesException("Cannot open branch at this position, SMILES may be truncated:",
                                                     buffer);
                stack.push(stack.peek());
                break;
            case ')':
                if (stack.size() < 2)
                    throw new InvalidSmilesException("Closing of an unopened branch, SMILES may be truncated:",
                                                     buffer);
                stack.pop();
                break;

            // termination
            case '\t':
            case ' ':
                // String suffix is title
                readTitle(buffer);
                return;
            case '\n':
            case '\r':
                return;

            default:
                throw new InvalidSmilesException("unexpected character:", buffer);
        }
    }
}

/**
 * Add an atom from the aromatic subset and flag the graph as aromatic.
 *
 * @param a      the aromatic atom
 * @param buffer the buffer being parsed
 */
private void addAromaticAtom(Atom a, CharBuffer buffer) throws InvalidSmilesException {
    addAtom(a, buffer);
    g.addFlags(Graph.HAS_AROM);
}

/**
 * Add a bare (unbracketed) hydrogen or hydrogen isotope; not allowed when
 * parsing strictly.
 *
 * @param a             the hydrogen atom to add
 * @param strictMessage error message used when parsing strictly
 * @param buffer        the buffer being parsed
 * @throws InvalidSmilesException always thrown when parsing strictly
 */
private void addBareHydrogen(Atom a, String strictMessage, CharBuffer buffer) throws InvalidSmilesException {
    if (strict)
        throw new InvalidSmilesException(strictMessage, buffer);
    addAtom(a, buffer);
}

/**
 * Set the pending bond and remember where in the input it was read.
 *
 * @param b      the bond that was read
 * @param buffer the buffer being parsed
 * @throws InvalidSmilesException a bond was already pending
 */
private void beginBond(Bond b, CharBuffer buffer) throws InvalidSmilesException {
    if (bond != Bond.IMPLICIT)
        throw new InvalidSmilesException("Multiple bonds specified:", buffer);
    bond = b;
    lastBondPos = buffer.position();
}

/**
 * Consume the remainder of the current line and store it as the title of
 * the graph.
 *
 * @param buffer the buffer being parsed
 */
private void readTitle(CharBuffer buffer) {
    final StringBuilder title = new StringBuilder();
    while (buffer.hasRemaining()) {
        final char ch = buffer.get();
        if (ch == '\n' || ch == '\r')
            break;
        title.append(ch);
    }
    g.setTitle(title.toString());
}
/**
 * Read a bracket atom from the buffer. A bracket atom optionally defines
 * isotope, chirality, hydrogen count, formal charge and the atom class.
 *
 * <blockquote><pre>
 * bracket_atom ::= '[' isotope? symbol chiral? hcount? charge? class? ']'
 * </pre></blockquote>
 *
 * When the element symbol is not recognised, or (non-strict only) the
 * closing bracket does not follow the parsed attributes, the whole bracket
 * content is kept as an arbitrary label instead.
 *
 * @param buffer a character buffer
 * @return a bracket atom
 * @throws InvalidSmilesException thrown if the bracket atom did not match
 *                                the grammar, invalid symbol, missing
 *                                closing bracket or invalid chiral
 *                                specification.
 */
Atom readBracketAtom(final CharBuffer buffer) throws
                                              InvalidSmilesException {
    final int labelBegin = buffer.position;

    if (!buffer.hasRemaining())
        throw new InvalidSmilesException("Unclosed bracket atom, SMILES may be truncated", buffer);

    final int isotope = buffer.getNumber();
    final boolean aromatic = buffer.next() >= 'a' && buffer.next() <= 'z';
    final Element element = Element.read(buffer);

    if (element == Element.Unknown)
        hasAstrix = true;

    if (element == null) {
        if (strict)
            throw new InvalidSmilesException("unrecognised element symbol, SMILES may be truncated: ", buffer);
    } else if (aromatic) {
        g.addFlags(Graph.HAS_AROM);
    }

    // element isn't aromatic as per the OpenSMILES specification
    if (strict && aromatic && !element.aromatic(Element.AromaticSpecification.OpenSmiles))
        throw new InvalidSmilesException("abnormal aromatic element", buffer);

    // an unrecognised symbol means the bracket content is a free-form label
    boolean labelOnly = element == null;

    configuration = Configuration.read(buffer);
    final int hCount = readHydrogens(buffer);
    final int charge = readCharge(buffer);
    final int atomClass = readClass(buffer);

    if (!labelOnly && !buffer.getIf(']')) {
        if (strict)
            throw InvalidSmilesException.invalidBracketAtom(buffer);
        labelOnly = true;
    }

    if (!labelOnly) {
        return new AtomImpl.BracketAtom(isotope,
                                        element,
                                        hCount,
                                        charge,
                                        atomClass,
                                        aromatic);
    }

    // scan to the matching ']' allowing nested brackets inside the label
    int labelEnd = buffer.position;
    int depth = 1;
    while (depth != 0 && buffer.hasRemaining()) {
        final char ch = buffer.get();
        if (ch == '[')
            depth++;
        else if (ch == ']')
            depth--;
        if (depth != 0)
            labelEnd++;
    }
    if (depth != 0)
        throw new InvalidSmilesException("unparsable label in bracket atom",
                                         buffer,
                                         buffer.position - 1);
    final String label = buffer.substr(labelBegin, labelEnd);
    hasAstrix = true;
    return new AtomImpl.BracketAtom(label);
}
/**
 * Read the hydrogen count and progress the provided buffer. The hydrogen
 * count is specified by an 'H' and 0 or more digits. An 'H' without digits
 * is interpreted as 'H1'. When there is no 'H', or 'H0' is specified, the
 * hydrogen count is 0.
 *
 * @param buffer a character buffer
 * @return the hydrogen count, 0 if none
 */
static int readHydrogens(final CharBuffer buffer) {
    if (!buffer.getIf('H')) {
        return 0;
    }
    // when no number is specified 'H' then there is 1 hydrogen
    final int digits = buffer.getNumber();
    if (digits < 0) {
        return 1;
    }
    return digits;
}
/**
 * Read a charge value and progress the provided buffer. The charge value is
 * present in bracket atoms either directly after the symbol, the chiral
 * specification or the hydrogen count. The specification of charge by
 * concatenated signs (e.g. ++, --) and other bad form (e.g. '++-1') is
 * intercepted. Delegates to {@link #readCharge(int, CharBuffer)} with an
 * accumulator of 0.
 *
 * @param buffer a character buffer
 * @return the formal charge value, 0 if none present
 * @see <a href="http://opensmiles.org/opensmiles.html">Charge -
 *      OpenSMILES Specification</a>
 */
static int readCharge(final CharBuffer buffer) {
return readCharge(0, buffer);
}
/**
 * Internal method for parsing charge. To allow concatenated signs (--, ++)
 * each sign that is not followed by a digit adjusts the accumulator by one;
 * a sign followed by digits adds/subtracts that number and ends the charge.
 * Implemented iteratively so a long run of signs cannot exhaust the call
 * stack.
 *
 * @param acc    accumulator (initial charge)
 * @param buffer a character buffer
 * @return the charge value
 */
private static int readCharge(int acc, final CharBuffer buffer) {
    while (true) {
        final int sign;
        if (buffer.getIf('+'))
            sign = 1;
        else if (buffer.getIf('-'))
            sign = -1;
        else
            return acc;

        if (buffer.nextIsDigit())
            return sign > 0 ? acc + buffer.getNumber()
                            : acc - buffer.getNumber();
        acc += sign;
    }
}
/**
 * Read the atom class of a bracket atom and progress the buffer (if read).
 * The atom class is the last attribute of the bracket atom and is
 * identified by a ':' followed by one or more digits. The atom class may be
 * padded such that ':005' and ':5' are equivalent.
 *
 * @param buffer a character buffer
 * @return the atom class, or 0
 * @throws InvalidSmilesException a ':' was not followed by a digit
 * @see <a href="http://opensmiles.org/opensmiles.html">Atom Class -
 *      OpenSMILES Specification</a>
 */
static int readClass(CharBuffer buffer) throws InvalidSmilesException {
    if (!buffer.getIf(':'))
        return 0;
    if (buffer.nextIsDigit())
        return buffer.getNumber();
    throw new InvalidSmilesException("invalid atom class, <digit>+ must follow ':'", buffer);
}
/**
 * Handle the ring open/closure of the specified ring number 'rnum'. If a
 * ring bond is already stored for the number it is closed, otherwise a new
 * one is opened.
 *
 * @param rnum   ring number
 * @param buffer the buffer being parsed
 * @throws InvalidSmilesException the pending bond is a 'dot', there is no
 *                                previous atom, or the closure failed
 */
private void ring(int rnum, CharBuffer buffer) throws InvalidSmilesException {
    if (bond == Bond.DOT) {
        throw new InvalidSmilesException("a ring bond can not be a 'dot':",
                                         buffer,
                                         buffer.position());
    }
    if (stack.empty()) {
        throw new InvalidSmilesException("No previous atom for ring open!",
                                         buffer,
                                         buffer.position());
    }

    final boolean alreadyOpen = rnum < rings.length && rings[rnum] != null;
    if (alreadyOpen) {
        closeRing(rnum, buffer);
    } else {
        openRing(rnum, buffer);
    }
}
/**
 * Open the ring bond with the specified 'rnum'. The ring bond remembers the
 * current atom, the pending bond and the last bond position; a negated
 * 'rnum' is added to the atom's local arrangement as a placeholder.
 *
 * @param rnum ring number
 * @param buf  the buffer being parsed
 */
private void openRing(int rnum, CharBuffer buf) {
    if (rings.length <= rnum) {
        final int capacity = Math.min(100, rnum * 2); // max rnum: 99
        rings = Arrays.copyOf(rings, capacity);
    }

    final int atom = stack.peek();

    // create a ring bond
    rings[rnum] = new RingBond(atom, bond, lastBondPos);
    openRings++;

    // keep track of arrangement (important for stereo configurations)
    createArrangement(atom).add(-rnum);

    bond = Bond.IMPLICIT;
}
/**
 * Create the current local arrangement for vertex 'u' - if the arrangement
 * already exists then that arrangement is used. A new arrangement is
 * seeded with the neighbours of 'u' in graph order and stored.
 *
 * @param u vertex to get the arrangement around
 * @return current local arrangement
 */
private LocalArrangement createArrangement(int u) {
    final LocalArrangement existing = arrangement.get(u);
    if (existing != null) {
        return existing;
    }

    final LocalArrangement created = new LocalArrangement();
    final int degree = g.degree(u);
    int j = 0;
    while (j < degree) {
        created.add(g.edgeAt(u, j).other(u));
        j++;
    }
    arrangement.put(u, created);
    return created;
}
/**
 * Close the ring bond with the specified 'rnum'. The stored ring bond is
 * removed, the bond type is reconciled with the pending bond, the edge is
 * added and the placeholder in the opening atom's local arrangement is
 * replaced by the closing atom.
 *
 * @param rnum   ring number
 * @param buffer the buffer being parsed
 * @throws InvalidSmilesException bond types did not match, or the closure
 *                                would create a loop or multi-edge
 */
private void closeRing(int rnum, CharBuffer buffer) throws InvalidSmilesException {
    final RingBond opened = rings[rnum];
    rings[rnum] = null;

    final int u = opened.u;
    final int v = stack.peek();

    if (u == v) {
        throw new InvalidSmilesException("Endpoints of ringbond are the same - loops are not allowed",
                                         buffer);
    }
    if (g.adjacent(u, v)) {
        throw new InvalidSmilesException("Endpoints of ringbond are already connected - multi-edges are not allowed",
                                         buffer);
    }

    bond = decideBond(opened.bond, bond.inverse(), opened.pos, buffer);
    final Edge closure = new Edge(u, v, bond);

    if (bond.directional()) {
        checkDirectionalBonds.set(u);
        checkDirectionalBonds.set(v);
        // report the position where the directional symbol was written
        final int where = opened.bond.directional() ? opened.pos : lastBondPos;
        bondStrPos.put(closure, where);
    }

    g.addEdge(closure);
    bond = Bond.IMPLICIT;

    // adjust the arrangement replacing where this ring number was opened
    arrangement.get(u).replace(-rnum, stack.peek());
    if (arrangement.containsKey(v)) {
        arrangement.get(v).add(u);
    }

    openRings--;
}
/**
 * Decide the bond to use for a ring bond. The bond symbol can be present on
 * either or both bonded atoms. This method takes those bonds, chooses the
 * correct one or reports an error if there is a conflict.
 *
 * Equivalent SMILES:
 * <blockquote><pre>
 *     C=1CCCCC=1
 *     C=1CCCCC1  (preferred)
 *     C1CCCCC=1
 * </pre></blockquote>
 *
 * When not strict, two bonds that are the inverse of each other are
 * tolerated: a warning is recorded and an implicit bond is used.
 *
 * @param a      a bond
 * @param b      other bond
 * @param pos    the position in the string of bond a
 * @param buffer the buffer and its current position
 * @return the bond to use for this edge
 * @throws InvalidSmilesException ring bonds did not match
 */
Bond decideBond(final Bond a, final Bond b, int pos, CharBuffer buffer) throws InvalidSmilesException {
    // identical, or only one side specified
    if (a == b || b == Bond.IMPLICIT) {
        return a;
    }
    if (a == Bond.IMPLICIT) {
        return b;
    }

    final boolean fatal = strict || a.inverse() != b;
    final String where = InvalidSmilesException.display(buffer,
                                                        pos - buffer.position,
                                                        lastBondPos - buffer.position);
    if (fatal) {
        throw new InvalidSmilesException("Ring closure bonds did not match, '" + a + "'!='" + b + "':" + where);
    }

    warnings.add("Ignored invalid Cis/Trans on ring closure, should flip:" + where);
    return Bond.IMPLICIT;
}
/**
 * Convenience method for parsing a SMILES string with a new (loose)
 * parser.
 *
 * @param str SMILES string
 * @return the chemical graph for the provided SMILES notation
 * @throws InvalidSmilesException thrown if the SMILES could not be
 *                                interpreted
 */
static Graph parse(String str) throws InvalidSmilesException {
    final Parser parser = new Parser(str);
    return parser.molecule();
}
/**
* Access any warning messages from parsing the SMILES.
*
* @return the warnings.
*/
public Collection extends String> getWarnings() {
return Collections.unmodifiableCollection(warnings);
}
/**
 * Hold information about ring open/closures. The ring bond can optionally
 * specify the bond type. Instances are immutable.
 */
private static final class RingBond {
    /** Vertex at which the ring was opened. */
    final int u;
    /** Bond that was pending when the ring was opened. */
    final Bond bond;
    /** Last recorded bond position when the ring was opened. */
    final int pos;

    private RingBond(int u, Bond bond, int pos) {
        this.u = u;
        this.bond = bond;
        this.pos = pos;
    }
}
/**
 * Hold information on the local arrangement around an atom. The arrangement
 * is normally identical to the order loaded unless the atom is involved in
 * a ring closure. This is particularly important for stereo specification
 * where the ring bonds should be in the order listed. This class stores the
 * local arrangement by setting a negated 'rnum' as a placeholder and then
 * replacing it once the connected atom has been read. Although this could
 * be stored directly on the graph (negated edge) it allows us to keep all
 * edges in sorted order.
 */
private static final class LocalArrangement {

    /** Backing storage, only the first 'n' entries are in use. */
    int[] vs;

    /** Number of vertices stored. */
    int n;

    /**
     * New, empty local arrangement with room for four vertices.
     */
    private LocalArrangement() {
        vs = new int[4];
    }

    /**
     * Append a vertex to the arrangement, doubling the storage when full.
     *
     * @param v vertex to append
     */
    void add(final int v) {
        final int capacity = vs.length;
        if (n == capacity) {
            vs = Arrays.copyOf(vs, n * 2);
        }
        vs[n] = v;
        n++;
    }

    /**
     * Replace the first occurrence of 'u' with 'v'. Allows us to use
     * negated values as placeholders. Nothing happens if 'u' is absent.
     *
     * <blockquote><pre>
     * LocalArrangement la = new LocalArrangement();
     * la.add(1);
     * la.add(-2);
     * la.add(-1);
     * la.add(5);
     * la.replace(-1, 4);
     * la.replace(-2, 6);
     * la.toArray() = {1, 6, 4, 5}
     * </pre></blockquote>
     *
     * @param u negated vertex
     * @param v new vertex
     */
    void replace(final int u, final int v) {
        int idx = 0;
        while (idx < n && vs[idx] != u) {
            idx++;
        }
        if (idx < n) {
            vs[idx] = v;
        }
    }

    /**
     * Access the local arrangement of vertices.
     *
     * @return copy of the vertices in their order around an atom.
     */
    int[] toArray() {
        return Arrays.copyOfRange(vs, 0, n);
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy