All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.openscience.cdk.io.MDLV2000Reader Maven / Gradle / Ivy

There is a newer version: 2.10
Show newest version
/* Copyright (C) 1997-2007  Christoph Steinbeck 
 *                    2010  Egon Willighagen 
 *                    2014  Mark B Vine (orcid:0000-0002-7794-0426)
 *
 *  Contact: [email protected]
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public License
 *  as published by the Free Software Foundation; either version 2.1
 *  of the License, or (at your option) any later version.
 *  All we ask is that proper credit is given for our work, which includes
 *  - but is not limited to - adding the above copyright notice to the beginning
 *  of your source code files, and to any copyright notice that you may distribute
 *  with programs based on this work.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 */
package org.openscience.cdk.io;

import com.google.common.collect.ImmutableSet;
import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.config.IsotopeFactory;
import org.openscience.cdk.config.Isotopes;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IAtomContainerSet;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.interfaces.IChemFile;
import org.openscience.cdk.interfaces.IChemModel;
import org.openscience.cdk.interfaces.IChemObject;
import org.openscience.cdk.interfaces.IChemObjectBuilder;
import org.openscience.cdk.interfaces.IChemSequence;
import org.openscience.cdk.interfaces.IIsotope;
import org.openscience.cdk.interfaces.IPseudoAtom;
import org.openscience.cdk.interfaces.ISingleElectron;
import org.openscience.cdk.interfaces.IStereoElement;
import org.openscience.cdk.interfaces.ITetrahedralChirality;
import org.openscience.cdk.interfaces.ITetrahedralChirality.Stereo;
import org.openscience.cdk.io.formats.IResourceFormat;
import org.openscience.cdk.io.formats.MDLV2000Format;
import org.openscience.cdk.io.setting.BooleanIOSetting;
import org.openscience.cdk.io.setting.IOSetting;
import org.openscience.cdk.isomorphism.matchers.CTFileQueryBond;
import org.openscience.cdk.isomorphism.matchers.QueryAtomContainer;
import org.openscience.cdk.sgroup.Sgroup;
import org.openscience.cdk.sgroup.SgroupBracket;
import org.openscience.cdk.sgroup.SgroupKey;
import org.openscience.cdk.sgroup.SgroupType;
import org.openscience.cdk.stereo.StereoElementFactory;
import org.openscience.cdk.stereo.TetrahedralChirality;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
import org.openscience.cdk.tools.periodictable.PeriodicTable;

import javax.vecmath.Point2d;
import javax.vecmath.Point3d;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.openscience.cdk.io.MDLV2000Writer.SPIN_MULTIPLICITY;

/**
 * Reads content from MDL molfiles and SD files. It can read a {@link
 * IAtomContainer} or {@link IChemModel} from an MDL molfile, and a {@link
 * IChemFile} from a SD file, with a {@link IChemSequence} of {@link
 * IChemModel}'s, where each IChemModel will contain one {@link IAtomContainer}.
 *
 * 

From the Atom block it reads atomic coordinates, element types and formal * charges. From the Bond block it reads the bonds and the orders. Additionally, * it reads 'M CHG', 'G ', 'M RAD' and 'M ISO' lines from the property * block. * *

If all z coordinates are 0.0, then the xy coordinates are taken as 2D, * otherwise the coordinates are read as 3D. * *

The title of the MOL file is read and can be retrieved with: *

 *   molecule.getProperty(CDKConstants.TITLE);
 * 
* *

RGroups which are saved in the MDL molfile as R#, are renamed according to * their appearance, e.g. the first R# is named R1. With PseudAtom.getLabel() * "R1" is returned (instead of R#). This is introduced due to the SAR table * generation procedure of Scitegics PipelinePilot. * * @author steinbeck * @author Egon Willighagen * @cdk.module io * @cdk.githash * @cdk.iooptions * @cdk.created 2000-10-02 * @cdk.keyword file format, MDL molfile * @cdk.keyword file format, SDF * @cdk.bug 1587283 */ public class MDLV2000Reader extends DefaultChemObjectReader { BufferedReader input = null; private static ILoggingTool logger = LoggingToolFactory.createLoggingTool(MDLV2000Reader.class); private BooleanIOSetting forceReadAs3DCoords; private BooleanIOSetting interpretHydrogenIsotopes; private BooleanIOSetting addStereoElements; // Pattern to remove trailing space (String.trim() will remove leading space, which we don't want) private static final Pattern TRAILING_SPACE = Pattern.compile("\\s+$"); /** Delimits Structure-Data (SD) Files. */ private static final String RECORD_DELIMITER = "$$$$"; /** * @deprecated Incorrect spelling */ private static final Set PSUEDO_LABELS = ImmutableSet. builder().add("*").add("A").add("Q") .add("L").add("LP").add("R") // XXX: not in spec .add("R#").build(); /** Valid pseudo labels. */ private static final Set PSEUDO_LABELS = ImmutableSet. builder().add("*").add("A").add("Q") .add("L").add("LP").add("R") // XXX: not in spec .add("R#").build(); public MDLV2000Reader() { this(new StringReader("")); } /** * Constructs a new MDLReader that can read Molecule from a given * InputStream. * * @param in The InputStream to read from */ public MDLV2000Reader(InputStream in) { this(new InputStreamReader(in)); } public MDLV2000Reader(InputStream in, Mode mode) { this(new InputStreamReader(in), mode); } /** * Constructs a new MDLReader that can read Molecule from a given Reader. * * @param in The Reader to read from */ public MDLV2000Reader(Reader in) { this(in, Mode.RELAXED); } public MDLV2000Reader(Reader in, Mode mode) { input = new BufferedReader(in); initIOSettings(); super.mode = mode; } @Override public IResourceFormat getFormat() { return MDLV2000Format.getInstance(); } @Override public void setReader(Reader input) throws CDKException { if (input instanceof BufferedReader) { this.input = (BufferedReader) input; } else { this.input = new BufferedReader(input); } } @Override public void setReader(InputStream input) throws CDKException { setReader(new InputStreamReader(input)); } @SuppressWarnings("unchecked") @Override public boolean accepts(Class classObject) { Class[] interfaces = classObject.getInterfaces(); for (Class anInterface : interfaces) { if (IChemFile.class.equals(anInterface)) return true; if (IChemModel.class.equals(anInterface)) return true; if (IAtomContainer.class.equals(anInterface)) return true; } if (IAtomContainer.class.equals(classObject)) return true; if (IChemFile.class.equals(classObject)) return true; if (IChemModel.class.equals(classObject)) return true; Class superClass = classObject.getSuperclass(); return superClass != null && this.accepts(superClass); } /** * Takes an object which subclasses IChemObject, e.g. Molecule, and will * read this (from file, database, internet etc). If the specific * implementation does not support a specific IChemObject it will throw an * Exception. * * @param object The object that subclasses IChemObject * @return The IChemObject read * @throws CDKException */ @SuppressWarnings("unchecked") @Override public T read(T object) throws CDKException { if (object instanceof IAtomContainer) { return (T) readAtomContainer((IAtomContainer) object); } else if (object instanceof IChemFile) { return (T) readChemFile((IChemFile) object); } else if (object instanceof IChemModel) { return (T) readChemModel((IChemModel) object); } else { throw new CDKException("Only supported are ChemFile and Molecule."); } } private IChemModel readChemModel(IChemModel chemModel) throws CDKException { IAtomContainerSet setOfMolecules = chemModel.getMoleculeSet(); if (setOfMolecules == null) { setOfMolecules = chemModel.getBuilder().newInstance(IAtomContainerSet.class); } IAtomContainer m = readAtomContainer(chemModel.getBuilder().newInstance(IAtomContainer.class)); if (m != null) { setOfMolecules.addAtomContainer(m); } chemModel.setMoleculeSet(setOfMolecules); return chemModel; } /** * Read a ChemFile from a file in MDL SDF format. * * @return The ChemFile that was read from the MDL file. */ private IChemFile readChemFile(IChemFile chemFile) throws CDKException { IChemObjectBuilder builder = chemFile.getBuilder(); IChemSequence sequence = builder.newInstance(IChemSequence.class); try { IAtomContainer m; while ((m = readAtomContainer(builder.newInstance(IAtomContainer.class))) != null) { sequence.addChemModel(newModel(m)); } } catch (CDKException e) { throw e; } catch (IllegalArgumentException exception) { String error = "Error while parsing SDF"; logger.error(error); logger.debug(exception); throw new CDKException(error, exception); } try { input.close(); } catch (Exception exc) { String error = "Error while closing file: " + exc.getMessage(); logger.error(error); throw new CDKException(error, exc); } chemFile.addChemSequence(sequence); return chemFile; } /** * Create a new chem model for a single {@link IAtomContainer}. * * @param container the container to create the model for * @return a new {@link IChemModel} */ private static IChemModel newModel(final IAtomContainer container) { if (container == null) throw new NullPointerException("cannot create chem model for a null container"); final IChemObjectBuilder builder = container.getBuilder(); final IChemModel model = builder.newInstance(IChemModel.class); final IAtomContainerSet containers = builder.newInstance(IAtomContainerSet.class); containers.addAtomContainer(container); model.setMoleculeSet(containers); return model; } /** * Read an IAtomContainer from a file in MDL sd format * * @return The Molecule that was read from the MDL file. */ private IAtomContainer readAtomContainer(IAtomContainer molecule) throws CDKException { IAtomContainer outputContainer = null; Map parities = new HashMap<>(); int linecount = 0; String title = null; String remark = null; String line = ""; try { line = input.readLine(); linecount++; if (line == null) { return null; } if (line.startsWith("$$$$")) { return molecule; } if (line.length() > 0) { title = line; } line = input.readLine(); linecount++; line = input.readLine(); linecount++; if (line.length() > 0) { remark = line; } line = input.readLine(); linecount++; // if the line is empty we hav a problem - either a malformed // molecule entry or just extra new lines at the end of the file if (line.length() == 0) { handleError("Unexpected empty line", linecount, 0, 0); // read till the next $$$$ or EOF while (true) { line = input.readLine(); linecount++; if (line == null) { return null; } if (line.startsWith("$$$$")) { return molecule; // an empty molecule } } } final CTabVersion version = CTabVersion.ofHeader(line); // check the CT block version if (version == CTabVersion.V3000) { handleError("This file must be read with the MDLV3000Reader."); // even if relaxed we can't read V3000 using the V2000 parser throw new CDKException("This file must be read with the MDLV3000Reader."); } else if (version == CTabVersion.UNSPECIFIED) { handleError("This file must be read with the MDLReader."); // okay to read in relaxed mode } int nAtoms = readMolfileInt(line, 0); int nBonds = readMolfileInt(line, 3); final IAtom[] atoms = new IAtom[nAtoms]; final IBond[] bonds = new IBond[nBonds]; // used for applying the MDL valence model int[] explicitValence = new int[nAtoms]; boolean hasX = false, hasY = false, hasZ = false; for (int i = 0; i < nAtoms; i++) { line = input.readLine(); linecount++; final IAtom atom = readAtomFast(line, molecule.getBuilder(), parities, linecount); atoms[i] = atom; Point3d p = atom.getPoint3d(); hasX = hasX || p.x != 0d; hasY = hasY || p.y != 0d; hasZ = hasZ || p.z != 0d; } // convert to 2D, if totalZ == 0 if (!hasX && !hasY && !hasZ) { if (nAtoms == 1) { atoms[0].setPoint2d(new Point2d(0, 0)); } else { for (IAtom atomToUpdate : atoms) { atomToUpdate.setPoint3d(null); } } } else if (!hasZ) { if (!forceReadAs3DCoords.isSet()) { for (IAtom atomToUpdate : atoms) { Point3d p3d = atomToUpdate.getPoint3d(); if (p3d != null) { atomToUpdate.setPoint2d(new Point2d(p3d.x, p3d.y)); atomToUpdate.setPoint3d(null); } } } } boolean hasQueryBonds = false; for (int i = 0; i < nBonds; i++) { line = input.readLine(); linecount++; bonds[i] = readBondFast(line, molecule.getBuilder(), atoms, explicitValence, linecount); hasQueryBonds = hasQueryBonds || (bonds[i].getOrder() == IBond.Order.UNSET && !bonds[i].getFlag(CDKConstants.ISAROMATIC)); } if (!hasQueryBonds) outputContainer = molecule; else outputContainer = new QueryAtomContainer(molecule.getBuilder()); outputContainer.setProperty(CDKConstants.TITLE, title); outputContainer.setProperty(CDKConstants.REMARK, remark); // if the container is empty we can simply set the atoms/bonds // otherwise we add them to the end if (outputContainer.isEmpty()) { outputContainer.setAtoms(atoms); outputContainer.setBonds(bonds); } else { for (IAtom atom : atoms) outputContainer.addAtom(atom); for (IBond bond : bonds) outputContainer.addBond(bond); } // create 0D stereochemistry Parities: for (Map.Entry e : parities.entrySet()) { int parity = e.getValue(); if (parity != 1 && parity != 2) continue; // 3=unspec int idx = 0; IAtom focus = e.getKey(); IAtom[] carriers = new IAtom[4]; int hidx = -1; for (IAtom nbr : molecule.getConnectedAtomsList(focus)) { if (idx == 4) continue Parities; // too many neighbors if (nbr.getAtomicNumber() == 1) { if (hidx >= 0) continue Parities; hidx = idx; } carriers[idx++] = nbr; } // to few neighbors, or already have a hydrogen defined if (idx < 3 || idx < 4 && hidx >= 0) continue; if (idx == 3) carriers[idx++] = focus; if (idx == 4) { Stereo winding = parity == 1 ? Stereo.CLOCKWISE : Stereo.ANTI_CLOCKWISE; // H is always at back, even if explicit! At least this seems to be the case. // we adjust the winding as needed if (hidx == 0 || hidx == 2) winding = winding.invert(); molecule.addStereoElement(new TetrahedralChirality(focus, carriers, winding)); } } // read PROPERTY block readPropertiesFast(input, outputContainer, nAtoms); // read potential SD file data between M END and $$$$ readNonStructuralData(input, outputContainer); if (interpretHydrogenIsotopes.isSet()) { fixHydrogenIsotopes(molecule, Isotopes.getInstance()); } // note: apply the valence model last so that all fixes (i.e. hydrogen // isotopes) are in place we need to use a offset as this atoms // could be added to a molecule which already had atoms present int offset = outputContainer.getAtomCount() - nAtoms; for (int i = offset; i < outputContainer.getAtomCount(); i++) { int valence = explicitValence[i - offset]; if (valence < 0) { hasQueryBonds = true; // also counts aromatic bond as query } else { int unpaired = outputContainer.getConnectedSingleElectronsCount(outputContainer.getAtom(i)); applyMDLValenceModel(outputContainer.getAtom(i), valence + unpaired); } } // sanity check that we have a decent molecule, query bonds mean we // don't have a hydrogen count for atoms and stereo perception isn't // currently possible if (!hasQueryBonds && addStereoElements.isSet() && hasX && hasY) { if (hasZ) { // has 3D coordinates outputContainer.setStereoElements(StereoElementFactory.using3DCoordinates(outputContainer) .createAll()); } else if (!forceReadAs3DCoords.isSet()) { // has 2D coordinates (set as 2D coordinates) outputContainer.setStereoElements(StereoElementFactory.using2DCoordinates(outputContainer) .createAll()); } } } catch (CDKException exception) { String error = "Error while parsing line " + linecount + ": " + line + " -> " + exception.getMessage(); logger.error(error); throw exception; } catch (IOException exception) { exception.printStackTrace(); String error = "Error while parsing line " + linecount + ": " + line + " -> " + exception.getMessage(); logger.error(error); handleError("Error while parsing line: " + line, linecount, 0, 0, exception); } return outputContainer; } /** * Applies the MDL valence model to atoms using the explicit valence (bond * order sum) and charge to determine the correct number of implicit * hydrogens. The model is not applied if the explicit valence is less than * 0 - this is the case when a query bond was read for an atom. * * @param atom the atom to apply the model to * @param explicitValence the explicit valence (bond order sum) */ private void applyMDLValenceModel(IAtom atom, int explicitValence) { if (atom.getValency() != null) { if (atom.getValency() >= explicitValence) atom.setImplicitHydrogenCount(atom.getValency() - explicitValence); else atom.setImplicitHydrogenCount(0); } else { Integer element = atom.getAtomicNumber(); if (element == null) element = 0; Integer charge = atom.getFormalCharge(); if (charge == null) charge = 0; int implicitValence = MDLValence.implicitValence(element, charge, explicitValence); if (implicitValence < explicitValence) { atom.setValency(explicitValence); atom.setImplicitHydrogenCount(0); } else { atom.setValency(implicitValence); atom.setImplicitHydrogenCount(implicitValence - explicitValence); } } } private void fixHydrogenIsotopes(IAtomContainer molecule, IsotopeFactory isotopeFactory) { for (IAtom atom : AtomContainerManipulator.getAtomArray(molecule)) { if (atom instanceof IPseudoAtom) { IPseudoAtom pseudo = (IPseudoAtom) atom; if ("D".equals(pseudo.getLabel())) { IAtom newAtom = molecule.getBuilder().newInstance(IAtom.class, atom); newAtom.setSymbol("H"); newAtom.setAtomicNumber(1); isotopeFactory.configure(newAtom, isotopeFactory.getIsotope("H", 2)); AtomContainerManipulator.replaceAtomByAtom(molecule, atom, newAtom); } else if ("T".equals(pseudo.getLabel())) { IAtom newAtom = molecule.getBuilder().newInstance(IAtom.class, atom); newAtom.setSymbol("H"); newAtom.setAtomicNumber(1); isotopeFactory.configure(newAtom, isotopeFactory.getIsotope("H", 3)); AtomContainerManipulator.replaceAtomByAtom(molecule, atom, newAtom); } } } } @Override public void close() throws IOException { input.close(); } private void initIOSettings() { forceReadAs3DCoords = addSetting(new BooleanIOSetting("ForceReadAs3DCoordinates", IOSetting.Importance.LOW, "Should coordinates always be read as 3D?", "false")); interpretHydrogenIsotopes = addSetting(new BooleanIOSetting("InterpretHydrogenIsotopes", IOSetting.Importance.LOW, "Should D and T be interpreted as hydrogen isotopes?", "true")); addStereoElements = addSetting(new BooleanIOSetting("AddStereoElements", IOSetting.Importance.LOW, "Assign stereo configurations to stereocenters utilising 2D/3D coordinates.", "true")); } public void customizeJob() { for (IOSetting setting : getSettings()) { fireIOSettingQuestion(setting); } } private String removeNonDigits(String input) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < input.length(); i++) { char inputChar = input.charAt(i); if (Character.isDigit(inputChar)) sb.append(inputChar); } return sb.toString(); } IAtom readAtomFast(String line, IChemObjectBuilder builder, int lineNum) throws CDKException, IOException { return readAtomFast(line, builder, Collections.emptyMap(), lineNum); } /** * Parse an atom line from the atom block using the format: {@code * xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee} * where:

  • x: x coordinate
  • y: y coordinate
  • z: z * coordinate
  • a: atom symbol
  • d: mass difference
  • *
  • c: charge
  • s: stereo parity
  • h: hydrogen count + 1 * (not read - query)
  • b: stereo care (not read - query)
  • v: * valence
  • H: H0 designator (not read - query)
  • r: not * used
  • i: not used
  • m: atom reaction mapping
  • n: * inversion/retention flag
  • e: exact change flag
* * The parsing is strict and does not allow extra columns (i.e. NMR shifts) * malformed input. * * @param line input line * @param builder chem object builder to create the atom * @param parities map of atom parities for creation 0D stereochemistry * @param lineNum the line number - for printing error messages * @return a new atom instance */ IAtom readAtomFast(String line, IChemObjectBuilder builder, Map parities, int lineNum) throws CDKException, IOException { // The line may be truncated and it's checked in reverse at the specified // lengths: // 1 2 3 4 5 6 // 123456789012345678901234567890123456789012345678901234567890123456789 // | | | | | | | | | | | | | // xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee String symbol; double x, y, z; int massDiff = 0, charge = 0, parity = 0, valence = 0, mapping = 0; int length = length(line); if (length > 69) // excess data we should check all fields length = 69; // given the length we jump to the position and parse all fields // that could be present (note - fall through switch) switch (length) { case 69: // eee: exact charge flag [reaction, query] case 66: // nnn: inversion / retention [reaction] case 63: // mmm: atom-atom mapping [reaction] mapping = readMolfileInt(line, 60); case 60: // iii: not used case 57: // rrr: not used case 54: // HHH: H0 designation [redundant] case 51: // vvv: valence valence = readMolfileInt(line, 48); case 48: // bbb: stereo care [query] case 45: // hhh: hydrogen count + 1 [query] case 42: // sss: stereo parity parity = toInt(line.charAt(41)); case 39: // ccc: charge charge = toCharge(line.charAt(38)); case 36: // dd: mass difference massDiff = sign(line.charAt(34)) * toInt(line.charAt(35)); case 34: // x y z and aaa: atom coordinates and symbol case 33: // symbol is left aligned case 32: x = readMDLCoordinate(line, 0); y = readMDLCoordinate(line, 10); z = readMDLCoordinate(line, 20); symbol = line.substring(31, 34).trim().intern(); break; default: handleError("invalid line length", lineNum, 0, 0); throw new CDKException("invalid line length, " + length + ": " + line); } IAtom atom = createAtom(symbol, builder, lineNum); atom.setPoint3d(new Point3d(x, y, z)); atom.setFormalCharge(charge); atom.setStereoParity(parity); if (parity != 0) parities.put(atom, parity); // if there was a mass difference, set the mass number if (massDiff != 0 && atom.getAtomicNumber() > 0) atom.setMassNumber(Isotopes.getInstance().getMajorIsotope(atom.getAtomicNumber()).getMassNumber() + massDiff); if (valence > 0 && valence < 16) atom.setValency(valence == 15 ? 0 : valence); if (mapping != 0) atom.setProperty(CDKConstants.ATOM_ATOM_MAPPING, mapping); return atom; } /** * Read a bond from a line in the MDL bond block. The bond block is * formatted as follows, {@code 111222tttsssxxxrrrccc}, where: *
    *
  • 111: first atom number
  • *
  • 222: second atom number
  • *
  • ttt: bond type
  • *
  • xxx: bond stereo
  • *
  • rrr: bond topology
  • *
  • ccc: reaction center
  • *
* * @param line the input line * @param builder builder to create objects with * @param atoms atoms read from the atom block * @param explicitValence array to fill with explicit valence * @param lineNum the input line number * @return a new bond * @throws CDKException thrown if the input was malformed or didn't make * sense */ IBond readBondFast(String line, IChemObjectBuilder builder, IAtom[] atoms, int[] explicitValence, int lineNum) throws CDKException { // The line may be truncated and it's checked in reverse at the specified // lengths. Absolutely required is atom indices, bond type and stereo. // 1 2 // 123456789012345678901 // | | | | // 111222tttsssxxxrrrccc int length = length(line); if (length > 21) length = 21; int u, v, type, stereo = 0; switch (length) { case 21: // ccc: reaction centre status case 18: // rrr: bond topology case 15: // xxx: not used case 12: // sss: stereo stereo = readUInt(line, 9, 3); case 9: // 111222ttt: atoms, type and stereo u = readMolfileInt(line, 0) - 1; v = readMolfileInt(line, 3) - 1; type = readMolfileInt(line, 6); break; default: throw new CDKException("invalid line length: " + length + " " + line); } IBond bond = builder.newInstance(IBond.class, atoms[u], atoms[v]); switch (type) { case 1: // single bond.setOrder(IBond.Order.SINGLE); bond.setStereo(toStereo(stereo, type)); break; case 2: // double bond.setOrder(IBond.Order.DOUBLE); bond.setStereo(toStereo(stereo, type)); break; case 3: // triple bond.setOrder(IBond.Order.TRIPLE); break; case 4: // aromatic bond.setOrder(IBond.Order.UNSET); bond.setFlag(CDKConstants.ISAROMATIC, true); bond.setFlag(CDKConstants.SINGLE_OR_DOUBLE, true); atoms[u].setFlag(CDKConstants.ISAROMATIC, true); atoms[v].setFlag(CDKConstants.ISAROMATIC, true); break; case 5: // single or double case 6: // single or aromatic case 7: // double or aromatic case 8: // any bond = CTFileQueryBond.ofType(bond, type); break; default: throw new CDKException("unrecognised bond type: " + type + ", " + line); } if (type < 4) { explicitValence[u] += type; explicitValence[v] += type; } else { explicitValence[u] = explicitValence[v] = Integer.MIN_VALUE; } return bond; } /** * Reads the property block from the {@code input} setting the values in the * container. * * @param input input resource * @param container the structure with atoms / bonds present * @param nAtoms the number of atoms in the atoms block * @throws IOException low-level IO error */ void readPropertiesFast(final BufferedReader input, final IAtomContainer container, final int nAtoms) throws IOException, CDKException { String line; // first atom index in this Molfile, the container may have // already had atoms present before reading the file int offset = container.getAtomCount() - nAtoms; Map sgroups = new LinkedHashMap<>(); LINES: while ((line = input.readLine()) != null) { int index, count, lnOffset; Sgroup sgroup; int length = line.length(); final PropertyKey key = PropertyKey.of(line); switch (key) { // A aaa // x... // // atom alias is stored as label on a pseudo atom case ATOM_ALIAS: index = readMolfileInt(line, 3) - 1; final String label = input.readLine(); if (label == null) return; label(container, offset + index, label); break; // V aaa v... // // an atom value is stored as comment on an atom case ATOM_VALUE: index = readMolfileInt(line, 3) - 1; final String comment = line.substring(7); container.getAtom(offset + index).setProperty(CDKConstants.COMMENT, comment); break; // G aaappp // x... // // Abbreviation is required for compatibility with previous versions of MDL ISIS/Desktop which // allowed abbreviations with only one attachment. The attachment is denoted by two atom // numbers, aaa and ppp. All of the atoms on the aaa side of the bond formed by aaa-ppp are // abbreviated. The coordinates of the abbreviation are the coordinates of aaa. The text of the // abbreviation is on the following line (x...). In current versions of ISIS, abbreviations can have any // number of attachments and are written out using the Sgroup appendixes. However, any ISIS // abbreviations that do have one attachment are also written out in the old style, again for // compatibility with older ISIS versions, but this behavior might not be supported in future // versions. case GROUP_ABBREVIATION: // not supported, existing parsing doesn't do what is // mentioned in the specification above // final int from = readMolfileInt(line, 3) - 1; // final int to = readMolfileInt(line, 6) - 1; final String group = input.readLine(); if (group == null) return; break; // M CHGnn8 aaa vvv ... // // vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes // all charge and radical values in the atom block, forcing a 0 charge on all atoms not // listed in an M CHG or M RAD line. case M_CHG: count = readUInt(line, 6, 3); for (int i = 0, st = 10; i < count && st + 7 <= length; i++, st += 8) { index = readMolfileInt(line, st) - 1; int charge = readMolfileInt(line, st + 4); container.getAtom(offset + index).setFormalCharge(charge); } break; // M ISOnn8 aaa vvv ... // // vvv: Absolute mass of the atom isotope as a positive integer. When present, this property // supersedes all isotope values in the atom block. Default (no entry) means natural // abundance. The difference between this absolute mass value and the natural // abundance value specified in the PTABLE.DAT file must be within the range of -18 // to +12. case M_ISO: count = readUInt(line, 6, 3); for (int i = 0, st = 10; i < count && st + 7 <= length; i++, st += 8) { index = readMolfileInt(line, st) - 1; int mass = readMolfileInt(line, st + 4); container.getAtom(offset + index).setMassNumber(mass); } break; // M RADnn8 aaa vvv ... // // vvv: Default of 0 = no radical, 1 = singlet (:), 2 = doublet ( . or ^), 3 = triplet (^^). When // present, this property supersedes all charge and radical values in the atom block, // forcing a 0 (zero) charge and radical on all atoms not listed in an M CHG or // M RAD line. case M_RAD: count = readUInt(line, 6, 3); for (int i = 0, st = 10; i < count && st + 7 <= length; i++, st += 8) { index = readMolfileInt(line, st) - 1; int value = readMolfileInt(line, st + 4); SPIN_MULTIPLICITY multiplicity = SPIN_MULTIPLICITY.ofValue(value); for (int e = 0; e < multiplicity.getSingleElectrons(); e++) container.addSingleElectron(offset + index); } break; // M RGPnn8 aaa rrr ... // // rrr: Rgroup number, value from 1 to 32 *, labels position of Rgroup on root. // // see also, RGroupQueryReader case M_RGP: count = readUInt(line, 6, 3); for (int i = 0, st = 10; i < count && st + 7 <= length; i++, st += 8) { index = readMolfileInt(line, st) - 1; int number = readMolfileInt(line, st + 4); label(container, offset + index, "R" + number); } break; // M ZZC aaa c... // // c: first character of the label, extends to EOL. // // Proprietary atom labels created by ACD/Labs ChemSketch using the Manual Numbering Tool. // This atom property appears to be undocumented, but experimentation leads to the following // specification (tested with ACD/ChemSketch version 12.00 Build 29305, 25 Nov 2008) // // It's not necessary to label any/all atoms but if a label is present, the following applies: // // The atom label(s) consist of an optional prefix, a required numeric label, and optional suffix. // // The numeric label is an integer in the range 0 - 999 inclusive. // // If present, the prefix and suffix can each contain 1 - 50 characters, from the set of printable // ASCII characters shown here // // !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~" // // In addition, both the prefix and suffix may contain leading and/or trailing and/or embedded // whitespace, included within the limit of 50 characters. These should be preserved when read. // // Long labels in the mol/sdfile are not truncated or wrapped onto multiple lines. As a result, the // line could be 114 characters in length (excluding the newline). // // By stopping and restarting the Manual Numbering Tool, it's possible to create non-sequential // or even duplicate numbers or labels. This is reasonable for the intended purpose of the tool - // labelling the structure as you wish. If unique labels are required, downstream processing will be // necessary to enforce this. // case M_ZZC: if (mode == Mode.STRICT) { throw new CDKException("Atom property ZZC is illegal in STRICT mode"); } index = readMolfileInt(line, 7) - 1; String atomLabel = line.substring(11); // DO NOT TRIM container.getAtom(offset + index).setProperty(CDKConstants.ACDLABS_LABEL, atomLabel); break; // M STYnn8 sss ttt ... // sss: Sgroup number // ttt: Sgroup type: SUP = abbreviation Sgroup (formerly called superatom), MUL = multiple group, // SRU = SRU type, MON = monomer, MER = Mer type, COP = copolymer, CRO = crosslink, // MOD = modification, GRA = graft, COM = component, MIX = mixture, // FOR = formulation, DAT = data Sgroup, ANY = any polymer, GEN = generic. // // Note: For a given Sgroup, an STY line giving its type must appear before any other line that // supplies information about it. For a data Sgroup, an SDT line must describe the data // field before the SCD and SED lines that contain the data (see Data Sgroup Data below). // When a data Sgroup is linked to another Sgroup, the Sgroup must already have been defined. // // Sgroups can be in any order on the Sgroup Type line. Brackets are drawn around Sgroups with the // M SDI lines defining the coordinates. case M_STY: count = readMolfileInt(line, 6); for (int i = 0; i < count; i++) { lnOffset = 10 + (i * 8); index = readMolfileInt(line, lnOffset); if (mode == Mode.STRICT && sgroups.containsKey(index)) handleError("STY line must appear before any other line that supplies Sgroup information"); sgroup = new Sgroup(); sgroups.put(index, sgroup); SgroupType type = SgroupType.parseCtabKey(line.substring(lnOffset + 4, lnOffset + 7)); if (type != null) sgroup.setType(type); } break; // Sgroup Subtype [Sgroup] // M SSTnn8 sss ttt ... // ttt: Polymer Sgroup subtypes: ALT = alternating, RAN = random, BLO = block case M_SST: count = readMolfileInt(line, 6); for (int i = 0, st = 10; i < count && st + 7 <= length; i++, st += 8) { sgroup = ensureSgroup(sgroups, readMolfileInt(line, st)); if (mode == Mode.STRICT && sgroup.getType() != SgroupType.CtabCopolymer) handleError("SST (Sgroup Subtype) specified for a non co-polymer group"); String sst = line.substring(st+4, st+7); if (mode == Mode.STRICT && !("ALT".equals(sst) || "RAN".equals(sst) || "BLO".equals(sst))) handleError("Invalid sgroup subtype: " + sst + " expected (ALT, RAN, or BLO)"); sgroup.putValue(SgroupKey.CtabSubType, sst); } break; // Sgroup Atom List [Sgroup] // M SAL sssn15 aaa ... // aaa: Atoms in Sgroup sss case M_SAL: sgroup = ensureSgroup(sgroups, readMolfileInt(line, 7)); count = readMolfileInt(line, 10); for (int i = 0, st = 14; i < count && st + 3 <= length; i++, st += 4) { index = readMolfileInt(line, st) - 1; sgroup.addAtom(container.getAtom(offset + index)); } break; // Sgroup Bond List [Sgroup] // M SBL sssn15 bbb ... // bbb: Bonds in Sgroup sss. // (For data Sgroups, bbb’s are the containment bonds, for all other // Sgroup types, bbb’s are crossing bonds.) case M_SBL: sgroup = ensureSgroup(sgroups, readMolfileInt(line, 7)); count = readMolfileInt(line, 10); for (int i = 0, st = 14; i < count && st + 3 <= length; i++, st += 4) { index = readMolfileInt(line, st) - 1; sgroup.addBond(container.getBond(offset + index)); } break; // Sgroup Hierarchy Information [Sgroup] // M SPLnn8 ccc ppp ... // ccc: Sgroup index of the child Sgroup // ppp: Sgroup index of the parent Sgroup (ccc and ppp must already be defined via an // STY line prior to encountering this line) case M_SPL: count = readMolfileInt(line, 6); for (int i = 0, st = 10; i < count && st + 6 <= length; i++, st += 8) { sgroup = ensureSgroup(sgroups, readMolfileInt(line, st)); sgroup.addParent(ensureSgroup(sgroups, readMolfileInt(line, st+4))); } break; // Sgroup Connectivity [Sgroup] // M SCNnn8 sss ttt ... // ttt: HH = head-to-head, HT = head-to-tail, EU = either unknown. // Left justified. case M_SCN: count = readMolfileInt(line, 6); for (int i = 0, st = 10; i < count && st + 6 <= length; i++, st += 8) { sgroup = ensureSgroup(sgroups, readMolfileInt(line, st)); String con = line.substring(st + 4, Math.min(length, st + 7)).trim(); if (mode == Mode.STRICT && !("HH".equals(con) || "HT".equals(con) || "EU".equals(con))) handleError("Unknown SCN type (expected: HH, HT, or EU) was " + con); sgroup.putValue(SgroupKey.CtabConnectivity, con); } break; // Sgroup Display Information // M SDI sssnn4 x1 y1 x2 y2 // x1,y1, Coordinates of bracket endpoints // x2,y2: case M_SDI: sgroup = ensureSgroup(sgroups, readMolfileInt(line, 7)); count = readMolfileInt(line, 10); assert count == 4; // fixed? sgroup.addBracket(new SgroupBracket(readMDLCoordinate(line, 13), readMDLCoordinate(line, 23), readMDLCoordinate(line, 33), readMDLCoordinate(line, 43))); break; // Sgroup subscript // M SMT sss m... // m...: Text of subscript Sgroup sss. // (For multiple groups, m... is the text representation of the multiple group multiplier. // For abbreviation Sgroups, m... is the text of the abbreviation Sgroup label.) case M_SMT: sgroup = ensureSgroup(sgroups, readMolfileInt(line, 7)); sgroup.putValue(SgroupKey.CtabSubScript, line.substring(11).trim()); break; // Sgroup Bracket Style // The format for the Sgroup bracket style is as follows: // M SBTnn8 sss ttt ... // where: // sss: Index of Sgroup // ttt: Bracket display style: 0 = default, 1 = curved (parenthetic) brackets // This appendix supports altering the display style of the Sgroup brackets. case M_SBT: count = readMolfileInt(line, 6); for (int i = 0, st = 10; i < count && st + 7 <= length; i++, st += 8) { sgroup = ensureSgroup(sgroups, readMolfileInt(line, st)); sgroup.putValue(SgroupKey.CtabBracketStyle, readMolfileInt(line, st+4)); } break; // Sgroup Expansion // M SDS EXPn15 sss ... // sss: Sgroup index of expanded abbreviation Sgroups case M_SDS: if ("EXP".equals(line.substring(7, 10))) { count = readMolfileInt(line, 10); for (int i = 0, st = 14; i < count && st + 3 <= length; i++, st += 4) { sgroup = ensureSgroup(sgroups, readMolfileInt(line, st)); sgroup.putValue(SgroupKey.CtabExpansion, true); } } else if (mode == Mode.STRICT) { handleError("Expected EXP to follow SDS tag"); } break; // Multiple Group Parent Atom List [Sgroup] // M SPA sssn15 aaa ... // aaa: Atoms in paradigmatic repeating unit of multiple group sss // Note: To ensure that all current molfile readers consistently // interpret chemical structures, multiple groups are written // in their fully expanded state to the molfile. The M SPA atom // list is a subset of the full atom list that is defined by the // Sgroup Atom List M SAL entry. case M_SPA: sgroup = ensureSgroup(sgroups, readMolfileInt(line, 7)); count = readMolfileInt(line, 10); Set parentAtomList = sgroup.getValue(SgroupKey.CtabParentAtomList); if (parentAtomList == null) { sgroup.putValue(SgroupKey.CtabParentAtomList, parentAtomList = new HashSet()); } for (int i = 0, st = 14; i < count && st + 3 <= length; i++, st += 4) { index = readMolfileInt(line, st) - 1; parentAtomList.add(container.getAtom(offset + index)); } break; // Sgroup Component Numbers [Sgroup] // M SNCnn8 sss ooo ... // sss: Index of component Sgroup // ooo: Integer component order (1...256). This limit applies only to MACCS-II case M_SNC: count = readMolfileInt(line, 6); for (int i = 0, st = 10; i < count && st + 7 <= length; i++, st += 8) { sgroup = ensureSgroup(sgroups, readMolfileInt(line, st)); sgroup.putValue(SgroupKey.CtabComponentNumber, readMolfileInt(line, st+4)); } break; // M END // // This entry goes at the end of the properties block and is required for molfiles which contain a // version stamp in the counts line. case M_END: break LINES; } } if (!sgroups.isEmpty()) { // load Sgroups into molecule, first we downcast List sgroupOrgList = new ArrayList<>(sgroups.values()); List sgroupCpyList = new ArrayList<>(sgroupOrgList.size()); for (int i = 0; i < sgroupOrgList.size(); i++) { Sgroup cpy = sgroupOrgList.get(i).downcast(); sgroupCpyList.add(cpy); } // update replaced parents for (int i = 0; i < sgroupOrgList.size(); i++) { Sgroup newSgroup = sgroupCpyList.get(i); Set oldParents = new HashSet<>(newSgroup.getParents()); newSgroup.removeParents(oldParents); for (Sgroup parent : oldParents) { newSgroup.addParent(sgroupCpyList.get(sgroupOrgList.indexOf(parent))); } } container.setProperty(CDKConstants.CTAB_SGROUPS, sgroupCpyList); } } private Sgroup ensureSgroup(Map map, int idx) throws CDKException { Sgroup sgroup = map.get(idx); if (sgroup == null) { if (mode == Mode.STRICT) handleError("Sgroups must first be defined by a STY property"); map.put(idx, sgroup = new Sgroup()); } return sgroup; } /** * Convert an MDL V2000 stereo value to the CDK {@link IBond.Stereo}. The * method should only be invoked for single/double bonds. If strict mode is * enabled irrational bond stereo/types cause errors (e.g. up double bond). * * @param stereo stereo value * @param type bond type * @return bond stereo * @throws CDKException the stereo value was invalid (strict mode). */ private IBond.Stereo toStereo(final int stereo, final int type) throws CDKException { switch (stereo) { case 0: return type == 2 ? IBond.Stereo.E_Z_BY_COORDINATES : IBond.Stereo.NONE; case 1: if (mode == Mode.STRICT && type == 2) throw new CDKException("stereo flag was 'up' but bond order was 2"); return IBond.Stereo.UP; case 3: if (mode == Mode.STRICT && type == 1) throw new CDKException("stereo flag was 'cis/trans' but bond order was 1"); return IBond.Stereo.E_OR_Z; case 4: if (mode == Mode.STRICT && type == 2) throw new CDKException("stereo flag was 'up/down' but bond order was 2"); return IBond.Stereo.UP_OR_DOWN; case 6: if (mode == Mode.STRICT && type == 2) throw new CDKException("stereo flag was 'down' but bond order was 2"); return IBond.Stereo.DOWN; } if (mode == Mode.STRICT) throw new CDKException("unknown bond stereo type: " + stereo); return IBond.Stereo.NONE; } /** * Determine the length of the line excluding trailing whitespace. * * @param str a string * @return the length when trailing white space is removed */ static int length(final String str) { int i = str.length() - 1; while (i >= 0 && str.charAt(i) == ' ') { i--; } return i + 1; } /** * Create an atom for the provided symbol. If the atom symbol is a periodic * element a new 'Atom' is created otherwise if the symbol is an allowed * query atom ('R', 'Q', 'A', '*', 'L', 'LP') a new 'PseudoAtom' is created. * If the symbol is invalid an exception is thrown. * * @param symbol input symbol * @param builder chem object builder * @return a new atom * @throws CDKException the symbol is not allowed */ private IAtom createAtom(String symbol, IChemObjectBuilder builder, int lineNum) throws CDKException { if (isPeriodicElement(symbol)) return builder.newInstance(IAtom.class, symbol); if (!isPseudoElement(symbol)) { handleError("invalid symbol: " + symbol, lineNum, 31, 34); // when strict only accept labels from the specification if (mode == Mode.STRICT) throw new CDKException("invalid symbol: " + symbol); } // will be renumbered later by RGP if R1, R2 etc. if not renumbered then // 'R' is a better label than 'R#' if now RGP is specified if (symbol.equals("R#")) symbol = "R"; IAtom atom = builder.newInstance(IPseudoAtom.class, symbol); atom.setSymbol(symbol); atom.setAtomicNumber(0); // avoid NPE downstream return atom; } /** * Is the symbol a periodic element. * * @param symbol a symbol from the input * @return the symbol is a pseudo atom */ private static boolean isPeriodicElement(final String symbol) { // XXX: PeriodicTable is slow - switch without file IO would be optimal Integer elem = PeriodicTable.getAtomicNumber(symbol); return elem != null && elem > 0; } /** * Is the atom symbol a non-periodic element (i.e. pseudo). Valid pseudo * atoms are 'R#', 'A', 'Q', '*', 'L' and 'LP'. We also accept 'R' but this * is not listed in the specification. * * @param symbol a symbol from the input * @return the symbol is a valid pseudo element */ static boolean isPseudoElement(final String symbol) { return PSEUDO_LABELS.contains(symbol); } /** * Read a coordinate from an MDL input. The MDL V2000 input coordinate has * 10 characters, 4 significant figures and is prefixed with whitespace for * padding: 'xxxxx.xxxx'. Knowing the format allows us to use an optimised * parser which does not consider exponents etc. * * @param line input line * @param offset first character of the coordinate * @return the specified value * @throws CDKException the coordinates specification was not valid */ static double readMDLCoordinate(final String line, int offset) throws CDKException { // to be valid the decimal should be at the fifth index (4 sig fig) if (line.charAt(offset + 5) != '.') throw new CDKException("invalid coordinate specification"); int start = offset; while (line.charAt(start) == ' ') start++; int sign = sign(line.charAt(start)); if (sign < 0) start++; int integral = readUInt(line, start, (offset + 5) - start); int fraction = readUInt(line, offset + 6, 4); return sign * (integral * 10000l + fraction) / 10000d; } /** * Convert the a character (from an MDL V2000 input) to a charge value: * 1 = +1, 2 = +2, 3 = +3, 4 = doublet radical, 5 = -1, 6 = -2, 7 = -3. * * @param c a character * @return formal charge */ private static int toCharge(final char c) { switch (c) { case '1': return +3; case '2': return +2; case '3': return +1; case '4': return 0; // doublet radical - superseded by M RAD case '5': return -1; case '6': return -2; case '7': return -3; } return 0; } /** * Obtain the sign of the character, -1 if the character is '-', +1 * otherwise. * * @param c a character * @return the sign */ private static int sign(final char c) { return c == '-' ? -1 : +1; } /** * Convert a character (ASCII code points) to an integer. If the character * was not a digit (i.e. space) the value defaults to 0. * * @param c a character * @return the numerical value */ private static int toInt(final char c) { // Character.getNumericalValue allows all of unicode which we don't want // or need it - imagine an MDL file with roman numerals! return c >= '0' && c <= '9' ? c - '0' : 0; } /** * Read an unsigned int value from the given index with the expected number * of digits. * * @param line input line * @param index start index * @param digits number of digits (max) * @return an unsigned int */ private static int readUInt(final String line, int index, int digits) { int result = 0; while (digits-- > 0) result = (result * 10) + toInt(line.charAt(index++)); return result; } /** * Optimised method for reading a integer from 3 characters in a string at a * specified index. MDL V2000 Molfile make heavy use of the 3 character ints * in the atom/bond and property blocks. The integer may be signed and * pre/post padded with white space. * * @param line input * @param index start index * @return the value specified in the string */ private static int readMolfileInt(final String line, final int index) { int sign = 1; int result = 0; char c; switch ((c = line.charAt(index))) { case ' ': break; case '-': sign = -1; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': result = (c - '0'); break; default: return 0; } switch ((c = line.charAt(index + 1))) { case ' ': if (result > 0) return sign * result; break; case '-': if (result > 0) return sign * result; sign = -1; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': result = (result * 10) + (c - '0'); break; default: return sign * result; } switch ((c = line.charAt(index + 2))) { case ' ': if (result > 0) return sign * result; break; case '-': if (result > 0) return sign * result; sign = -1; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': result = (result * 10) + (c - '0'); break; default: return sign * result; } return sign * result; } /** * Labels the atom at the specified index with the provide label. If the * atom was not already a pseudo atom then the original atom is replaced. * * @param container structure * @param index atom index to replace * @param label the label for the atom * @see IPseudoAtom#setLabel(String) */ static void label(final IAtomContainer container, final int index, final String label) { final IAtom atom = container.getAtom(index); final IPseudoAtom pseudoAtom = atom instanceof IPseudoAtom ? (IPseudoAtom) atom : container.getBuilder() .newInstance(IPseudoAtom.class); if (atom == pseudoAtom) { pseudoAtom.setLabel(label); } else { pseudoAtom.setSymbol(label); pseudoAtom.setAtomicNumber(0); pseudoAtom.setPoint2d(atom.getPoint2d()); pseudoAtom.setPoint3d(atom.getPoint3d()); pseudoAtom.setMassNumber(atom.getMassNumber()); pseudoAtom.setFormalCharge(atom.getFormalCharge()); pseudoAtom.setValency(atom.getValency()); pseudoAtom.setLabel(label); // XXX: would be faster to track all replacements and do it all in one AtomContainerManipulator.replaceAtomByAtom(container, atom, pseudoAtom); } } /** * Reads an atom from the input allowing for non-standard formatting (i.e * truncated lines) and chemical shifts. * * @param line input line * @param builder chem object builder * @param linecount the current line count * @return an atom to add to a container * @throws CDKException a CDK error occurred * @throws IOException the isotopes file could not be read */ private IAtom readAtomSlow(String line, IChemObjectBuilder builder, int linecount) throws CDKException, IOException { IAtom atom; Matcher trailingSpaceMatcher = TRAILING_SPACE.matcher(line); if (trailingSpaceMatcher.find()) { handleError("Trailing space found", linecount, trailingSpaceMatcher.start(), trailingSpaceMatcher.end()); line = trailingSpaceMatcher.replaceAll(""); } double x = Double.parseDouble(line.substring(0, 10).trim()); double y = Double.parseDouble(line.substring(10, 20).trim()); double z = Double.parseDouble(line.substring(20, 30).trim()); String element = line.substring(31, Math.min(line.length(), 34)).trim(); if (line.length() < 34) { handleError("Element atom type does not follow V2000 format type should of length three" + " and padded with space if required", linecount, 31, 34); } logger.debug("Atom type: ", element); IsotopeFactory isotopeFactory = Isotopes.getInstance(); if (isotopeFactory.isElement(element)) { atom = isotopeFactory.configure(builder.newInstance(IAtom.class, element)); } else if ("A".equals(element)) { atom = builder.newInstance(IPseudoAtom.class, element); } else if ("Q".equals(element)) { atom = builder.newInstance(IPseudoAtom.class, element); } else if ("*".equals(element)) { atom = builder.newInstance(IPseudoAtom.class, element); } else if ("LP".equals(element)) { atom = builder.newInstance(IPseudoAtom.class, element); } else if ("L".equals(element)) { atom = builder.newInstance(IPseudoAtom.class, element); } else if (element.equals("R") || (element.length() > 0 && element.charAt(0) == 'R')) { logger.debug("Atom ", element, " is not an regular element. Creating a PseudoAtom."); //check if the element is R String[] rGroup = element.split("^R"); if (rGroup.length > 1) { try { element = "R" + Integer.valueOf(rGroup[(rGroup.length - 1)]); atom = builder.newInstance(IPseudoAtom.class, element); } catch (Exception ex) { // This happens for atoms labeled "R#". // The Rnumber may be set later on, using RGP line atom = builder.newInstance(IPseudoAtom.class, "R"); } } else { atom = builder.newInstance(IPseudoAtom.class, element); } } else { handleError("Invalid element type. Must be an existing " + "element, or one in: A, Q, L, LP, *.", linecount, 32, 35); atom = builder.newInstance(IPseudoAtom.class, element); atom.setSymbol(element); } // store as 3D for now, convert to 2D (if totalZ == 0.0) later atom.setPoint3d(new Point3d(x, y, z)); // parse further fields if (line.length() >= 36) { String massDiffString = line.substring(34, 36).trim(); logger.debug("Mass difference: ", massDiffString); if (!(atom instanceof IPseudoAtom)) { try { int massDiff = Integer.parseInt(massDiffString); if (massDiff != 0) { IIsotope major = Isotopes.getInstance().getMajorIsotope(element); atom.setMassNumber(major.getMassNumber() + massDiff); } } catch (NumberFormatException | IOException exception) { handleError("Could not parse mass difference field.", linecount, 35, 37, exception); } } else { logger.error("Cannot set mass difference for a non-element!"); } } else { handleError("Mass difference is missing", linecount, 34, 36); } // set the stereo partiy Integer parity = line.length() > 41 ? Character.digit(line.charAt(41), 10) : 0; atom.setStereoParity(parity); if (line.length() >= 51) { String valenceString = removeNonDigits(line.substring(48, 51)); logger.debug("Valence: ", valenceString); if (!(atom instanceof IPseudoAtom)) { try { int valence = Integer.parseInt(valenceString); if (valence != 0) { //15 is defined as 0 in mol files if (valence == 15) atom.setValency(0); else atom.setValency(valence); } } catch (Exception exception) { handleError("Could not parse valence information field", linecount, 49, 52, exception); } } else { logger.error("Cannot set valence information for a non-element!"); } } if (line.length() >= 39) { String chargeCodeString = line.substring(36, 39).trim(); logger.debug("Atom charge code: ", chargeCodeString); int chargeCode = Integer.parseInt(chargeCodeString); if (chargeCode == 0) { // uncharged species } else if (chargeCode == 1) { atom.setFormalCharge(+3); } else if (chargeCode == 2) { atom.setFormalCharge(+2); } else if (chargeCode == 3) { atom.setFormalCharge(+1); } else if (chargeCode == 4) { } else if (chargeCode == 5) { atom.setFormalCharge(-1); } else if (chargeCode == 6) { atom.setFormalCharge(-2); } else if (chargeCode == 7) { atom.setFormalCharge(-3); } } else { handleError("Atom charge is missing", linecount, 36, 39); } try { // read the mmm field as position 61-63 String reactionAtomIDString = line.substring(60, 63).trim(); logger.debug("Parsing mapping id: ", reactionAtomIDString); try { int reactionAtomID = Integer.parseInt(reactionAtomIDString); if (reactionAtomID != 0) { atom.setProperty(CDKConstants.ATOM_ATOM_MAPPING, reactionAtomID); } } catch (Exception exception) { logger.error("Mapping number ", reactionAtomIDString, " is not an integer."); logger.debug(exception); } } catch (Exception exception) { // older mol files don't have all these fields... logger.warn("A few fields are missing. Older MDL MOL file?"); } //shk3: This reads shifts from after the molecule. I don't think this is an official format, but I saw it frequently 80=>78 for alk if (line.length() >= 78) { double shift = Double.parseDouble(line.substring(69, 80).trim()); atom.setProperty("first shift", shift); } if (line.length() >= 87) { double shift = Double.parseDouble(line.substring(79, 87).trim()); atom.setProperty("second shift", shift); } return atom; } /** * Read a bond line from an MDL V2000 molfile bond block (slow). The * explicit valence is also modified. * * @param line the input from the bond block * @param builder chem object builder * @param atoms array of atoms * @param explicitValence stores the explicit valence of each atom (bond order sum) * @param linecount the current line count * @return a new bond * @throws CDKException the bond line could not be parsed */ private IBond readBondSlow(String line, IChemObjectBuilder builder, IAtom[] atoms, int[] explicitValence, int linecount) throws CDKException { int atom1 = Integer.parseInt(line.substring(0, 3).trim()); int atom2 = Integer.parseInt(line.substring(3, 6).trim()); int order = Integer.parseInt(line.substring(6, 9).trim()); IBond.Stereo stereo = null; if (line.length() >= 12) { int mdlStereo = line.length() > 12 ? Integer.parseInt(line.substring(9, 12).trim()) : Integer.parseInt(line .substring(9).trim()); if (mdlStereo == 1) { // MDL up bond stereo = IBond.Stereo.UP; } else if (mdlStereo == 6) { // MDL down bond stereo = IBond.Stereo.DOWN; } else if (mdlStereo == 0) { if (order == 2) { // double bond stereo defined by coordinates stereo = IBond.Stereo.E_Z_BY_COORDINATES; } else { // bond has no stereochemistry stereo = IBond.Stereo.NONE; } } else if (mdlStereo == 3 && order == 2) { // unknown E/Z stereochemistry stereo = IBond.Stereo.E_OR_Z; } else if (mdlStereo == 4) { //MDL bond undefined stereo = IBond.Stereo.UP_OR_DOWN; } } else { handleError("Missing expected stereo field at line: ", linecount, 10, 12); } if (logger.isDebugEnabled()) { logger.debug("Bond: " + atom1 + " - " + atom2 + "; order " + order); } // interpret CTfile's special bond orders IAtom a1 = atoms[atom1 - 1]; IAtom a2 = atoms[atom2 - 1]; IBond newBond; if (order >= 1 && order <= 3) { IBond.Order cdkOrder = IBond.Order.SINGLE; if (order == 2) cdkOrder = IBond.Order.DOUBLE; if (order == 3) cdkOrder = IBond.Order.TRIPLE; if (stereo != null) { newBond = builder.newInstance(IBond.class, a1, a2, cdkOrder, stereo); } else { newBond = builder.newInstance(IBond.class, a1, a2, cdkOrder); } explicitValence[atom1 - 1] += cdkOrder.numeric(); explicitValence[atom2 - 1] += cdkOrder.numeric(); } else if (order == 4) { // aromatic bond if (stereo != null) { newBond = builder.newInstance(IBond.class, a1, a2, IBond.Order.UNSET, stereo); } else { newBond = builder.newInstance(IBond.class, a1, a2, IBond.Order.UNSET); } // mark both atoms and the bond as aromatic and raise the SINGLE_OR_DOUBLE-flag newBond.setFlag(CDKConstants.SINGLE_OR_DOUBLE, true); newBond.setFlag(CDKConstants.ISAROMATIC, true); a1.setFlag(CDKConstants.ISAROMATIC, true); a2.setFlag(CDKConstants.ISAROMATIC, true); explicitValence[atom1 - 1] = explicitValence[atom2 - 1] = Integer.MIN_VALUE; } else { newBond = new CTFileQueryBond(builder); IAtom[] bondAtoms = {a1, a2}; newBond.setAtoms(bondAtoms); newBond.setOrder(IBond.Order.UNSET); CTFileQueryBond.Type queryBondType = null; switch (order) { case 5: queryBondType = CTFileQueryBond.Type.SINGLE_OR_DOUBLE; break; case 6: queryBondType = CTFileQueryBond.Type.SINGLE_OR_AROMATIC; break; case 7: queryBondType = CTFileQueryBond.Type.DOUBLE_OR_AROMATIC; break; case 8: queryBondType = CTFileQueryBond.Type.ANY; break; } ((CTFileQueryBond) newBond).setType(queryBondType); newBond.setStereo(stereo); explicitValence[atom1 - 1] = explicitValence[atom2 - 1] = Integer.MIN_VALUE; } return newBond; } /** * Read the properties from the V2000 block (slow). * * @param input input source * @param container the container with the atoms / bonds loaded * @param nAtoms the number of atoms in the atom block * @param linecount the line count * @throws IOException internal low-level error * @throws CDKException the properties block could not be parsed */ private void readPropertiesSlow(BufferedReader input, IAtomContainer container, int nAtoms, int linecount) throws IOException, CDKException { logger.info("Reading property block"); String line; while (true) { line = input.readLine(); linecount++; if (line == null) { handleError("The expected property block is missing!", linecount, 0, 0); } if (line.startsWith("M END")) break; boolean lineRead = false; if (line.startsWith("M CHG")) { // FIXME: if this is encountered for the first time, all // atom charges should be set to zero first! int infoCount = Integer.parseInt(line.substring(6, 9).trim()); StringTokenizer st = new StringTokenizer(line.substring(9)); for (int i = 1; i <= infoCount; i++) { String token = st.nextToken(); int atomNumber = Integer.parseInt(token.trim()); token = st.nextToken(); int charge = Integer.parseInt(token.trim()); container.getAtom(atomNumber - 1).setFormalCharge(charge); } } else if (line.matches("A\\s{1,4}\\d+")) { // Reads the pseudo atom property from the mol file // The atom number of the to replaced atom int aliasAtomNumber = Integer.parseInt(line.replaceFirst("A\\s{1,4}", "")); String alias = input.readLine(); linecount++; IAtom aliasAtom = container.getAtom(aliasAtomNumber - 1); // skip if already a pseudoatom if (aliasAtom instanceof IPseudoAtom) { ((IPseudoAtom) aliasAtom).setLabel(alias); continue; } IAtom newPseudoAtom = container.getBuilder().newInstance(IPseudoAtom.class, alias); if (aliasAtom.getPoint2d() != null) newPseudoAtom.setPoint2d(aliasAtom.getPoint2d()); if (aliasAtom.getPoint3d() != null) newPseudoAtom.setPoint3d(aliasAtom.getPoint3d()); AtomContainerManipulator.replaceAtomByAtom(container, aliasAtom, newPseudoAtom); } else if (line.startsWith("M ISO")) { try { String countString = line.substring(6, 10).trim(); int infoCount = Integer.parseInt(countString); StringTokenizer st = new StringTokenizer(line.substring(10)); for (int i = 1; i <= infoCount; i++) { int atomNumber = Integer.parseInt(st.nextToken().trim()); int absMass = Integer.parseInt(st.nextToken().trim()); if (absMass != 0) { IAtom isotope = container.getAtom(atomNumber - 1); isotope.setMassNumber(absMass); } } } catch (NumberFormatException exception) { String error = "Error (" + exception.getMessage() + ") while parsing line " + linecount + ": " + line + " in property block."; logger.error(error); handleError("NumberFormatException in isotope information.", linecount, 7, 11, exception); } } else if (line.startsWith("M RAD")) { try { String countString = line.substring(6, 9).trim(); int infoCount = Integer.parseInt(countString); StringTokenizer st = new StringTokenizer(line.substring(9)); for (int i = 1; i <= infoCount; i++) { int atomNumber = Integer.parseInt(st.nextToken().trim()); int spinMultiplicity = Integer.parseInt(st.nextToken().trim()); MDLV2000Writer.SPIN_MULTIPLICITY spin = MDLV2000Writer.SPIN_MULTIPLICITY.NONE; if (spinMultiplicity > 0) { IAtom radical = container.getAtom(atomNumber - 1); switch (spinMultiplicity) { case 1: spin = MDLV2000Writer.SPIN_MULTIPLICITY.DOUBLET; break; case 2: spin = MDLV2000Writer.SPIN_MULTIPLICITY.SINGLET; break; case 3: spin = MDLV2000Writer.SPIN_MULTIPLICITY.TRIPLET; break; default: logger.debug("Invalid spin multiplicity found: " + spinMultiplicity); break; } for (int j = 0; j < spin.getSingleElectrons(); j++) { container.addSingleElectron(container.getBuilder().newInstance(ISingleElectron.class, radical)); } } } } catch (NumberFormatException exception) { String error = "Error (" + exception.getMessage() + ") while parsing line " + linecount + ": " + line + " in property block."; logger.error(error); handleError("NumberFormatException in radical information", linecount, 7, 10, exception); } } else if (line.startsWith("G ")) { try { String atomNumberString = line.substring(3, 6).trim(); int atomNumber = Integer.parseInt(atomNumberString); //String whatIsThisString = line.substring(6,9).trim(); String atomName = input.readLine(); // convert Atom into a PseudoAtom IAtom prevAtom = container.getAtom(atomNumber - 1); IPseudoAtom pseudoAtom = container.getBuilder().newInstance(IPseudoAtom.class, atomName); if (prevAtom.getPoint2d() != null) { pseudoAtom.setPoint2d(prevAtom.getPoint2d()); } if (prevAtom.getPoint3d() != null) { pseudoAtom.setPoint3d(prevAtom.getPoint3d()); } AtomContainerManipulator.replaceAtomByAtom(container, prevAtom, pseudoAtom); } catch (NumberFormatException exception) { String error = "Error (" + exception.toString() + ") while parsing line " + linecount + ": " + line + " in property block."; logger.error(error); handleError("NumberFormatException in group information", linecount, 4, 7, exception); } } else if (line.startsWith("M RGP")) { StringTokenizer st = new StringTokenizer(line); //Ignore first 3 tokens (overhead). st.nextToken(); st.nextToken(); st.nextToken(); //Process the R group numbers as defined in RGP line. while (st.hasMoreTokens()) { Integer position = Integer.valueOf(st.nextToken()); int rNumber = Integer.valueOf(st.nextToken()); // the container may have already had atoms before the new atoms were read int index = container.getAtomCount() - nAtoms + position - 1; IPseudoAtom pseudoAtom = (IPseudoAtom) container.getAtom(index); if (pseudoAtom != null) { pseudoAtom.setLabel("R" + rNumber); } } } if (line.startsWith("V ")) { Integer atomNumber = Integer.valueOf(line.substring(3, 6).trim()); IAtom atomWithComment = container.getAtom(atomNumber - 1); atomWithComment.setProperty(CDKConstants.COMMENT, line.substring(7)); } if (!lineRead) { logger.warn("Skipping line in property block: ", line); } } } /** * Read non-structural data from input and store as properties the provided * 'container'. Non-structural data appears in a structure data file (SDF) * after an Molfile and before the record deliminator ('$$$$'). The data * consists of one or more Data Header and Data blocks, an example is seen * below. * *
{@code
     * > 29 
     * 0.9132 - 20.0
     *
     * > 29 
     * 63.0 (737 MM)
     * 79.0 (42 MM)
     *
     * > 29 
     * SYLVAN
     *
     * > 29 
     * 09-23-1980
     *
     * > 29 
     * F-0213
     *
     * }
* * * @param input input source * @param container the container * @throws IOException an error occur whilst reading the input */ static void readNonStructuralData(final BufferedReader input, final IAtomContainer container) throws IOException { String line, header = null; boolean wrap = false; final StringBuilder data = new StringBuilder(80); while (!endOfRecord(line = input.readLine())) { final String newHeader = dataHeader(line); if (newHeader != null) { if (header != null) container.setProperty(header, data.toString()); header = newHeader; wrap = false; data.setLength(0); } else { if (data.length() > 0 || !line.equals(" ")) line = line.trim(); if (line.isEmpty()) continue; if (!wrap && data.length() > 0) data.append('\n'); data.append(line); wrap = line.length() == 80; } } if (header != null) container.setProperty(header, data.toString()); } /** * Obtain the field name from a potential SD data header. If the header * does not contain a field name, then null is returned. The method does * not currently return field numbers (e.g. DT<n>). * * @param line an input line * @return the field name */ static String dataHeader(final String line) { if (line.length() > 2 && line.charAt(0) != '>' && line.charAt(1) != ' ') return null; int i = line.indexOf('<', 2); if (i < 0) return null; int j = line.indexOf('>', i); if (j < 0) return null; return line.substring(i + 1, j); } /** * Is the line the end of a record. A line is the end of a record if it * is 'null' or is the SDF deliminator, '$$$$'. * * @param line a line from the input * @return the line indicates the end of a record was reached */ private static boolean endOfRecord(final String line) { return line == null || line.equals(RECORD_DELIMITER); } /** * Enumeration of property keys that can be specified in the V2000 property * block. */ enum PropertyKey { /** Atom Alias. */ ATOM_ALIAS, /** Atom Value. */ ATOM_VALUE, /** Group Abbreviation. */ GROUP_ABBREVIATION, /** Skip lines. */ SKIP, /** Charge [Generic]. */ M_CHG, /** Radical [Generic]. */ M_RAD, /** Isotope [Generic]. */ M_ISO, /** Ring Bond Count [Query]. */ M_RBC, /** Substitution Count [Query]. */ M_SUB, /** Unsaturated Atom [Query]. */ M_UNS, /** Link Atom [Query]. */ M_LIN, /** Atom List [Query]. */ M_ALS, /** Attachment Point [Rgroup]. */ M_APO, /** Atom Attachment Order [Rgroup]. */ M_AAL, /** Rgroup Label Location [Rgroup]. */ M_RGP, /** Rgroup Logic, Unsatisfied Sites, Range of Occurrence [Rgroup]. */ M_LOG, /** Sgroup Type [Sgroup]. */ M_STY, /** Sgroup Subtype [Sgroup]. */ M_SST, /** Sgroup Labels [Sgroup]. */ M_SLB, /** Sgroup Connectivity [Sgroup]. */ M_SCN, /** Sgroup Expansion [Sgroup]. */ M_SDS, /** Sgroup Atom List [Sgroup]. */ M_SAL, /** Sgroup Bond List [Sgroup]. */ M_SBL, /** Multiple Group Parent Atom List [Sgroup]. */ M_SPA, /** Sgroup Subscript [Sgroup]. */ M_SMT, /** Sgroup Correspondence [Sgroup]. */ M_CRS, /** Sgroup Display Information [Sgroup]. */ M_SDI, /** Superatom Bond and Vector Information [Sgroup]. */ M_SBV, /** Data Sgroup Field Description [Sgroup]. */ M_SDT, /** Data Sgroup Display Information [Sgroup]. */ M_SDD, /** Data Sgroup Data. */ M_SCD, /** Data Sgroup Data. */ M_SED, /** Sgroup Hierarchy Information. */ M_SPL, /** Sgroup Component Numbers. */ M_SNC, /** Sgroup Bracket Style. */ M_SBT, /** 3D Feature Properties. */ M_$3D, /** ACDLabs Atom Label */ M_ZZC, /** End of Block. */ M_END, /** Non-property header. */ UNKNOWN; /** Index of 'M XXX' properties for quick lookup. */ private static final Map mSuffix = new HashMap(60); static { for (PropertyKey p : values()) { if (p.name().charAt(0) == 'M') mSuffix.put(p.name().substring(2, 5), p); } } /** * Determine the property key of the provided line. * * @param line an property line * @return the key (defaults to {@link #UNKNOWN}) */ static PropertyKey of(final String line) { if (line.length() < 5) return UNKNOWN; switch (line.charAt(0)) { case 'A': if (line.charAt(1) == ' ' && line.charAt(2) == ' ') return ATOM_ALIAS; return UNKNOWN; case 'G': if (line.charAt(1) == ' ' && line.charAt(2) == ' ') return GROUP_ABBREVIATION; return UNKNOWN; case 'S': if (line.charAt(1) == ' ' && line.charAt(2) == ' ') return SKIP; return UNKNOWN; case 'V': if (line.charAt(1) == ' ' && line.charAt(2) == ' ') return ATOM_VALUE; return UNKNOWN; case 'M': if (line.charAt(1) != ' ' || line.charAt(2) != ' ') return UNKNOWN; PropertyKey property = mSuffix.get(line.substring(3, 6)); if (property != null) return property; return UNKNOWN; } return UNKNOWN; } } /** * Defines the version of the CTab. */ enum CTabVersion { V2000, V3000, UNSPECIFIED; /** * Given a CTab header, what version was specified. The version * is identifier in the by the presence of 'V[2|3]000'. If not * version tag is present the version is unspecified. * *
  5  5  0  0  0  0            999 V2000
         * 
  0  0  0  0  0  0            999 V3000
         *
         * @param header input line (non-null)
         * @return the CTab version
         */
        static CTabVersion ofHeader(String header) {
            if (header.length() < 39) return UNSPECIFIED;
            char c = header.charAt(34);
            if (c != 'v' && c != 'V') return UNSPECIFIED;
            if (header.charAt(35) == '2') // could check for '000'
                return V2000;
            if (header.charAt(35) == '3') // could check for '000'
                return V3000;
            return UNSPECIFIED;
        }
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy