com.hfg.chem.format.MDL_SDF Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.chem.format;
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.chem.Atom;
import com.hfg.chem.CovalentBond;
import com.hfg.chem.Element;
import com.hfg.chem.Molecule;
import com.hfg.bio.seq.format.SeqIOException;
import com.hfg.chem.ValenceModel;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
//------------------------------------------------------------------------------
/**
Basic implementation of the MDL SDF format.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class MDL_SDF extends ReadableChemFormatBase
{
// Matches a data item header line: '>' followed by whitespace and a name in angle brackets.
// Group 1 captures the (whitespace-free) name between '<' and '>'.
private static final Pattern ATTRIBUTE_HEADER_PATTERN = Pattern.compile(">\\s+<(\\S+)>");
// TODO: Add a way to specify the valence model or whether the MDL file uses the pre or post 2017 valence models
// Valence model used by setStructure() to calculate implicit hydrogen counts.
private ValenceModel mValenceModel = ValenceModel.MDL_2017;
// Lines of the molfile section of the record being read; cleared at the start of each readRecord() call.
private List mMolLines = new ArrayList<>(50);
// Atom and bond counts taken from the Counts line of the current record.
// Both are reset to null by readRecord() and stay null until parseCountsLine() sets them.
private Integer mAtomCount;
private Integer mBondCount;
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Creates an instance without a molecule factory (null is passed to the superclass constructor).
 NOTE(review): readRecord() refuses to run while getMoleculeFactory() returns null, so
 presumably a factory has to be supplied some other way before reading - confirm against the superclass.
 */
public MDL_SDF()
{
super(null);
}
//---------------------------------------------------------------------------
/**
 Creates an instance that hands the specified molecule factory to the superclass constructor.
 @param inMoleculeFactory the factory passed to the superclass; presumably the one later
                          returned by getMoleculeFactory() - confirm against the superclass
 */
public MDL_SDF(MoleculeFactory inMoleculeFactory)
{
super(inMoleculeFactory);
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Reports whether this format uses a "Janus" delimiter. Always false for this format.
 NOTE(review): the precise meaning of a Janus delimiter is defined by the superclass
 contract, which is not visible here - confirm there.
 @return false
 */
@Override
public boolean hasJanusDelimiter()
{
return false;
}
//---------------------------------------------------------------------------
/**
 Tests whether the specified line is the record terminator.
 @param inLine the line to examine; must not be null
 @return true if the line, ignoring leading and trailing whitespace, is exactly "$$$$"
 */
@Override
public boolean isEndOfRecord(String inLine)
{
   String trimmedLine = inLine.trim();
   return "$$$$".equals(trimmedLine);
}
//---------------------------------------------------------------------------
/**
 Reads one SDF record from the specified reader and builds a molecule from it.
 <p>
 Lines are consumed until the reader is exhausted or a record terminator line
 (see isEndOfRecord()) is reached. Everything up to and including the "M END" line is
 treated as the molfile section and parsed into the molecule's structure. If a data item
 header line is encountered first, the lines collected so far (if any) are parsed as the
 structure instead. After that, each data item header starts a new attribute whose value
 is the following non-blank lines (trimmed and joined with newlines); a blank line, the
 next header, the record terminator, or the end of the input completes the attribute.
 An attribute with no value lines is stored with a null value.
 </p>
 @param inReader the reader to read the record from
 @return the molecule created by the molecule factory and populated from the record
 @throws ChemIOException if reading or parsing fails; any other exception raised while
         processing the record is wrapped in a ChemIOException
 */
@Override
public synchronized T readRecord(BufferedReader inReader) throws ChemIOException
{
   if (null == getMoleculeFactory())
   {
      // NOTE(review): a ChemIOException would fit this class better, but the exception
      // type is left unchanged so that existing callers keep seeing the same type.
      throw new SeqIOException("No Molecule factory has been specified!");
   }

   T mol;
   try
   {
      mol = getMoleculeFactory().createMoleculeObj();

      // Reset the per-record parsing state
      mMolLines.clear();
      mAtomCount = null;
      mBondCount = null;

      // Compiled once per record instead of once per line
      Pattern molEndPattern = Pattern.compile("M\\s+END");

      StringBuilderPlus currentAttributeValue = new StringBuilderPlus().setDelimiter("\n");
      String currentAttributeName = null;
      boolean structureComplete = false;

      String line;
      while ((line = inReader.readLine()) != null)
      {
         if (isEndOfRecord(line))
         {
            // Stop at the terminator so that it can never be mistaken for attribute content
            break;
         }

         String trimmedLine = line.trim();

         if (! structureComplete)
         {
            // There might not be a molfile section, so watch for the first data item header.
            // The trimmed line is tested so that trailing whitespace cannot hide a header.
            if (ATTRIBUTE_HEADER_PATTERN.matcher(trimmedLine).matches())
            {
               structureComplete = true;
               if (mMolLines.size() > 0)
               {
                  setStructure(mol, mMolLines);
               }
               // Fall through: this header line starts the first attribute
            }
            else
            {
               // The untrimmed line is stored because the molfile uses fixed-width fields
               mMolLines.add(line);
               if (molEndPattern.matcher(trimmedLine).matches())
               {
                  structureComplete = true;
                  setStructure(mol, mMolLines);
               }

               continue;
            }
         }

         // The molecule record can optionally be followed by attributes
         if (! StringUtil.isSet(trimmedLine)) // A blank line is used to separate attributes
         {
            if (StringUtil.isSet(currentAttributeName))
            {
               mol.setAttribute(currentAttributeName, (currentAttributeValue.length() > 0 ? currentAttributeValue.toString() : null));
               currentAttributeName = null;
            }
         }
         else
         {
            Matcher m = ATTRIBUTE_HEADER_PATTERN.matcher(trimmedLine);
            if (m.matches())
            {
               if (currentAttributeName != null)
               {
                  // The blank line between attributes may have been missing
                  mol.setAttribute(currentAttributeName, (currentAttributeValue.length() > 0 ? currentAttributeValue.toString() : null));
               }

               currentAttributeName = m.group(1);
               currentAttributeValue.setLength(0);
            }
            else if (currentAttributeName != null)
            {
               currentAttributeValue.delimitedAppend(trimmedLine);
            }
         }
      }

      if (currentAttributeName != null)
      {
         // The blank line after the final attribute may have been missing
         mol.setAttribute(currentAttributeName, (currentAttributeValue.length() > 0 ? currentAttributeValue.toString() : null));
      }
   }
   catch (ChemIOException e)
   {
      throw e;
   }
   catch (Exception e)
   {
      throw new ChemIOException(e);
   }

   return mol;
}
//---------------------------------------------------------------------------
/**
 Populates the molecule from the lines of a molfile section.
 The header, Counts line, atom block, bond block, and property lines are parsed in that
 order, and then implicit hydrogens are added as explicit atoms.
 @param inMolecule the molecule to populate
 @param inMolLines the lines of the molfile section
 */
private void setStructure(T inMolecule, List inMolLines)
{
   parseMolHeader(inMolecule, inMolLines);
   parseCountsLine(inMolecule, inMolLines);
   parseAtomsBlock(inMolecule, inMolLines);
   parseBondsBlock(inMolecule, inMolLines);
   parseProperties(inMolecule, inMolLines);

   if (null == inMolecule.getAtoms())
   {
      // No atoms, so there is nothing to attach hydrogens to
      return;
   }

   // Iterate over a copy of the atom list since hydrogens are added to the molecule within the loop
   for (Atom existingAtom : new ArrayList<>(inMolecule.getAtoms()))
   {
      // Number of hydrogens the valence model says are implied for this atom
      int hydrogensToAdd = mValenceModel.calculateImplicitHCount(existingAtom);
      while (hydrogensToAdd-- > 0)
      {
         Atom hydrogen = new Atom(Element.HYDROGEN);
         inMolecule.addAtom(hydrogen);

         // The bond is registered with both of its atoms
         CovalentBond bond = new CovalentBond(existingAtom, hydrogen);
         existingAtom.addBond(bond);
         hydrogen.addBond(bond);
      }
   }
}
//---------------------------------------------------------------------------
/**
 Reads the molecule name from the molfile header.
 The first 3 lines constitute the header: line 1 may hold the molecule name, line 2
 optionally holds details of the software that generated the record, and line 3 holds
 an optional comment. Only line 1 is used, and only if all 3 header lines are present.
 @param inMolecule the molecule to name
 @param inMolLines the lines of the molfile section
 */
private void parseMolHeader(T inMolecule, List inMolLines)
{
   if (inMolLines.size() < 3)
   {
      // Incomplete header
      return;
   }

   String candidateName = inMolLines.get(0).trim();

   // "NO STRUCTURE" is treated as a placeholder rather than a name
   boolean isPlaceholder = candidateName.equals("NO STRUCTURE");
   if (StringUtil.isSet(candidateName)
       && ! isPlaceholder)
   {
      inMolecule.setName(candidateName);
   }
}
//---------------------------------------------------------------------------
/**
 Reads the atom and bond counts from the Counts line (the 4th line of the molfile section).
 The Counts line is composed of 12 fixed-length fields - the first eleven are
 3 characters long and the last is 6 characters long (39 characters in total).
 The first field is the number of atoms and the second is the number of bonds.
 Nothing is done if there are fewer than 4 lines or the 4th line is not set.
 @param inMolecule the molecule being populated (not used here)
 @param inMolLines the lines of the molfile section
 @throws ChemIOException if the Counts line is not exactly 39 characters long
 */
private void parseCountsLine(T inMolecule, List inMolLines)
{
   if (inMolLines.size() < 4)
   {
      return;
   }

   String countsLine = inMolLines.get(3);
   if (! StringUtil.isSet(countsLine))
   {
      return;
   }

   if (countsLine.length() != 39)
   {
      throw new ChemIOException("Unexpected Counts line length for " + StringUtil.singleQuote(countsLine) + "!");
   }

   // Field 1 (columns 1-3): number of atoms; field 2 (columns 4-6): number of bonds
   String atomCountField = countsLine.substring(0, 3).trim();
   String bondCountField = countsLine.substring(3, 6).trim();

   mAtomCount = Integer.parseInt(atomCountField);
   mBondCount = Integer.parseInt(bondCountField);

   // TODO: Chirality should be the 5th field (but 4th could be empty?)
}
//---------------------------------------------------------------------------
/**
 Parses the atom block and adds one atom to the molecule per atom line.
 Atom lines start at the 5th line of the molfile section. Each line is split on
 whitespace: tokens 1-3 are the x, y, and z coordinates, token 4 is the element,
 and token 6 is the charge code. Nothing is done if the Counts line has not been
 parsed or there are not enough lines for the declared number of atoms.
 @param inMolecule the molecule to add the atoms to
 @param inMolLines the lines of the molfile section
 */
private void parseAtomsBlock(T inMolecule, List inMolLines)
{
   if (null == mAtomCount
       || inMolLines.size() < 4 + mAtomCount)
   {
      return;
   }

   // Charge indexed by charge code: 1 ==> +3, 2 ==> +2, 3 ==> +1, 5 ==> -1, 6 ==> -2, 7 ==> -3.
   // TODO: 4 ==> Doublet radical (currently results in a charge of 0)
   final int[] chargeByCode = {0, 3, 2, 1, 0, -1, -2, -3};

   int firstAtomLineIndex = 4;
   for (int atomIndex = 0; atomIndex < mAtomCount; atomIndex++)
   {
      String atomLine = inMolLines.get(firstAtomLineIndex + atomIndex);
      String[] tokens = atomLine.trim().split("\\s+");

      Element element = Element.valueOf(tokens[3]);

      Atom atom = new Atom(element)
            .setXCoordinate(Float.valueOf(tokens[0]))
            .setYCoordinate(Float.valueOf(tokens[1]))
            .setZCoordinate(Float.valueOf(tokens[2]));

      int chargeCode = Integer.parseInt(tokens[5]);
      if (chargeCode != 0)
      {
         // Codes outside of the table result in a charge of 0
         boolean knownCode = (chargeCode > 0 && chargeCode < chargeByCode.length);
         atom.setCharge(knownCode ? chargeByCode[chargeCode] : 0);
      }

      inMolecule.addAtom(atom);
   }
}
//---------------------------------------------------------------------------
/**
 Parses the bond block and adds one covalent bond per bond line.
 Bond lines follow the atom lines. The first three fields of each line are the
 1-based number of the first atom, the 1-based number of the second atom, and the bond type.
 A bond type of 3 or less is used directly as the bond order. A bond type of 4 results in
 a bond of order 1 that is flagged as aromatic, along with both of its atoms. Other bond
 types leave the bond order as constructed. Nothing is done if the Counts line has not been
 parsed or there are not enough lines for the declared number of atoms and bonds.
 @param inMolecule the molecule whose atoms are to be bonded
 @param inMolLines the lines of the molfile section
 */
private void parseBondsBlock(T inMolecule, List inMolLines)
{
   if (null == mAtomCount
       || null == mBondCount
       || inMolLines.size() < 4 + mAtomCount + mBondCount)
   {
      return;
   }

   List<Atom> atoms = inMolecule.getAtoms();

   int firstBondLineIndex = 4 + mAtomCount;
   for (int i = 0; i < mBondCount; i++)
   {
      String bondLine = inMolLines.get(firstBondLineIndex + i);
      int[] bondFields = parseBondFields(bondLine);

      // Atom numbers in the bond block are 1-based
      Atom atom1 = atoms.get(bondFields[0] - 1);
      Atom atom2 = atoms.get(bondFields[1] - 1);

      CovalentBond bond = new CovalentBond(atom1, atom2);

      int bondOrder = bondFields[2];
      if (bondOrder <= 3)
      {
         bond.setBondOrder(bondOrder);
      }
      else if (bondOrder == 4)
      {
         // Aromatic bond
         bond.setBondOrder(1);
         atom1.setIsAromatic(true);
         atom2.setIsAromatic(true);
         bond.setIsAromatic();
      }

      atom1.addBond(bond);
      atom2.addBond(bond);
   }
}

//---------------------------------------------------------------------------
/**
 Extracts the first three integer fields of a bond line.
 The fields are read as fixed-width, 3-character columns first, since adjacent fields run
 together without any whitespace once atom numbers reach 100 (ex: '100101  1'). If the line
 is shorter than three columns or any column is not a plain unsigned integer, the line is
 assumed to be whitespace-delimited instead.
 @param inBondLine the untrimmed bond line
 @return an array of the first atom number, the second atom number, and the bond type
 */
private static int[] parseBondFields(String inBondLine)
{
   final int fieldWidth = 3;
   int[] values = new int[3];

   boolean parsedAsFixedWidth = inBondLine.length() >= values.length * fieldWidth;
   for (int i = 0; parsedAsFixedWidth && i < values.length; i++)
   {
      String field = inBondLine.substring(i * fieldWidth, (i + 1) * fieldWidth).trim();
      if (isUnsignedInteger(field))
      {
         values[i] = Integer.parseInt(field);
      }
      else
      {
         parsedAsFixedWidth = false;
      }
   }

   if (! parsedAsFixedWidth)
   {
      // Not laid out in fixed-width columns. Fall back to whitespace-delimited fields.
      String[] fields = inBondLine.trim().split("\\s+");
      for (int i = 0; i < values.length; i++)
      {
         values[i] = Integer.parseInt(fields[i]);
      }
   }

   return values;
}

//---------------------------------------------------------------------------
/**
 Tests whether the specified string is non-empty and composed only of the digits 0-9.
 */
private static boolean isUnsignedInteger(String inValue)
{
   if (inValue.isEmpty())
   {
      return false;
   }

   for (int i = 0; i < inValue.length(); i++)
   {
      char c = inValue.charAt(i);
      if (c < '0' || c > '9')
      {
         return false;
      }
   }

   return true;
}
//---------------------------------------------------------------------------
/**
 Parses the property lines that follow the bond block.
 Property lines start with an 'M' followed by whitespace and a property code.
 Only charge (CHG) lines are currently applied; isotope (ISO) lines are recognized
 but ignored. Nothing is done if the Counts line has not been parsed.
 <p>
 A charge line lists the number of entries followed by pairs of a 1-based atom number
 and a charge, and each listed atom has its charge set.
 </p>
 NOTE(review): per the CTfile specification, the presence of a charge line also forces a
 charge of 0 on atoms that are not listed. That is not implemented here.
 @param inMolecule the molecule whose atoms are to be updated
 @param inMolLines the lines of the molfile section
 */
private void parseProperties(T inMolecule, List inMolLines)
{
   if (null == mAtomCount
       || null == mBondCount)
   {
      return;
   }

   int firstPropertyLineIndex = 4 + mAtomCount + mBondCount;
   for (int i = firstPropertyLineIndex; i < inMolLines.size(); i++)
   {
      String propertyLine = inMolLines.get(i);
      if (isPropertyLine(propertyLine, "CHG"))
      {
         // Charge
         // Token 3 specifies the number of defined charges (up to 8).
         // Ea. defined charge consists of the atom # (1-based) and a charge
         String[] fields = propertyLine.split("\\s+");
         for (int index = 3; index + 1 < fields.length; index += 2)
         {
            int atomNum = Integer.parseInt(fields[index]);
            int charge  = Integer.parseInt(fields[index + 1]);

            Atom atom = inMolecule.getAtoms().get(atomNum - 1);
            atom.setCharge(charge);
         }
      }
      else if (isPropertyLine(propertyLine, "ISO"))
      {
         // Isotope
         // TODO
      }
   }
}

//---------------------------------------------------------------------------
/**
 Tests whether the specified line starts with 'M', one or more whitespace characters,
 and then the specified property code. Any amount of whitespace after the 'M' is
 accepted so that both 'M CHG' and 'M  CHG' are recognized.
 @param inLine the untrimmed property line
 @param inPropertyCode the property code to look for (ex: "CHG")
 @return true if the line is a property line with the specified code
 */
private static boolean isPropertyLine(String inLine, String inPropertyCode)
{
   if (! inLine.startsWith("M"))
   {
      return false;
   }

   int codeStart = 1;
   while (codeStart < inLine.length()
          && Character.isWhitespace(inLine.charAt(codeStart)))
   {
      codeStart++;
   }

   // At least one whitespace character must separate the 'M' from the code
   return codeStart > 1 && inLine.startsWith(inPropertyCode, codeStart);
}
}