All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.openscience.cdk.io.RGroupQueryReader Maven / Gradle / Ivy

There is a newer version: 2.10
Show newest version
/*
 * Copyright (C) 2010  Mark Rijnbeek 
 *
 * Contact: cdk-devel@lists.sourceforge.net
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 * All we ask is that proper credit is given for our work, which includes
 * - but is not limited to - adding the above copyright notice to the beginning
 * of your source code files, and to any copyright notice that you may
 * distribute with programs based on this work.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 */
package org.openscience.cdk.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;

import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.interfaces.IChemObject;
import org.openscience.cdk.interfaces.IChemObjectBuilder;
import org.openscience.cdk.interfaces.IPseudoAtom;
import org.openscience.cdk.io.formats.IResourceFormat;
import org.openscience.cdk.io.formats.RGroupQueryFormat;
import org.openscience.cdk.isomorphism.matchers.IRGroupQuery;
import org.openscience.cdk.isomorphism.matchers.RGroup;
import org.openscience.cdk.isomorphism.matchers.RGroupList;
import org.openscience.cdk.isomorphism.matchers.RGroupQuery;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;

/**
 * A reader for Symyx' Rgroup files (RGFiles).
 * An RGfile describes a single molecular query with Rgroups.
 * Each RGfile is a combination of Ctabs defining the root molecule and each
 * member of each Rgroup in the query.
 *
 * 

The RGFile format is described in the manual * * "CTFile Formats" , Chapter 5. * * @cdk.module io * @cdk.githash * @cdk.iooptions * * @cdk.keyword Rgroup * @cdk.keyword R group * @cdk.keyword R-group * @author Mark Rijnbeek */ public class RGroupQueryReader extends DefaultChemObjectReader { /** * Private bean style class to capture LOG (logic) lines. */ private class RGroupLogic { int rgoupNumberRequired; boolean restH; String occurence; } BufferedReader input = null; private static ILoggingTool logger = LoggingToolFactory.createLoggingTool(RGroupQueryReader.class); /** * Default constructor, input not set. */ public RGroupQueryReader() { this(new StringReader("")); } /** * Constructs a new RgroupQueryReader that can read RgroupAtomContainerSet * from a given InputStream. * @param in The InputStream to read from. */ public RGroupQueryReader(InputStream in) { this(new InputStreamReader(in)); } /** * Constructs a new RgroupQueryReader that can read RgroupAtomContainerSet * from a given Reader. * @param in The Reader to read from. */ public RGroupQueryReader(Reader in) { input = new BufferedReader(in); } /** * Sets the input Reader. 
* @param input Reader object * @throws CDKException */ @Override public void setReader(Reader input) throws CDKException { if (input instanceof BufferedReader) { this.input = (BufferedReader) input; } else { this.input = new BufferedReader(input); } } @Override public void setReader(InputStream input) throws CDKException { setReader(new InputStreamReader(input)); } @Override public IResourceFormat getFormat() { return RGroupQueryFormat.getInstance(); } @Override public boolean accepts(Class classObject) { Class[] interfaces = classObject.getInterfaces(); for (Class anInterface : interfaces) { if (IRGroupQuery.class.equals(anInterface)) return true; } Class superClass = classObject.getSuperclass(); if (superClass != null) return this.accepts(superClass); return false; } @Override public void close() throws IOException { input.close(); } /** * Check input IChemObject and proceed to parse. * Accepts/returns IChemObject of type RGroupQuery only. * @return IChemObject read from file * @param object class must be of type RGroupQuery */ @Override public T read(T object) throws CDKException { if (object instanceof RGroupQuery) { return (T) parseRGFile((RGroupQuery) object); } else { throw new CDKException("Reader only supports " + RGroupQuery.class.getName() + " objects"); } } /** * Parse the RGFile. Uses of {@link org.openscience.cdk.io.MDLV2000Reader} * to parse individual $CTAB blocks. * * @param rGroupQuery empty * @return populated query * @throws CDKException */ private RGroupQuery parseRGFile(RGroupQuery rGroupQuery) throws CDKException { IChemObjectBuilder defaultChemObjectBuilder = rGroupQuery.getBuilder(); String line = ""; int lineCount = 0; String eol = System.getProperty("line.separator"); StringTokenizer strTk = null; /* Variable to capture the LOG line(s) */ Map logicDefinitions = new HashMap(); /* * Variable to captures attachment order for Rgroups. 
Contains: - pseudo * atom (Rgroup) - map with (integer,bond) meaning "bond" has attachment * order "integer" (1,2,3) for the Rgroup The order is based on the atom * block, unless there is an AAL line for the pseudo atom. */ Map> attachmentPoints = new HashMap>(); try { // Process the Header block_________________________________________ //__________________________________________________________________ logger.info("Process the Header block"); checkLineBeginsWith(input.readLine(), "$MDL", ++lineCount); checkLineBeginsWith(input.readLine(), "$MOL", ++lineCount); checkLineBeginsWith(input.readLine(), "$HDR", ++lineCount); for (int i = 1; i <= 3; i++) { lineCount++; if (input.readLine() == null) { throw new CDKException("RGFile invalid, empty/null header line at #" + lineCount); } //optional: parse header info here (not implemented) } checkLineBeginsWith(input.readLine(), "$END HDR", ++lineCount); //Process the root structure (scaffold)_____________________________ //__________________________________________________________________ logger.info("Process the root structure (scaffold)"); checkLineBeginsWith(input.readLine(), "$CTAB", ++lineCount); //Force header StringBuilder sb = new StringBuilder(RGroup.ROOT_LABEL + "\n\n\n"); line = input.readLine(); ++lineCount; while (line != null && !line.equals("$END CTAB")) { sb.append(line + eol); //LOG lines: Logic, Unsatisfied Sites, Range of Occurrence. if (line.startsWith("M LOG")) { strTk = new StringTokenizer(line); strTk.nextToken(); strTk.nextToken(); strTk.nextToken(); RGroupLogic log = null; log = new RGroupLogic(); int rgroupNumber = Integer.valueOf(strTk.nextToken()); String tok = strTk.nextToken(); log.rgoupNumberRequired = tok.equals("0") ? 0 : Integer.valueOf(tok); log.restH = strTk.nextToken().equals("1") ? 
true : false; tok = ""; while (strTk.hasMoreTokens()) { tok += strTk.nextToken(); } log.occurence = tok; logicDefinitions.put(rgroupNumber, log); } line = input.readLine(); ++lineCount; } String rootStr = sb.toString(); //Let MDL reader process $CTAB block of the root structure. MDLV2000Reader reader = new MDLV2000Reader(new StringReader(rootStr), ISimpleChemObjectReader.Mode.STRICT); IAtomContainer root = reader.read(defaultChemObjectBuilder.newInstance(IAtomContainer.class)); rGroupQuery.setRootStructure(root); //Atom attachment order: parse AAL lines first strTk = new StringTokenizer(rootStr, eol); while (strTk.hasMoreTokens()) { line = strTk.nextToken(); if (line.startsWith("M AAL")) { StringTokenizer stAAL = new StringTokenizer(line); stAAL.nextToken(); stAAL.nextToken(); int pos = Integer.valueOf(stAAL.nextToken()); IAtom rGroup = root.getAtom(pos - 1); stAAL.nextToken(); Map bondMap = new HashMap(); while (stAAL.hasMoreTokens()) { pos = Integer.valueOf(stAAL.nextToken()); IAtom partner = root.getAtom(pos - 1); IBond bond = root.getBond(rGroup, partner); int order = Integer.valueOf(stAAL.nextToken()); bondMap.put(order, bond); logger.info("AAL " + order + " " + ((IPseudoAtom) rGroup).getLabel() + "-" + partner.getSymbol()); } if (bondMap.size() != 0) { attachmentPoints.put(rGroup, bondMap); } } } //Deal with remaining attachment points (non AAL) for (IAtom atom : root.atoms()) { if (atom instanceof IPseudoAtom) { IPseudoAtom rGroup = (IPseudoAtom) atom; if (rGroup.getLabel().startsWith("R") && !rGroup.getLabel().equals("R") && // only numbered ones !attachmentPoints.containsKey(rGroup)) { //Order reflects the order of atoms in the Atom Block int order = 0; Map bondMap = new HashMap(); for (IAtom atom2 : root.atoms()) { if (!atom.equals(atom2)) { for (IBond bond : root.bonds()) { if (bond.contains(atom) && bond.contains(atom2)) { bondMap.put(++order, bond); logger.info("Def " + order + " " + rGroup.getLabel() + "-" + atom2.getSymbol()); break; } } } } if 
(bondMap.size() != 0) { attachmentPoints.put(rGroup, bondMap); } } } } //Done with attachment points rGroupQuery.setRootAttachmentPoints(attachmentPoints); logger.info("Attachm.points defined for " + attachmentPoints.size() + " R# atoms"); //Process each Rgroup's $CTAB block(s)_____________________________ //__________________________________________________________________ //Set up the RgroupLists, one for each unique R# (# = 1..32 max) Map rGroupDefinitions = new HashMap(); for (IAtom atom : root.atoms()) { if (atom instanceof IPseudoAtom) { IPseudoAtom rGroup = (IPseudoAtom) atom; if (RGroupQuery.isValidRgroupQueryLabel(rGroup.getLabel())) { int rgroupNum = Integer.valueOf(rGroup.getLabel().substring(1)); RGroupList rgroupList = new RGroupList(rgroupNum); if (!rGroupDefinitions.containsKey(rgroupNum)) { logger.info("Define Rgroup R" + rgroupNum); RGroupLogic logic = logicDefinitions.get(rgroupNum); if (logic != null) { rgroupList.setRestH(logic.restH); rgroupList.setOccurrence(logic.occurence); rgroupList.setRequiredRGroupNumber(logic.rgoupNumberRequired); } else { rgroupList.setRestH(false); rgroupList.setOccurrence(">0"); rgroupList.setRequiredRGroupNumber(0); } rgroupList.setRGroups(new ArrayList()); rGroupDefinitions.put(rgroupNum, rgroupList); } } } } //Parse all $CTAB blocks per Rgroup (there can be more than one) line = input.readLine(); ++lineCount; boolean hasMoreRGP = true; while (hasMoreRGP) { checkLineBeginsWith(line, "$RGP", lineCount); line = input.readLine(); ++lineCount; logger.info("line for num is " + line); int rgroupNum = Integer.valueOf(line.trim()); line = input.readLine(); ++lineCount; boolean hasMoreCTAB = true; while (hasMoreCTAB) { checkLineBeginsWith(line, "$CTAB", lineCount); sb = new StringBuilder(RGroup.makeLabel(rgroupNum) + "\n\n\n"); line = input.readLine(); while (line != null && !line.startsWith("$END CTAB")) { sb.append(line + eol); line = input.readLine(); ++lineCount; } String groupStr = sb.toString(); reader = new 
MDLV2000Reader(new StringReader(groupStr), ISimpleChemObjectReader.Mode.STRICT); IAtomContainer group = reader.read(defaultChemObjectBuilder.newInstance(IAtomContainer.class)); RGroup rGroup = new RGroup(); rGroup.setGroup(group); //Parse the Rgroup's attachment points (APO) strTk = new StringTokenizer(groupStr, eol); while (strTk.hasMoreTokens()) { line = strTk.nextToken(); if (line.startsWith("M APO")) { StringTokenizer stAPO = new StringTokenizer(line); stAPO.nextToken(); stAPO.nextToken(); stAPO.nextToken(); while (stAPO.hasMoreTokens()) { int pos = Integer.valueOf(stAPO.nextToken()); int apo = Integer.valueOf(stAPO.nextToken()); IAtom at = group.getAtom(pos - 1); switch (apo) { case 1: rGroup.setFirstAttachmentPoint(at); break; case 2: rGroup.setSecondAttachmentPoint(at); break; case 3: { rGroup.setFirstAttachmentPoint(at); rGroup.setSecondAttachmentPoint(at); } break; } } } } RGroupList rList = rGroupDefinitions.get(rgroupNum); if (rList == null) { throw new CDKException("R" + rgroupNum + " not defined but referenced in $RGP."); } else { rList.getRGroups().add(rGroup); } line = input.readLine(); ++lineCount; if (line.startsWith("$END RGP")) { logger.info("end of RGP block"); hasMoreCTAB = false; } } line = input.readLine(); ++lineCount; if (line.startsWith("$END MOL")) { hasMoreRGP = false; } } rGroupQuery.setRGroupDefinitions(rGroupDefinitions); logger.info("Number of lines was " + lineCount); return rGroupQuery; } catch (CDKException exception) { String error = "CDK Error while parsing line " + lineCount + ": " + line + " -> " + exception.getMessage(); logger.error(error); logger.debug(exception); throw exception; } catch (IOException | IllegalArgumentException exception) { exception.printStackTrace(); String error = exception.getClass() + "Error while parsing line " + lineCount + ": " + line + " -> " + exception.getMessage(); logger.error(error); logger.debug(exception); throw new CDKException(error, exception); } } /** * Checks that a given line starts as 
expected, according to RGFile format. * @param line * @param expect * @param lineCount * @throws CDKException */ private void checkLineBeginsWith(String line, String expect, int lineCount) throws CDKException { if (line == null) { throw new CDKException("RGFile invalid, empty/null line at #" + lineCount); } if (!line.startsWith(expect)) { throw new CDKException("RGFile invalid, line #" + lineCount + " should start with:" + expect + "."); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy