All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.xmlcml.cml.tools.SMILESTool Maven / Gradle / Ivy

/**
 *    Copyright 2011 Peter Murray-Rust et. al.
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

package org.xmlcml.cml.tools;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Pattern;

import nu.xom.Attribute;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.xmlcml.cml.base.AbstractTool;
import org.xmlcml.cml.base.CMLConstants;
import org.xmlcml.cml.base.CMLElements;
import org.xmlcml.cml.element.CMLAtom;
import org.xmlcml.cml.element.CMLAtomArray;
import org.xmlcml.cml.element.CMLAtomParity;
import org.xmlcml.cml.element.CMLBond;
import org.xmlcml.cml.element.CMLBondStereo;
import org.xmlcml.cml.element.CMLMolecule;
import org.xmlcml.cml.element.CMLMolecule.HydrogenControl;
import org.xmlcml.euclid.Util;
import org.xmlcml.molutil.ChemicalElement;


/**
 * Tool that parses SMILES strings into CML molecules and writes molecules back out as SMILES. Not fully developed.
 * 
 * @author pmr
 * 
 */
public class SMILESTool extends AbstractTool {

	/** Class logger; its level is pinned to WARN by the static initializer below. */
	private static final Logger LOG = Logger.getLogger(SMILESTool.class);
	static {
		LOG.setLevel(Level.WARN);
	}
	
    /** SMILES single-bond symbol. */
    public final static char C_SINGLE   = '-';
    /** SMILES double-bond symbol. */
    public final static char C_DOUBLE   = '=';
    /** SMILES triple-bond symbol. */
    public final static char C_TRIPLE   = '#';
    /** SMILES aromatic-bond symbol. */
    public final static char C_AROMATIC = ':';
    /** Sentinel meaning "no bond/slash character pending". */
    public final static char C_NONE       = 0;
    /** Sentinel char value 1; not referenced inside this class. */
    public final static char C_FORBIDDEN  = 1;
    /** The digit zero. */
    public final static char C_ZERO       = '0';
    /** Separator between disconnected fragments. */
    public final static char C_DOT        = '.';
    /** Opens a branch. */
    public final static char C_LBRAK      = '(';
    /** Closes a branch. */
    public final static char C_RBRAK      = ')';
    /** Opens a bracket atom. */
    public final static char C_LSQUARE    = '[';
    /** Negative charge sign inside a bracket atom (same character as {@link #C_SINGLE}). */
    public final static char C_MINUS      = '-';
    /** Positive charge sign inside a bracket atom. */
    public final static char C_PLUS       = '+';
    /** Closes a bracket atom. */
    public final static char C_RSQUARE    = ']';
    /** Directional bond marker used for double-bond stereo. */
    public final static char C_SLASH      = '/';
    /** Directional bond marker used for double-bond stereo. */
    public final static char C_BACKSLASH  = '\\';
    /** Prefix of a two-digit ring number. */
    public final static char C_PERC       = '%';
    /** Chirality marker {@code @}. */
    public final static String S_AT     = "@";
    /** Chirality marker {@code @@}. */
    public final static String S_ATAT   = "@@";

    /** Aromatic carbon symbol. */
    public final static char C_c 		= 'c';
    /** Aromatic nitrogen symbol. */
    public final static char C_n 		= 'n';
    /** Aromatic oxygen symbol. */
    public final static char C_o 		= 'o';
    /** Aromatic phosphorus symbol. */
    public final static char C_p 		= 'p';
    /** Aromatic sulfur symbol. */
    public final static char C_s 		= 's';
    /** Aromatic arsenic symbol (bracket atoms only). */
    public final static String S_as 	= "as";
    /** Aromatic selenium symbol (bracket atoms only). */
    public final static String S_se 	= "se";
    /** Parsed as an "R" placeholder atom. */
    public final static char C__ ='_';
    /** Parsed as an "R" placeholder atom. */
    public final static char C_$ ='$';

    // names of the temporary attributes put on atoms while parsing
    private static final String AROMATIC 	= "aromatic";
    private static final String CHIRAL 	= "chiral";
    private static final String TRUE 		= "true";

    // element symbols allowed outside square brackets
    private static final String B   		= "B";
    private static final String C   		= "C";
    private static final String N   		= "N";
    private static final String O   		= "O";
    private static final String P   		= "P";
    private static final String S   		= "S";
    private static final String F   		= "F";
    private static final String CL   		= "Cl";
    private static final String BR   		= "Br";
    private static final String I   		= "I";

    // dummy element type given to a bracket atom until its real symbol is parsed
    private static final String DU   		= "Du";
    //private static String SI   		= "Si";
    private static final String SLASH   	= "slash";
    
    /** Molecule being built by the current parse (or supplied via setMolecule). */
    private CMLMolecule molecule;
    /** Atom that the next atom/ring digit attaches to; null at the start and after a dot. */
    private CMLAtom currentAtom;
    /** Most recent ring-closure bond. */
    private CMLBond currentBond;
    /** Value of currentAtom at the start of the token being processed. */
    private CMLAtom lastAtom;
    /** Pending explicit bond symbol, or C_NONE. */
    private char bondChar;
    /** Counter used to generate "a&lt;n&gt;" atom ids. */
    private int natoms;
    /** Counter used to generate "r&lt;n&gt;" placeholder atom ids. */
    private int nrs;
    private HydrogenControl hydrogenControl;
    /** Atom id recorded per character position of the expanded SMILES. */
    private List<String> atomIdList;
    /** Atom text recorded per character position of the expanded SMILES. */
    private List<String> atomChunkList;
//    private List bondIdList;
//    private List bondChunkList;
//    private List ringIdList;
//    private List ringChunkList;
    /** SMILES after {..}*(n) expansion and trimming; this is what the parser walks. */
    private String rawSmiles;
    /** SMILES exactly as passed to parseSMILES; used in error messages. */
    private String smilescopy;
    /** Length of rawSmiles. */
    private int sLength;
//	private String scopy;
    
    
	/**
	 * Contains an atom and the order of the bond that it will form when a corresponding ring closure is found
	 */
	private class RingOpening {
		/**The atom that will be bonded to when a corresponding ring closure is found*/
		CMLAtom atom;
		/**The order of the bond about to be formed can be either C_SINGLE, C_DOUBLE, C_TRIPLE, or C_AROMATIC.*/
		char bondChar;


		/**
		 * An Atom and a bond order
		 * @param a
		 * @param bondOrderVal
		 */
		RingOpening(CMLAtom a, char bondOrderChar) {
			atom = a;
			bondChar = bondOrderChar;
		}
	}
	
    /**
     * Creates a tool with no molecule yet; hydrogen handling defaults to
     * {@link HydrogenControl#NO_EXPLICIT_HYDROGENS}.
     */
    public SMILESTool() {
    	this.hydrogenControl = HydrogenControl.NO_EXPLICIT_HYDROGENS;
    }
    
    /**
     * Creates a tool with default hydrogen handling and attaches the given molecule.
     *
     * @param molecule molecule handed to {@link #setMolecule(CMLMolecule)}
     */
    public SMILESTool(CMLMolecule molecule) {
    	this();
    	setMolecule(molecule);
    }
    
	/**
	 * Parses a SMILES string into a new molecule held by this tool.
	 *
	 * The string is first expanded and trimmed by initSmiles, then walked one
	 * token at a time: branches, bracket atoms, bond symbols, directional
	 * (slash) bonds, dots, ring numbers (single digit or %nn), organic-subset
	 * atoms and the '_' / '$' placeholder atoms. Afterwards cis/trans
	 * stereo is marked up, parser-only attributes are removed, dot-separated
	 * fragments are partitioned, aromatic bonds are set and hydrogens added.
	 *
	 * @param sss the SMILES string; {@code null} is silently ignored
	 * @throws RuntimeException if the string cannot be interpreted
	 */
	public void parseSMILES(String sss) {

		if (sss == null) {
			return;
		}
		initSmiles(sss);

		final Stack<CMLAtom> stack = new Stack<CMLAtom>();
		// indexed directly by ring number: 0-9 from plain digits, 10-99 from %nn
		final RingOpening[] rings = new RingOpening[100];
		int i = 0;
		char slashChar = C_NONE;
		boolean hasDot = false;
		while (i < sLength) {
			lastAtom = currentAtom;
			final char c = rawSmiles.charAt(i);
			if (c == C_LBRAK) {
				stack.push(currentAtom);
				i++;
			} else if (c == C_RBRAK) {
				if (stack.isEmpty()) {
					throw new RuntimeException("Unexpected "+C_RBRAK);
				}
				currentAtom = stack.pop();
				bondChar = C_NONE;
				i++;
			} else if (c == C_LSQUARE) {
				final int idx = rawSmiles.indexOf(C_RSQUARE, i);
				if (idx == -1) {
					throw new RuntimeException("Unbalanced "+C_LSQUARE);
				}
				final int atomStartChar = i;
				final String atomString = rawSmiles.substring(i+1, idx);
				i = idx + 1;
				currentAtom = addExtendedAtom(atomString, slashChar, atomStartChar, atomString);
				bondChar = C_NONE;
				slashChar = C_NONE;
			} else if (c == C_SINGLE || c == C_AROMATIC || c == C_DOUBLE || c == C_TRIPLE) {
				if (bondChar != C_NONE || i==0 || i==sLength-1) {
					throw new RuntimeException("Bond not expected here: "+rawSmiles.substring(i));
				}
				bondChar = c;
				i++;
			} else if (c == C_SLASH || c == C_BACKSLASH) {
				if (currentAtom == null) {
					// previously a NullPointerException; report the real problem instead
					throw new RuntimeException("Directional bond "+c+" has no preceding atom: "+rawSmiles.substring(i));
				}
				slashChar = c;
				currentAtom.setAttribute(SLASH, ""+c);
				i++;
			} else if (c == C_DOT) {
				hasDot = true;
				currentAtom = null;
				i++;
			} else if (Character.isDigit(c) || c == C_PERC) {
				if (currentAtom == null) {
					throw new RuntimeException("Ring: "+ c + " does not have the starting or ending atom defined!");
				}

				final int ring;
				if (c == C_PERC) {//support for using % syntax to define ring openings
					i++;
					if (i + 1 >= sLength) {
						throw new RuntimeException("Expected two digit number after % sign in SMILES. Found end of SMILES");
					}
					final String twoDigits = rawSmiles.substring(i, i+2);
					// check digits explicitly: Integer.parseInt would also accept "+1" or "-1"
					if (!Character.isDigit(twoDigits.charAt(0)) || !Character.isDigit(twoDigits.charAt(1))) {
						throw new RuntimeException("Expected two digit number after % sign in string. Found: "
								+twoDigits);
					}
					ring = Integer.parseInt(twoDigits);
					i++; // now on the second digit; the common i++ below steps past it
				} else {
					// Character.digit keeps non-ASCII digits inside 0-9
					ring = Character.digit(c, 10);
				}
				if (rings[ring] == null) {
					// start of ring
					rings[ring] = new RingOpening(currentAtom, bondChar);
					bondChar = C_NONE;

					//Notes the order rings connect to a chiral atom
					if (currentAtom.getAttributeValue(CHIRAL) != null) {
						swapParityRef(currentAtom, "", "ring$#!"+ring);
					}
				} else {
					// end of ring
					final RingOpening opening = rings[ring];

					//Updates chiral atom with id of atom that has joined to it
					if (opening.atom.getAttributeValue(CHIRAL) != null) {
						swapParityRef(opening.atom, "ring$#!"+ring, currentAtom.getId());
					}
					final char closingBond;
					if (bondChar == C_NONE) {
						closingBond = opening.bondChar;
					} else if (opening.bondChar == C_NONE || opening.bondChar == bondChar) {
						closingBond = bondChar;
					} else {
						throw new RuntimeException("Ring opening " + ring + " has two specifications as to what order bond it should form: " + sss);
					}
					currentBond = addBond(opening.atom, currentAtom, closingBond, rawSmiles);
					// the bond symbol is consumed by the closure whichever end supplied it
					bondChar = C_NONE;
					rings[ring] = null;
				}
				i++;
			} else if (Character.isLetter(c)) {
				final int atomStartChar = i;
				final String atomString = grabOrganicAtom(rawSmiles.substring(i));
				i += atomString.length();
				addAtom(atomString, slashChar, rawSmiles, atomStartChar, atomString);
				bondChar = C_NONE;
				slashChar = C_NONE;
			} else if (c == C__ || c == C_$) {
				final CMLAtom atom = new CMLAtom("r"+(++nrs));
				molecule.addAtom(atom);
				setElementType(atom, "R");
				if (slashChar != C_NONE) {
					atom.setAttribute(SLASH, ""+slashChar);
				}
				// record the placeholder at its own character position
				atomIdList.set(i, atom.getId());
				atomChunkList.set(i, "R");
				if (currentAtom != null) {
					addBond(currentAtom, atom, bondChar, rawSmiles);
				}
				currentAtom = atom;
				// pending bond/slash belong to this atom only, as for ordinary atoms
				bondChar = C_NONE;
				slashChar = C_NONE;
				i++;
			} else {
				throw new RuntimeException("Cannot interpret SMILES: "+rawSmiles.substring(i));
			}
		}

		for (int r = 0; r < rings.length; r++) {
			if (rings[r] != null) {
				throw new RuntimeException("Ring: "+ r +" not closed! "+smilescopy);
			}
		}

		markupDoubleBondCisTrans();
		removeSmilesSpecificAttributes();

		if (hasDot) {
			new ConnectionTableTool(molecule).partitionIntoMolecules();
		}
		makeAromaticBonds();
		addHydrogens();
	}

	/**
	 * On every atomParity child of chiralAtom, replaces the first atomRefs4
	 * entry equal to expected with replacement.
	 */
	private static void swapParityRef(CMLAtom chiralAtom, String expected, String replacement) {
		final CMLElements<CMLAtomParity> parities = chiralAtom.getAtomParityElements();
		for (CMLAtomParity parity : parities) {
			final String[] refs = parity.getAtomRefs4();
			for (int k = 0; k < refs.length; k++) {
				if (expected.equals(refs[k])) {
					refs[k] = replacement;
					break;
				}
			}
			parity.setAtomRefs4(refs);
		}
	}

	/**
	 * Resets all parser state for a new SMILES string.
	 *
	 * Keeps the original string for error messages, stores the expanded and
	 * trimmed form as rawSmiles, creates a fresh molecule and pre-fills the
	 * per-character atom tables with one null per character.
	 *
	 * @param sss SMILES string as supplied by the caller
	 */
	private void initSmiles(String sss) {
		smilescopy = sss;
		rawSmiles = expandString(sss).trim();
		sLength = rawSmiles.length();

		molecule = new CMLMolecule();
		currentAtom = null;
		lastAtom = null;
		currentBond = null;
		bondChar = C_NONE;
		// restart id numbering so a reused tool generates a1, a2... for each new molecule
		natoms = 0;
		nrs = 0;

		atomIdList    = new ArrayList<String>(sLength);
		atomChunkList = new ArrayList<String>(sLength);
		for (int i = 0; i < sLength; i++) {
			atomIdList.add(null);
			atomChunkList.add(null);
		}
	}

    /**
     * Removes the parser-only "slash" and "chiral" attributes from every atom
     * of the molecule.
     */
    private void removeSmilesSpecificAttributes() {
        final CMLAtomArray atomArray = molecule.getAtomArray();
        if (atomArray == null) {
            // defensive: presumably absent when no atom was parsed (e.g. empty SMILES) - TODO confirm
            return;
        }
        final List<CMLAtom> atoms = atomArray.getAtoms();
        for (CMLAtom atom : atoms) {
            if (atom.getAttributeValue(SLASH) != null) {
                atom.removeAttribute(SLASH);
            }
            if (atom.getAttributeValue(CHIRAL) != null) {
                atom.removeAttribute(CHIRAL);
            }
        }
    }

	/**
	 * Adds bondStereo children to double bonds flanked by directional bonds,
	 * e.g. {@code C\C=C\C=C\C}.
	 *
	 * For each slash-marked atom a chain of four slash-marked atoms is sought
	 * (see recurseThroughAtoms). Every double bond on the second atom of the
	 * chain that has no bondStereo yet gets one whose atomRefs4 are the four
	 * chain atoms; the content is "T" when the second and third atoms carry
	 * the same slash character and "C" when they differ.
	 */
	private void markupDoubleBondCisTrans() {
		final CMLAtomArray atomArray = molecule.getAtomArray();
		if (atomArray == null) {
			// defensive: presumably absent when no atom was parsed - TODO confirm
			return;
		}
		final List<CMLAtom> atoms = atomArray.getAtoms();
		for (CMLAtom atom : atoms) {
			if (atom.getAttributeValue(SLASH) == null) {
				continue;
			}
			final Set<CMLAtom> atomsVisited = new HashSet<CMLAtom>();
			final List<CMLAtom> chain =
				recurseThroughAtoms(atomsVisited, new ArrayList<CMLAtom>(), atom, 0);
			if (chain.size() != 4) {
				continue;
			}
			final List<CMLBond> bondsOnSecondAtom = chain.get(1).getLigandBonds();
			for (CMLBond bond : bondsOnSecondAtom) {
				if (!CMLBond.isDouble(bond.getOrder()) ||
						bond.query("cml:bondStereo", CMLConstants.CML_XPATH).size() != 0) {
					continue;
				}
				final CMLBondStereo bondstereo = new CMLBondStereo();
				bondstereo.setAtomRefs4(
						new String[]{
						chain.get(0).getId(),
						chain.get(1).getId(),
						chain.get(2).getId(),
						chain.get(3).getId()
						});

				final char bond1Slashtype = chain.get(1).getAttributeValue(SLASH).charAt(0);
				final char bond2Slashtype = chain.get(2).getAttributeValue(SLASH).charAt(0);

				if ((bond1Slashtype == C_SLASH && bond2Slashtype == C_SLASH) ||
					(bond1Slashtype == C_BACKSLASH && bond2Slashtype == C_BACKSLASH)) {
					bondstereo.setXMLContent("T");
				} else if ((bond1Slashtype == C_BACKSLASH && bond2Slashtype == C_SLASH) ||
					(bond1Slashtype == C_SLASH && bond2Slashtype == C_BACKSLASH)) {
					bondstereo.setXMLContent("C");
				}
				bond.appendChild(bondstereo);
			}
		}
	}
    


	/**
	 * Depth-first search for a chain of four slash-marked atoms whose middle
	 * bond (between positions 1 and 2) is a double bond.
	 *
	 * Each attempt to extend the chain works on its own copy of the path, so
	 * a failed branch cannot leave stray atoms behind for the next attempt.
	 *
	 * @param atomsVisited atoms already tried; never revisited
	 * @param atomRefs4sInDoubleBond path so far; must hold exactly i atoms
	 * @param atom atom to place at position i
	 * @param i position of atom in the chain (0..3)
	 * @return a list of four atoms on success, otherwise a shorter list
	 */
	private List<CMLAtom> recurseThroughAtoms(Set<CMLAtom> atomsVisited,
			List<CMLAtom> atomRefs4sInDoubleBond, CMLAtom atom, int i) {
		atomsVisited.add(atom);
		atomRefs4sInDoubleBond.add(i, atom);
		if (i == 3) {
			return atomRefs4sInDoubleBond;
		}
		final List<CMLAtom> connectedAtoms = atom.getLigandAtoms();
		for (CMLAtom atom2 : connectedAtoms) {
			if (atom2.getAttributeValue(SLASH) == null || atomsVisited.contains(atom2)) {
				continue;
			}
			if (i == 1 && !CMLBond.isDouble(molecule.getBond(atom, atom2).getOrder())) {
				continue;//bond between 2nd and 3rd atom should be a double bond
			}
			final List<CMLAtom> attempt = recurseThroughAtoms(
					atomsVisited, new ArrayList<CMLAtom>(atomRefs4sInDoubleBond), atom2, i+1);
			if (attempt.size() == 4) {
				return attempt;
			}
		}
		return atomRefs4sInDoubleBond;
	}


	/**
	 * Looks up the atom recorded at a character position of the parsed SMILES.
	 *
	 * @param i character index
	 * @return id of atom starting at character i, or null if none is recorded
	 *         or i is out of range
	 */
	String getAtomIdAtChar(int i) {
		if (atomIdList == null || i < 0 || i >= atomIdList.size()) {
			return null;
		}
		return (String) atomIdList.get(i);
	}
    
    /**
     * Looks up the atom text recorded at a character position of the parsed SMILES.
     *
     * @param i character index
     * @return text of the atom starting at character i, or null if none is
     *         recorded or i is out of range
     */
    String getAtomChunkAtChar(int i) {
        if (atomChunkList == null || i < 0 || i >= atomChunkList.size()) {
            return null;
        }
        return (String) atomChunkList.get(i);
    }
        
//    /**
//     * @param i
//     * @return id of bond starting at character i
//     */
//    String getBondIdAtChar(int i) {
//    	return (bondIdList == null || bondIdList.size() <= i) ? null : bondIdList.get(i);
//    }
//    
//    /**
//     * @param i
//     * @return id of bond starting at character i
//     */
//    String getBondChunkAtChar(int i) {
//    	return (bondChunkList == null || bondChunkList.size() <= i) ? null : bondChunkList.get(i);
//    }
//        
//    /**
//     * @param i
//     * @return id of ring starting at character i
//     */
//    String getRingIdAtChar(int i) {
//    	return (ringIdList == null || ringIdList.size() <= i) ? null : ringIdList.get(i);
//    }
//    
//    /**
//     * @param i
//     * @return id of ring starting at character i
//     */
//    String getRingChunkAtChar(int i) {
//    	return (ringChunkList == null || ringChunkList.size() <= i) ? null : ringChunkList.get(i);
//    }
        
    /**
     * Expands every repeat group of the form {@code {unit}*(count)} by
     * writing the (recursively expanded) unit count times in its place.
     *
     * @param s string possibly containing repeat groups
     * @return the string with all repeat groups expanded
     * @throws RuntimeException on unbalanced braces or a malformed count
     */
    private String expandString(String s) {
        String expanded = s;
        int open = expanded.indexOf(C_LCURLY);
        while (open != -1) {
            final String head = expanded.substring(0, open);
            final String fromBrace = expanded.substring(open);
            final int close = Util.indexOfBalancedBracket(C_LCURLY, fromBrace);
            if (close == -1) {
                throw new RuntimeException("Unbalanced {}");
            }
            final String afterGroup = fromBrace.substring(close + 1);
            if (!afterGroup.startsWith(S_STAR+S_LBRAK)) {
                throw new RuntimeException("expected * count after }");
            }
            final String countAndTail = afterGroup.substring(2);
            final int countEnd = countAndTail.indexOf(S_RBRAK);
            if (countEnd == -1) {
                throw new RuntimeException("Unbalanced brackets round count");
            }
            // the count is parsed before the unit is expanded, as before
            final int count = Integer.parseInt(countAndTail.substring(0, countEnd));
            final String unit = expandString(fromBrace.substring(1, close));

            final StringBuilder rebuilt = new StringBuilder(head);
            for (int k = 0; k < count; k++) {
                rebuilt.append(unit);
            }
            rebuilt.append(countAndTail.substring(countEnd + 1));
            expanded = rebuilt.toString();
            open = expanded.indexOf(C_LCURLY);
        }
        return expanded;
    }
    
    /**
     * Adjusts hydrogen counts to valency, expands implicit hydrogens according
     * to the current hydrogenControl, flags the molecule with
     * cmlx:explicitHydrogens="true" and finally strips the hydrogenCount
     * attribute from every atom.
     */
    private void addHydrogens() {
        final MoleculeTool moleculeTool = MoleculeTool.getOrCreateTool(molecule);
        moleculeTool.adjustHydrogenCountsToValency(hydrogenControl);
        // (a former per-atom decrement of aromatic hydrogen counts is disabled)
        // make all H explicit
        moleculeTool.expandImplicitHydrogens(hydrogenControl);
        molecule.addNamespaceDeclaration("cmlx", "http://www.xml-cml.org/schema/cmlx");
        molecule.addAttribute(new Attribute("cmlx:explicitHydrogens", "http://www.xml-cml.org/schema/cmlx", "true"));
        // remove all hydrogenCounts
        for (CMLAtom atom : molecule.getAtoms()) {
            final Attribute hcount = atom.getAttribute("hydrogenCount");
            if (hcount != null) {
                hcount.detach();
            }
        }
    }

	/**
	 * Sets the order of every bond whose two atoms both carry the parser's
	 * aromatic="true" attribute to {@code CMLBond.AROMATIC}.
	 */
	private void makeAromaticBonds() {
		for (CMLBond bond : molecule.getBonds()) {
			final CMLAtom first = bond.getAtom(0);
			final CMLAtom second = bond.getAtom(1);
			final boolean firstAromatic = TRUE.equals(first.getAttributeValue(AROMATIC));
			if (!firstAromatic) {
				continue;
			}
			if (!TRUE.equals(second.getAttributeValue(AROMATIC))) {
				continue;
			}
			bond.setOrder(CMLBond.AROMATIC);
		}
	}
	
//	private void convertToKekule() {
//	}

	/*
	 * Returns the one- or two-letter atom symbol at the start of s.
	 * Outside square brackets only B, C, N, O, P, S, F, Cl, Br, I and the
	 * aromatic c, n, o, p, s are accepted; anything else is rejected.
	 */
	private String grabOrganicAtom(final String s) {
		if (s.length() == 0) {
			throw new RuntimeException("empty element symbol");
		}
		final char first = s.charAt(0);
		final String oneLetter = s.substring(0, 1);

		if (Character.isLowerCase(first)) {
			switch (first) {
				case C_c:
				case C_n:
				case C_o:
				case C_p:
				case C_s:
					return oneLetter;
				default:
					throw new RuntimeException("element may not start with lowercase: "+s);
			}
		}

		// two-letter halogens must be tested before the one-letter C and B
		if (s.startsWith(CL) || s.startsWith(BR)) {
			return s.substring(0, 2);
		}

		final boolean oneLetterOrganic =
				s.startsWith(B) || s.startsWith(C) || s.startsWith(N) || s.startsWith(O) ||
				s.startsWith(P) || s.startsWith(S) || s.startsWith(F) || s.startsWith(I);
		if (!oneLetterOrganic) {
			throw new RuntimeException("Unknown element encountered: "+oneLetter +
					" Only organic elements may appear outside square brackets: "+s);
		}
		return oneLetter;
	}
    
    
	/*
	 * Returns the one- or two-letter atom symbol at the start of a bracket
	 * atom's text. Lowercase (aromatic) symbols are limited to as, se, c, n,
	 * o, p, s. The symbol is checked against the ChemicalElement table
	 * (daylight doesn't do this); the S_STAR symbol maps to the R pseudo-element.
	 */
	private String grabAtom(final String s) {
		if (s.length() == 0) {
			throw new RuntimeException("empty element symbol");
		}
		final char first = s.charAt(0);
		final String oneLetter = s.substring(0, 1);

		final String token;   // text as written; this is what the caller consumes
		final String symbol;  // capitalised form used for the element lookup
		if (!Character.isLowerCase(first)) {
			final boolean twoLetters = s.length() == 2 && Character.isLowerCase(s.charAt(1));
			token = twoLetters ? s.substring(0, 2) : oneLetter;
			symbol = token;
		} else if (s.startsWith(S_as) || s.startsWith(S_se)) {
			symbol = oneLetter.toUpperCase() + s.charAt(1);
			token = s.substring(0, 2);
		} else if (first == C_c || first == C_n || first == C_o || first == C_p || first == C_s) {
			symbol = oneLetter.toUpperCase();
			token = oneLetter;
		} else {
			throw new RuntimeException("element may not start with lowercase: "+s);
		}

		if (symbol.equals(S_STAR)) {
			return ChemicalElement.AS.R.value;
		}
		if (ChemicalElement.getChemicalElement(symbol) == null) {
			throw new RuntimeException("Unknown element: "+token);
		}
		return token;
	}


// bracket atom : '[' isotope? symbol (chirality | hcount | charge)* ']'
// Note that chiral, hcount and sign/charge can appear in any order technically

    /**
     * Adds the atom described by the text between square brackets.
     *
     * The atom is created first with a dummy element type (which also bonds
     * it to the current atom), then the isotope, element symbol, chirality,
     * hydrogen count and formal charge are read from s. A chiral atom gets an
     * atomParity child whose atomRefs4 start with the preceding atom and/or
     * the atom's own first hydrogen; the remaining slots are filled as
     * neighbours are bonded.
     *
     * @param s text between '[' and ']'
     * @param slashChar pending directional bond character, or C_NONE
     * @param atomStartChar index of the '[' in rawSmiles
     * @param atomString text recorded for this atom in the per-character table
     * @return the new atom
     * @throws RuntimeException on an empty/unknown element or an unexpected character
     */
    private CMLAtom addExtendedAtom(final String s, final char slashChar, int atomStartChar, String atomString) {
        // create atom with dummy elementType
        final CMLAtom atom = addAtom(DU, slashChar, rawSmiles, atomStartChar, atomString);
        final int l = s.length();
        int i = 0;
        // isotope: leading digits; bounded so "[]" or "[12]" cannot run off the end
        while (i < l && Character.isDigit(s.charAt(i))) {
            i++;
        }
        if (i != 0) {
            atom.setIsotopeNumber(Integer.parseInt(s.substring(0, i)));
        }
        // elementType: grabAtom rejects an empty or unknown symbol
        final String elementType = grabAtom(s.substring(i, Math.min(l, i+2)));
        setElementType(atom, elementType);
        i += elementType.length();

        int hydrogenCount = 0;
        int charge = 0;
        String chiral = "";
        while (i < l) {
            final char ch = s.charAt(i);
            if (s.startsWith(S_ATAT, i)) {
                // chirality
                atom.setAttribute(CHIRAL, S_ATAT);
                chiral = S_ATAT;
                i += 2;
            } else if (s.startsWith(S_AT, i)) {
                if (Pattern.matches(S_AT +"[A-Z][A-Z]\\d\\S*", s.substring(i))) {
                    // e.g. @TH1: skipped, no parity recorded
                    i += 4;
                    LOG.warn("Currently unsupported chiral specification");
                } else {
                    atom.setAttribute(CHIRAL, S_AT);
                    chiral = S_AT;
                    i++;
                }
            } else if (ch == 'H') {
                // hydrogenCount: H alone means one, otherwise H followed by digits
                i++;
                final int startOfCount = i;
                while (i < l && Character.isDigit(s.charAt(i))) {
                    i++;
                }
                hydrogenCount = (i > startOfCount) ? Integer.parseInt(s.substring(startOfCount, i)) : 1;
            } else if (ch == C_PLUS) {
                // formalCharge: + or ++.. or +n
                charge++;
                i++;
                while (i < l) {
                    final char sign = s.charAt(i);
                    if (sign == C_PLUS) {
                        charge++;
                        i++;
                    } else if (Character.isDigit(sign)) {
                        charge = sign - C_ZERO;
                        i++;
                        break;
                    } else {
                        break;
                    }
                }
            } else if (ch == C_MINUS) {
                // formalCharge: - or --.. or -n
                charge--;
                i++;
                while (i < l) {
                    final char sign = s.charAt(i);
                    if (sign == C_MINUS) {
                        charge--;
                        i++;
                    } else if (Character.isDigit(sign)) {
                        charge = -(sign - C_ZERO);
                        i++;
                        break;
                    } else {
                        break;
                    }
                }
            } else {
                throw new RuntimeException("Invalid symbol found in atom description, found: "+s.charAt(i)+" in "+smilescopy );
            }
        }

        atom.setFormalCharge(charge);
        atom.setHydrogenCount(hydrogenCount);

        // CML parity tag added if applicable
        if (!chiral.isEmpty()) {
            final CMLAtomParity atomParity = new CMLAtomParity();
            final String[] atomRefs4 = new String[]{"","","",""};

            // the preceding atom comes first, then the atom's own hydrogen
            int slot = 0;
            if (lastAtom != null) {
                atomRefs4[slot++] = lastAtom.getId();
            }
            if (hydrogenCount >= 1) {
                atomRefs4[slot] = atom.getId() + "_h1";
            }

            atomParity.setAtomRefs4(atomRefs4);
            if (chiral.equals(S_ATAT)) {
                atomParity.setXMLContent("1");
            } else if (chiral.equals(S_AT)) {
                atomParity.setXMLContent("-1");
            }
            atom.appendChild(atomParity);
        }

        return atom;
    }

    /**
     * Sets the element type of an atom from a one- or two-character SMILES
     * symbol. A symbol starting with a lowercase letter marks the atom with
     * the parser's aromatic="true" attribute and is stored with its first
     * letter capitalised.
     *
     * @param atom atom to update
     * @param elementType symbol as written in the SMILES (1 or 2 characters)
     * @throws RuntimeException if the symbol is not 1 or 2 characters long
     */
    private void setElementType(final CMLAtom atom, String elementType) {
        final int length = elementType.length();
        if (length < 1 || length > 2) {
            throw new RuntimeException("Element of wrong length :"+elementType+":");
        }
        if (Character.isLowerCase(elementType.charAt(0))) {
            atom.setAttribute(AROMATIC, TRUE);
            elementType = elementType.substring(0,1).toUpperCase() + elementType.substring(1);
        }
        atom.setElementType(elementType);
    }

    /**
     * Creates the next "a&lt;n&gt;" atom, adds it to the molecule, records it in the
     * per-character tables, bonds it to the current atom (if any) using the
     * pending bond character, and makes it the current atom.
     *
     * @param elementType symbol as written in the SMILES
     * @param slashChar pending directional bond character, or C_NONE
     * @param rawSmiles passed through to addBond
     * @param atomStartChar character index at which the atom starts
     * @param atomString text recorded for this atom
     * @return the new atom
     */
    private CMLAtom addAtom(String elementType, final char slashChar, String rawSmiles, int atomStartChar, String atomString) {
        final CMLAtom atom = new CMLAtom("a"+(++natoms));
        molecule.addAtom(atom);
        setElementType(atom, elementType);
        if (slashChar != C_NONE) {
            atom.setAttribute(SLASH, ""+slashChar);
        }
        // overwrite the pre-filled slot rather than insert, so the tables keep one entry per character
        atomIdList.set(atomStartChar, atom.getId());
        atomChunkList.set(atomStartChar, atomString);
        if (currentAtom != null) {
            addBond(currentAtom, atom, bondChar, rawSmiles);
        }
        currentAtom = atom;
        return atom;
    }

    
    /**
     * Creates a bond between two atoms, adds it to the molecule and sets its
     * order from the SMILES bond character (C_NONE is treated as single).
     * If either atom is marked chiral, the other atom's id is written into
     * the first free atomRefs4 slot of its atomParity.
     *
     * @param currentAtom first atom of the bond
     * @param atom second atom of the bond
     * @param bondChar C_NONE, C_SINGLE, C_DOUBLE, C_TRIPLE or C_AROMATIC
     * @param rawSmiles not used by this method
     * @return the new bond
     * @throws RuntimeException for any other bond character
     */
    private CMLBond addBond(final CMLAtom currentAtom,
    		final CMLAtom atom, char bondChar, String rawSmiles) {
        final CMLBond bond = new CMLBond(currentAtom, atom);

        if (currentAtom.getAttributeValue(CHIRAL) != null) {
            addParityLigand(currentAtom, atom.getId());
        }
        if (atom.getAttributeValue(CHIRAL) != null) {
            addParityLigand(atom, currentAtom.getId());
        }

        molecule.addBond(bond);
        switch (bondChar) {
            case C_NONE:
            case C_SINGLE:
                bond.setOrder(CMLBond.SINGLE_S);
                break;
            case C_DOUBLE:
                bond.setOrder(CMLBond.DOUBLE_D);
                break;
            case C_TRIPLE:
                bond.setOrder(CMLBond.TRIPLE_T);
                break;
            case C_AROMATIC:
                bond.setOrder(CMLBond.AROMATIC);
                break;
            default:
                throw new RuntimeException("Unknown currentBond type :"+bondChar+":");
        }
        return bond;
    }

    /**
     * Writes ligandId into the first empty atomRefs4 slot of every atomParity
     * child of chiralAtom.
     */
    private static void addParityLigand(final CMLAtom chiralAtom, final String ligandId) {
        final CMLElements<CMLAtomParity> parities = chiralAtom.getAtomParityElements();
        for (CMLAtomParity parity : parities) {
            final String[] atomRefs4 = parity.getAtomRefs4();
            for (int i = 0; i < atomRefs4.length; i++) {
                // compare content, not identity: the array need not hold the "" literal itself
                if ("".equals(atomRefs4[i])) {
                    atomRefs4[i] = ligandId;
                    break;
                }
            }
            parity.setAtomRefs4(atomRefs4);
        }
    }

    /**
     * Crude writer: hands the current molecule to a {@code SMILESWriter}.
     *
     * @return the string produced by the writer
     */
    public String write() {
    	return new SMILESWriter(molecule).getString();
    }


    
    /**
     * normalizes ring numbers in SMILES to be as low as possible
     * crude. assumes less than 9 rings open at any time; only single-digit
     * ring numbers are handled (no %nn).
     * Bracket atoms are copied through verbatim, so digits inside them
     * (isotopes, counts, charges) are never taken for ring numbers.
     *
     * @param s SMILES string
     * @return SMILES with ring numbers renumbered
     * @throws RuntimeException if a '[' has no balancing ']'
     */
    public static String normalizeRings(String s) {
    	final int[] start = new int[10];
    	final int[] end = new int[10];
    	final boolean[] inring = new boolean[10];
    	for (int r = 0; r < 10; r++) {
    		start[r] = -1;
    		end[r] = -1;
    		inring[r] = false;
    	}
    	// sb mirrors s character for character, so positions recorded from i are valid in sb
    	final StringBuilder sb = new StringBuilder();
    	int i = 0;
    	while (i < s.length()) {
    		final char c = s.charAt(i);
    		if (c == C_LSQUARE) {
    			// idx is the offset of the balancing ']' relative to i
    			final int idx = Util.indexOfBalancedBracket(C_LSQUARE, s.substring(i));
    			if (idx == -1) {
    				throw new RuntimeException("No balanced []");
    			}
    			// copy "[...]" inclusive, then step past it
    			sb.append(s, i, i + idx + 1);
    			i += idx + 1;
    		} else if (c >= '0' && c <= '9') {
    			final int currentRingNumber = c - '0';
    			sb.append(c);
    			if (inring[currentRingNumber]) {
    				// ring closure
    				inring[currentRingNumber] = false;
    				end[currentRingNumber] = i;
    				if (start[currentRingNumber] < 0) {
    					throw new RuntimeException("start not set for "+currentRingNumber);
    				}
    				// move the ring to the lowest number that is unused, or whose
    				// previous ring closed before this one opened
    				for (int ring = 1; ring < currentRingNumber; ring++) {
    					final boolean neverUsed = end[ring] < 0 && start[ring] < 0;
    					final boolean closedBefore = start[ring] > 0 && end[ring] > 0
    							&& end[ring] < start[currentRingNumber];
    					if (neverUsed || closedBefore) {
    						start[ring] = start[currentRingNumber];
    						end[ring] = end[currentRingNumber];
    						sb.setCharAt(start[ring], (char) ('0'+ring));
    						sb.setCharAt(end[ring], (char) ('0'+ring));
    						start[currentRingNumber] = -1;
    						end[currentRingNumber] = -1;
    						break;
    					}
    				}
    			} else {
    				// ring opening
    				inring[currentRingNumber] = true;
    				start[currentRingNumber] = i;
    			}
    			i++;
    		} else {
    			sb.append(c);
    			i++;
    		}
    	}
    	return sb.toString();
    }
    
	/**
	 * Returns the molecule, first calling setNormalizedBondOrders() on it when
	 * one is present.
	 *
	 * @return the molecule, or null if none is set
	 */
	public CMLMolecule getMolecule() {
		if (molecule == null) {
			return null;
		}
		molecule.setNormalizedBondOrders();
		return molecule;
	}
	/**
	 * Replaces the molecule held by this tool.
	 *
	 * @param molecule the molecule to set
	 */
	public void setMolecule(CMLMolecule molecule) {
		this.molecule = molecule;
	}

    /**
	 * Returns the hydrogen handling applied when hydrogens are added after parsing.
	 *
	 * @return the hydrogenControl
	 */
	public HydrogenControl getHydrogenControl() {
		return this.hydrogenControl;
	}

	/**
	 * Sets the hydrogen handling applied when hydrogens are added after parsing.
	 *
	 * @param hydrogenControl the hydrogenControl to set
	 */
	public void setHydrogenControl(HydrogenControl hydrogenControl) {
		this.hydrogenControl = hydrogenControl;
	}

	/**
	 * Convenience: parses a SMILES string with a fresh tool.
	 *
	 * @param smilesString SMILES to parse
	 * @return the parsed molecule, or null if the string is null or blank
	 */
	public static CMLMolecule createMolecule(String smilesString) {
		if (smilesString == null || smilesString.trim().length() == 0) {
			return null;
		}
		final SMILESTool parser = new SMILESTool();
		parser.parseSMILES(smilesString);
		return parser.getMolecule();
	}

}	




© 2015 - 2025 Weber Informatics LLC | Privacy Policy