All downloads are free. Search and download functionality uses the official Maven repository.

org.xmlcml.cml.chemdraw.CDXML2CMLProcessor Maven / Gradle / Ivy

The newest version!
/* Copyright (C) 2001 Peter Murray-Rust ([email protected])
 *               Copyright 2011 Peter Murray-Rust et. al.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.xmlcml.cml.chemdraw;

import static org.xmlcml.cml.base.CMLConstants.CMLXSD_ID;
import static org.xmlcml.cml.base.CMLConstants.CML_XPATH;
import static org.xmlcml.cml.chemdraw.CDXConstants.ATT_BOUNDING_BOX;
import static org.xmlcml.cml.chemdraw.CDXConstants.ATT_FONTSIZE;
import static org.xmlcml.cml.chemdraw.CDXConstants.ATT_POINT;
import static org.xmlcml.cml.chemdraw.CDXConstants.ATT_YDELTA;
import static org.xmlcml.cml.chemdraw.CDXConstants.CDX_NAMESPACE;
import static org.xmlcml.cml.chemdraw.CDXConstants.CDX_PREFIX;
import static org.xmlcml.cml.chemdraw.CDXConstants.CDX_YDELTA;
import static org.xmlcml.cml.chemdraw.CDXConstants.MAX_ATOM_LABEL_FONT_SIZE;
import static org.xmlcml.cml.chemdraw.CDXConstants.MAX_MOLECULE_LABEL_FONT_SIZE;
import static org.xmlcml.cml.chemdraw.CDXConstants.MAX_MOLECULE_TO_LABEL_YDELTA;
import static org.xmlcml.cml.chemdraw.CDXConstants.MIN_MOLECULE_LABEL_FONT_SIZE;
import static org.xmlcml.cml.chemdraw.CDXConstants.MIN_MOLECULE_TO_LABEL_YDELTA;
import static org.xmlcml.cml.chemdraw.CDXConstants.TEMP_TEXT;
import static org.xmlcml.euclid.EuclidConstants.S_EMPTY;
import static org.xmlcml.euclid.EuclidConstants.S_SPACE;

import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.xmlcml.cml.base.CMLConstants;
import org.xmlcml.cml.base.CMLElement;
import org.xmlcml.cml.base.CMLElement.CoordinateType;
import org.xmlcml.cml.base.CMLUtil;
import org.xmlcml.cml.chemdraw.components.CDXColorTable;
import org.xmlcml.cml.chemdraw.components.CDXFontTable;
import org.xmlcml.cml.chemdraw.components.CDXList;
import org.xmlcml.cml.chemdraw.components.CDXML;
import org.xmlcml.cml.chemdraw.components.CDXObject;
import org.xmlcml.cml.chemdraw.components.CDXPage;
import org.xmlcml.cml.chemdraw.components.CDXReactionStep;
import org.xmlcml.cml.chemdraw.components.CDXText;
import org.xmlcml.cml.chemdraw.components.CDXUtil;
import org.xmlcml.cml.element.CMLAtom;
import org.xmlcml.cml.element.CMLBond;
import org.xmlcml.cml.element.CMLBondStereo;
import org.xmlcml.cml.element.CMLCml;
import org.xmlcml.cml.element.CMLLabel;
import org.xmlcml.cml.element.CMLMolecule;
import org.xmlcml.cml.element.CMLMolecule.HydrogenControl;
import org.xmlcml.cml.element.CMLMoleculeList;
import org.xmlcml.cml.element.CMLReaction;
import org.xmlcml.cml.tools.AtomSetTool;
import org.xmlcml.cml.tools.GeometryTool;
import org.xmlcml.cml.tools.MoleculeTool;
import org.xmlcml.euclid.EuclidRuntimeException;
import org.xmlcml.euclid.Real2;
import org.xmlcml.euclid.Real2Range;
import org.xmlcml.euclid.RealRange;
import org.xmlcml.euclid.Transform2;

import nu.xom.Attribute;
import nu.xom.Element;
import nu.xom.Node;
import nu.xom.Nodes;
import nu.xom.ParentNode;

/**
 * Attempts to convert a CDXML file to CML.
 * Since CDX has many graphics primitives that CML does not, some
 * graphics semantics may be lost. Similarly, CDX does not support various chemical
 * semantics in CML, so heuristics are used.
 * 
 * @author pm286
 *
 */
public class CDXML2CMLProcessor {

	/** Logger for this class; the static initializer below sets its level to INFO. */
	final static Logger LOG = Logger.getLogger(CDXML2CMLProcessor.class);
	static {
		LOG.setLevel(Level.INFO);
	}
	
    /** Numerator of the rescaling factor computed in scale(): BOND_LENGTH / average bond length. */
    final static double BOND_LENGTH = 2.0;
    /** Object tree returned by XMLToCDXMLConverter in convertParsedXMLToCML(). */
    private CDXObject rootCDXObject;
	/** rootCDXObject cast to CDXList (conversion requires the root to be a CDXList). */
	private CDXList cdxList;
	/** The page selected by convertCDXML(); null when no page was found. */
	private CDXObject page;
    /** Conversion result; created in tidyToCML() and returned by getCML(). */
    private CMLCml cmlCml = null;
    
    /** When true, tidyToCML() runs the flattening and atom/connection-point clean-up steps. */
    private boolean cleanMolecules = true;
    // NOTE(review): 'flatten' is not read anywhere in the visible part of this file - confirm it is still needed.
    private boolean flatten = true;
    /** When true, scale() rescales coordinates by BOND_LENGTH / average bond length; otherwise only flips y. */
    private boolean rescale = true;
    /** Flag consulted by tidyToCML() before calling removeCDXAttributes(); see the getter/setter. */
    private boolean removeCDXAttributes = true;
    
    /**
     * @return the current value of the removeCDXAttributes flag
     */
    public boolean isRemoveCDXAttributes() {
		return removeCDXAttributes;
	}

	/**
	 * Sets the removeCDXAttributes flag.
	 *
	 * @param removeCDXAttributes new value of the flag
	 */
	public void setRemoveCDXAttributes(boolean removeCDXAttributes) {
		this.removeCDXAttributes = removeCDXAttributes;
	}

	/**
	 * Creates a processor with the default flag values declared on the fields.
	 */
	public CDXML2CMLProcessor() {
    	
    }
    
    /**
     * Converts an already-parsed CDXML document into CML.
     *
     * The XML is first turned into a CDXObject tree whose root must be a
     * CDXList. The CDXML child of that list is then located and converted;
     * the result (if any) is available from {@link #getCML()}.
     *
     * Failures inside the conversion are logged and not rethrown, so a
     * failed conversion leaves {@link #getCML()} at whatever state had been
     * reached.
     *
     * @param cdxml root element of the parsed CDXML input
     */
    public void convertParsedXMLToCML(Element cdxml) {
        XMLToCDXMLConverter xml2cdxmlConverter = new XMLToCDXMLConverter();
        rootCDXObject = (CDXObject) xml2cdxmlConverter.convertToCDXObject(cdxml);

        try {
            if (!(rootCDXObject instanceof CDXList)) {
                throw new RuntimeException("expected cdxList as root element");
            }
            cdxList = (CDXList) rootCDXObject;
            cdxList.setChemDrawConverterRecursively(this);

            // use a dedicated local rather than overwriting the parameter
            Element cdxmlElement = null;
            int childCount = cdxList.getChildElements().size();
            if (childCount == 0) {
                LOG.warn("cdxList has no children");
            }
            for (int j = 0; j < childCount; j++) {
                CDXObject obj = (CDXObject) cdxList.getChildElements().get(j);
                if (obj instanceof CDXML) {
                    // take the child that actually matched, not blindly child 0
                    // (child 0 may be an uninterpreted "object")
                    cdxmlElement = (CDXML) obj;
                } else if ("object".equals(obj.getLocalName())) {
                    LOG.error("*********** Uninterpreted object in cdxList");
                } else {
                    throw new RuntimeException("unexpected child of cdxList: " + obj.getLocalName());
                }
            }
            if (cdxmlElement == null) {
                LOG.error("Cannot find CDXML element");
            } else {
                convertCDXML(cdxmlElement);
            }
        } catch (Exception e) {
            // best-effort conversion: report through the logger (with stack trace) and carry on
            LOG.error("Cannot convert CDXML to CML", e);
        }
	}
    
    /**
     * @return the cml element built by the last conversion; null if tidyToCML()
     *         has not (yet) created one
     */
    public CMLElement getCML() {
    	return cmlCml;
    }
    
	/**
	 * Finds the single page among the children of the CDXML element and
	 * converts it to CML.
	 *
	 * Color and font tables are skipped; nested CDXML and "object" children
	 * are reported and ignored. Nothing is converted when no page is present.
	 *
	 * @param cdxml the CDXML element whose children are examined
	 * @throws RuntimeException if more than one page is found or an
	 *         unexpected child type is encountered
	 */
	private void convertCDXML(Element cdxml) {
		int pageCount = 0;
		page = null;
		int childCount = cdxml.getChildElements().size();
		for (int i = 0; i < childCount; i++) {
			CDXObject child = (CDXObject) cdxml.getChildElements().get(i);
			if (child instanceof CDXColorTable || child instanceof CDXFontTable) {
				// tables carry no page content
				continue;
			}
			if (child instanceof CDXPage) {
				if (pageCount > 0) {
					throw new RuntimeException("Only one page allowed");
				}
				pageCount++;
				// use the matched child itself; child 0 may be a color or font table
				page = child;
			} else if (child instanceof CDXML) {
				LOG.error("**************Cannot process nested CDXML");
			} else if (child.getLocalName().equals("object")) {
				LOG.error("Unexpected CDXML child 'object'");
			} else {
				throw new RuntimeException("Unexpected CDXML child: " + child.getLocalName());
			}
		}
		if (page != null) {
			tidyToCML();
		}
	}
    
	/**
	 * Builds the CML result from the current page and runs the tidying passes.
	 *
	 * A new cml element is created, the page is asked to emit its content
	 * into it, and the result is then post-processed in a fixed order
	 * (labels, hydrogens, reactions, hierarchy clean-up, R-groups, optional
	 * attribute removal and molecule clean-up, coordinate flip/rescale, id
	 * normalisation, grouping, removal of empty groups). The order of the
	 * passes is significant because later passes query the tree left by
	 * earlier ones.
	 */
	private void tidyToCML() {
		// result is a cml object
		cmlCml = new CMLCml();
		cmlCml.addNamespaceDeclaration(CDX_PREFIX, CDX_NAMESPACE);
		expandFontInfo(page);
		// main explorations of content
		page.process2CML(cmlCml);
		// tidying
		CDXML2CMLProcessor.addLabelsToMolecules(cmlCml);
		addHydrogenAtomsToMolecules();
		this.processReactionsNew();
		cleanRedundantHierarchy();
		transformRGroups();
		addLabelsToAtoms();

		// honour the configurable flag; it used to be forced to true here,
		// which made setRemoveCDXAttributes(false) ineffective
		if (removeCDXAttributes) {
			removeCDXAttributes();
		}
		if (cleanMolecules) {
			flattenGroupingElements();
			removeAtomsWithChildrenExcludingLabels();
			removeAtomsWithoutElementTypeOrCoordinates();
			cleanExternalConnectionPoints();
		}
		flipAndRescaleMolecules();
		if (LOG.isDebugEnabled()) {
			CMLUtil.debug(cmlCml, "==cmlCML==");
		}
		ensureXMLIds(cmlCml);
		groupMultipleMolecules();
		removeDeadCDXObjects();
	}
    
	/**
	 * Detaches every element named 'group' (any namespace) whose string
	 * value is empty.
	 */
	private void removeDeadCDXObjects() {
		Nodes emptyGroups = cmlCml.query("//*[local-name()='group' and (.='')]");
		final int total = emptyGroups.size();
		for (int idx = 0; idx < total; idx++) {
			Node dead = emptyGroups.get(idx);
			dead.detach();
		}
	}

	/**
	 * When the cml element has more than one direct molecule child, moves
	 * all of them into a new moleculeList appended to the cml element.
	 */
	private void groupMultipleMolecules() {
		Nodes topMolecules = cmlCml.query("cml:molecule", CMLConstants.CML_XPATH);
		final int count = topMolecules.size();
		if (count <= 1) {
			// zero or one molecule: nothing to group
			return;
		}
		CMLMoleculeList wrapper = new CMLMoleculeList();
		for (int idx = 0; idx < count; idx++) {
			CMLMolecule current = (CMLMolecule) topMolecules.get(idx);
			current.detach();
			wrapper.addMolecule(current);
		}
		cmlCml.appendChild(wrapper);
	}

	/**
	 * Passes the value of every 'id' attribute under the given element
	 * through CDXUtil.ensureXMLID and stores the result back.
	 *
	 * @param cmlCml element whose tree is searched for id attributes
	 */
	private void ensureXMLIds(CMLCml cmlCml) {
		Nodes idAttributes = cmlCml.query("//@id");
		final int total = idAttributes.size();
		for (int idx = 0; idx < total; idx++) {
			Attribute id = (Attribute) idAttributes.get(idx);
			id.setValue(CDXUtil.ensureXMLID(id.getValue()));
		}
	}

	/**
	 * For each node, hands its children to transferChildrenToParent and
	 * then detaches the node itself.
	 *
	 * @param nodes grouping elements to dissolve; each must be a CMLElement
	 */
	private void flattenGroupingElement(Nodes nodes) {
		final int total = nodes.size();
		int idx = 0;
		while (idx < total) {
			CMLElement grouping = (CMLElement) nodes.get(idx);
			transferChildrenToParent(grouping);
			grouping.detach();
			idx++;
		}
	}

     
     /**
      * Detaches every child element of an atom that is not a label
      * (child elements screw up some CML viewers).
      *
      * The query selects the children, not the atoms, so the nodes are
      * detached as plain nodes; casting them to CMLAtom would fail for any
      * child that is not itself an atom.
      */
     private void removeAtomsWithChildrenExcludingLabels() {
    	 if (cmlCml != null) {
	    	 Nodes atomChildren = cmlCml.query("//cml:atom/*[not(local-name()='label')]", CML_XPATH);
	    	 for (int i = 0; i < atomChildren.size(); i++) {
	    		 atomChildren.get(i).detach();
	    	 }
    	 }
     }
     
     /**
      * Detaches every atom that lacks an elementType, x2 or y2 attribute.
      * Does nothing when no cml result exists.
      */
     private void removeAtomsWithoutElementTypeOrCoordinates() {
         if (cmlCml == null) {
             return;
         }
         Nodes incompleteAtoms = cmlCml.query(
                 "//cml:atom[not(@elementType) or not(@x2) or not(@y2)]", CML_XPATH);
         final int total = incompleteAtoms.size();
         for (int idx = 0; idx < total; idx++) {
             Node incomplete = incompleteAtoms.get(idx);
             incomplete.detach();
         }
     }
     
     /**
      * Applies scale() to every molecule in the cml result.
      * Does nothing when no cml result exists.
      */
     private void flipAndRescaleMolecules() {
         if (cmlCml == null) {
             return;
         }
         Nodes allMolecules = cmlCml.query("//cml:molecule", CML_XPATH);
         final int total = allMolecules.size();
         for (int idx = 0; idx < total; idx++) {
             CMLMolecule current = (CMLMolecule) allMolecules.get(idx);
             scale(current);
         }
     }
     
     /**
      * Flips the y-coordinates of a molecule and, when rescaling is enabled,
      * scales it by BOND_LENGTH divided by its average 2D bond length.
      *
      * Only molecules without descendant molecules are transformed. A
      * RuntimeException raised while measuring or transforming is ignored
      * and leaves the molecule as it was at that point.
      *
      * @param molecule molecule to transform in place
      */
     private void scale(CMLMolecule molecule) {
         boolean hasNestedMolecules = molecule.query(".//cml:molecule", CML_XPATH).size() != 0;
         if (hasNestedMolecules) {
             return;
         }
         try {
             double averageBondLength =
                     MoleculeTool.getAverageBondLength(molecule, CoordinateType.TWOD, false);
             double factor;
             if (rescale) {
                 factor = BOND_LENGTH / averageBondLength;
             } else {
                 factor = 1.0;
             }
             // negative y factor mirrors the molecule in the x-axis
             double[] matrix = new double[] {
                 factor, 0.0,     0.0,
                 0.0,    -factor, 0.0,
                 0.0,    0.0,     1.0
             };
             Transform2 flipAndScale = new Transform2(matrix);
             MoleculeTool.transform(molecule, flipAndScale);
         } catch (RuntimeException ignored) {
             // no coordinates
         }
     }

     /**
      * Runs a ChemDrawReactionConverter over every reaction in the cml
      * result, then dumps the result when debug logging is enabled.
      */
     private void processReactions() {
         Nodes reactions = cmlCml.query("//cml:reaction", CML_XPATH);
         final int total = reactions.size();
         for (int idx = 0; idx < total; idx++) {
             CMLReaction current = (CMLReaction) reactions.get(idx);
             new ChemDrawReactionConverter(current, cmlCml).processAfterParsing();
         }
         if (LOG.isDebugEnabled()) {
             CMLUtil.debug(cmlCml, "cmlCml");
         }
     }
     
     /**
      * Hands every reaction in the cml result to
      * CDXReactionStep.processReactionStep.
      */
     private void processReactionsNew() {
         Nodes reactions = cmlCml.query("//cml:reaction", CML_XPATH);
         final int total = reactions.size();
         int idx = 0;
         while (idx < total) {
             CDXReactionStep.processReactionStep((CMLReaction) reactions.get(idx));
             idx++;
         }
     }
     
     /**
      * Calls addHydrogens() on every molecule in the cml result.
      */
     private void addHydrogenAtomsToMolecules() {
         Nodes allMolecules = cmlCml.query("//cml:molecule", CML_XPATH);
         final int total = allMolecules.size();
         for (int idx = 0; idx < total; idx++) {
             addHydrogens((CMLMolecule) allMolecules.get(idx));
         }
     }

    /**
     * Adjusts the molecule's hydrogen counts to valency and then adds
     * calculated 2D coordinates for its hydrogens, delegating to
     * MoleculeTool and GeometryTool respectively.
     *
     * @param molecule molecule to modify in place
     */
    private void addHydrogens(CMLMolecule molecule) {
	    MoleculeTool.adjustHydrogenCountsToValency(molecule, HydrogenControl.USE_HYDROGEN_COUNT);
        GeometryTool.addCalculatedCoordinatesForHydrogens(molecule, CoordinateType.TWOD, HydrogenControl.USE_EXPLICIT_HYDROGENS);
	}

	/**
	 * Re-types carbon atoms that carry a label child.
	 *
	 * Only the first label of each atom is inspected. A label of "R"
	 * turns the atom into element type "R" and removes the label; labels
	 * "C", "C-" and "C+" leave the atom untouched; any other label turns
	 * the atom into element type "R" but keeps the label.
	 */
     private void transformRGroups() {
         Nodes labelledCarbons = cmlCml.query("//cml:atom[@elementType='C' and cml:label]", CML_XPATH);
         final int total = labelledCarbons.size();
         for (int idx = 0; idx < total; idx++) {
             CMLAtom carbon = (CMLAtom) labelledCarbons.get(idx);
             CMLLabel firstLabel = carbon.getLabelElements().get(0);
             String text = firstLabel.getCMLValue();
             LOG.debug("LAB... "+text);
             if (text.equals("R")) {
                 carbon.setElementType("R");
                 firstLabel.detach();
                 continue;
             }
             boolean genuineCarbon = text.equals("C") || text.equals("C-") || text.equals("C+");
             if (!genuineCarbon) {
                 carbon.setElementType("R");
             }
         }
	 }
     
     /**
      * Moves small top-level labels onto their nearest atom.
      *
      * A label directly under the cml element is moved when it has a CDX
      * font-size attribute whose integer value does not exceed
      * MAX_ATOM_LABEL_FONT_SIZE and getNearestAtom() finds an atom for it.
      */
     private void addLabelsToAtoms() {
         Nodes topLevelLabels = cmlCml.query("/cml:cml/cml:label", CML_XPATH);
         final int total = topLevelLabels.size();
         for (int idx = 0; idx < total; idx++) {
             CMLLabel candidate = (CMLLabel) topLevelLabels.get(idx);
             String sizeText = candidate.getAttributeValue(ATT_FONTSIZE, CDX_NAMESPACE);
             if (sizeText == null) {
                 continue;
             }
             int size = Integer.parseInt(sizeText);
             if (size > MAX_ATOM_LABEL_FONT_SIZE) {
                 continue;
             }
             CMLAtom owner = getNearestAtom(candidate);
             if (owner == null) {
                 continue;
             }
             candidate.detach();
             owner.addLabel(candidate);
         }
	 }

     /**
      * Finds the atom, across all molecules in the cml result, whose 2D
      * position is closest to the label's CDX point attribute.
      *
      * The search is best-effort: if the point cannot be parsed or a
      * distance cannot be computed, the problem is logged and the closest
      * atom found up to that moment (possibly none) is returned.
      *
      * @param label label carrying the CDX point attribute
      * @return the closest atom, or null if the label has no point or no
      *         atom could be determined
      */
     private CMLAtom getNearestAtom(CMLLabel label) {
    	 CMLAtom closestAtom = null;
    	 String p = label.getAttributeValue(ATT_POINT, CDX_NAMESPACE);
    	 if (p == null) {
    		 return null;
    	 }
    	 try {
    		 double[] dd = org.xmlcml.euclid.Util.splitToDoubleArray(p);
    		 Real2 point = new Real2(dd);
    		 Nodes moleculeNodes = cmlCml.query("//cml:molecule", CML_XPATH);
    		 double closestDist = Double.MAX_VALUE;
    		 for (int i = 0; i < moleculeNodes.size(); i++) {
    			 CMLMolecule molecule = (CMLMolecule) moleculeNodes.get(i);
    			 CMLAtom atom = AtomSetTool.getNearestAtom(molecule, point);
    			 if (atom != null) {
    				 double dist = atom.getXY2().getDistance(point);
    				 if (dist < closestDist) {
    					 closestDist = dist;
    					 closestAtom = atom;
    				 }
    			 }
    		 }
    	 } catch (Exception e) {
    		 // keep the best-effort behaviour but make the failure visible
    		 LOG.warn("Cannot determine nearest atom for label at point: " + p, e);
    	 }
    	 return closestAtom;
	 }
     
     /**
      * Simplifies the grouping structure of the cml result.
      *
      * In order: dissolves moleculeLists holding exactly one molecule,
      * detaches moleculeLists holding none, dissolves a lone list matched
      * by the single-list query, detaches empty lists, and finally wraps
      * the direct molecule children in a new moleculeList when there is
      * more than one of them.
      *
      * @throws RuntimeException if the tree does not have exactly one
      *         top-level cml element
      */
     private void cleanRedundantHierarchy() {
         Nodes found = cmlCml.query("/cml:cml", CML_XPATH);
         if (found.size() != 1) {
             throw new RuntimeException("need exactly one toplevel cml");
         }
         CMLCml topCml = (CMLCml) found.get(0);

         // moleculeList with a single molecule is redundant
         Nodes singletonLists = topCml.query(
                 "//cml:moleculeList[count(cml:molecule)=1]", CML_XPATH);
         flattenGroupingElement(singletonLists);

         // moleculeList without molecules is dropped
         Nodes emptyMoleculeLists = topCml.query(
                 "//cml:moleculeList[count(cml:molecule)=0]", CML_XPATH);
         for (int k = 0; k < emptyMoleculeLists.size(); k++) {
             emptyMoleculeLists.get(k).detach();
         }

         // single top-level list
         // NOTE(review): this path is relative to the cml element, so it looks for a
         // cml child of cml rather than /cml/list - confirm that is intended
         Nodes loneLists = topCml.query(
                 "cml:cml[count(cml:list)=1 and count(*)=1]/cml:list", CML_XPATH);
         flattenGroupingElement(loneLists);

         // list without any children is dropped
         Nodes emptyLists = topCml.query(
                 "//cml:list[count(*)=0]", CML_XPATH);
         for (int k = 0; k < emptyLists.size(); k++) {
             emptyLists.get(k).detach();
         }

         // several direct molecules are gathered under one moleculeList
         Nodes directMolecules = topCml.query("./cml:molecule", CML_XPATH);
         if (directMolecules.size() > 1) {
             CMLMoleculeList wrapper = new CMLMoleculeList();
             topCml.appendChild(wrapper);
             for (int k = 0; k < directMolecules.size(); k++) {
                 Node moved = directMolecules.get(k);
                 moved.detach();
                 wrapper.addMolecule((CMLMolecule) moved);
             }
         }
     }
	
     /**
      * Removes CDX connection-point artefacts from the cml result.
      *
      * First, every atom whose NodeType attribute is 'ExternalConnectionPoint'
      * is detached. Then, for every atom whose NodeType is 'Fragment', the
      * bond that references the fragment atom is replaced by a bond (same
      * order and stereo) to the fragment's following sibling atom, and the
      * fragment atom is deleted. Fragments without a following sibling atom
      * are reported and skipped.
      *
      * NOTE(review): bonds are looked up only in the first molecule of the
      * result, although Fragment atoms are collected from the whole tree -
      * confirm this is intended.
      */
     private void cleanExternalConnectionPoints() {
         if (cmlCml == null) {
             return;
         }
         Nodes molecules = cmlCml.query("//cml:molecule", CML_XPATH);
         if (molecules.size() == 0) {
             return;
         }
         CMLMolecule molecule = (CMLMolecule) molecules.get(0);

         Nodes atoms = cmlCml.query(
                 "//cml:atom[@*[local-name()='NodeType' and .='ExternalConnectionPoint']]", CML_XPATH);
         for (int i = 0; i < atoms.size(); i++) {
             atoms.get(i).detach();
         }

         atoms = cmlCml.query(
                 "//cml:atom[@*[local-name()='NodeType' and .='Fragment']]", CML_XPATH);
         for (int i = 0; i < atoms.size(); i++) {
             // fragment id
             CMLAtom fragment = (CMLAtom) atoms.get(i);
             String fragmentId = fragment.getAttributeValue(CMLXSD_ID);

             // following sibling: checked explicitly instead of relying on an
             // index exception or an unchecked cast
             ParentNode atomArray = fragment.getParent();
             int iatom = atomArray.indexOf(fragment);
             int nextIndex = iatom + 1;
             if (nextIndex >= atomArray.getChildCount()) {
                 LOG.error("Cannot find neighboring atom "+iatom);
                 continue;
             }
             Node nextNode = atomArray.getChild(nextIndex);
             if (!(nextNode instanceof CMLAtom)) {
                 LOG.error("Neighbor of fragment atom is not an atom "+iatom);
                 continue;
             }
             CMLAtom nextAtom = (CMLAtom) nextNode;

             List<CMLBond> bonds = molecule.getBonds();
             for (CMLBond bond : bonds) {
                 String[] atomRefs2 = bond.getAtomRefs2();
                 // index (0 or 1) of the bond end that is NOT the fragment; -1 if unrelated
                 int otherAtomNo = -1;
                 if (atomRefs2[0].equals(fragmentId)) {
                     otherAtomNo = 1;
                 } else if (atomRefs2[1].equals(fragmentId)) {
                     otherAtomNo = 0;
                 }
                 if (otherAtomNo < 0) {
                     continue;
                 }
                 // capture everything needed before the old bond and atom are deleted
                 CMLAtom rootAtom = bond.getOtherAtom(fragment);
                 String order = bond.getOrder();
                 CMLBondStereo bondStereo = bond.getBondStereo();
                 molecule.deleteBond(bond);
                 molecule.deleteAtom(fragment);

                 // keep the original end ordering of the bond
                 CMLAtom atom0 = rootAtom;
                 CMLAtom atom1 = nextAtom;
                 if (otherAtomNo == 1) {
                     atom1 = rootAtom;
                     atom0 = nextAtom;
                 }
                 CMLBond replacement = new CMLBond(atom0, atom1);
                 replacement.setOrder(order);
                 if (bondStereo != null) {
                     replacement.addBondStereo(bondStereo);
                 }
                 molecule.addBond(replacement);
                 // the bond list was modified; stop iterating it
                 break;
             }
         }
     }
     
 	 /**
 	  * Detaches the attributes named B, BondCircularOrdering, BS, Display,
 	  * Display2, DoublePosition, E, Order and Z from every element of the
 	  * cml result. Does nothing when no cml result exists.
 	  */
 	 private void removeCDXAttributes() {
 		 if (cmlCml == null) {
 			 return;
 		 }
 		 String attributeUnion =
 				 "//*/@B | //*/@BondCircularOrdering | //*/@BS | "
 				 + "//*/@Display | //*/@Display2 | //*/@DoublePosition | "
 				 + "//*/@E | //*/@Order | //*/@Z";
 		 Nodes cdxAttributes = cmlCml.query(attributeUnion, CML_XPATH);
 		 final int total = cdxAttributes.size();
 		 for (int idx = 0; idx < total; idx++) {
 			 Node attribute = cdxAttributes.get(idx);
 			 attribute.detach();
 		 }
 	 }
     
     /**
      * Dissolves all moleculeList elements and then all list elements of
      * the cml result via flattenGroupingElement(). The list query is run
      * after the moleculeLists have been dissolved. Does nothing when no
      * cml result exists.
      */
     private void flattenGroupingElements() {
         if (cmlCml == null) {
             return;
         }
         Nodes moleculeLists = cmlCml.query("//cml:moleculeList", CML_XPATH);
         flattenGroupingElement(moleculeLists);
         Nodes plainLists = cmlCml.query("//cml:list", CML_XPATH);
         flattenGroupingElement(plainLists);
     }

     
     /**
      * Calls addFontInfoFromTempText() on every 't' descendant of the page
      * that carries the TEMP_TEXT attribute.
      *
      * @param page page whose text elements are processed
      */
     private void expandFontInfo(CDXObject page) {
         String textQuery = ".//t[@"+TEMP_TEXT+"]";
         Nodes tempTexts = page.query(textQuery);
         final int total = tempTexts.size();
         for (int idx = 0; idx < total; idx++) {
             CDXText text = (CDXText) tempTexts.get(idx);
             text.addFontInfoFromTempText();
         }
     }

 	public static void addLabelsToMolecules(CMLElement scopeElement) {
 		// labels are normally "underneath" molecules. Since coordinates run 
 		// vertically "downwards" the coord look like:
 		// --------------------------------> +X
 		// |
 		// |    |-----------------|
 		// |    |                 |
 		// |    |    molecule     |
 		// |    |                 |
 		// |    |-----------------|
 		// |
 		// |        |-------|
 		// |        | label |
 		// |        |-------|
 		// |        
 		// V 
 		// +Y
 		
	//		  




© 2015 - 2024 Weber Informatics LLC | Privacy Policy