All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.xmlcml.cml.tools.OscarTool Maven / Gradle / Ivy

/**
 *    Copyright 2011 Peter Murray-Rust et. al.
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

package org.xmlcml.cml.tools;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import nu.xom.Attribute;
import nu.xom.Builder;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.Elements;
import nu.xom.Node;
import nu.xom.Nodes;
import nu.xom.ParentNode;
import nu.xom.Text;

import org.apache.log4j.Logger;
import org.xmlcml.cml.base.CMLConstants;
import org.xmlcml.cml.base.CMLElement;
import org.xmlcml.cml.base.CMLUtil;
import org.xmlcml.cml.element.CMLAction;
import org.xmlcml.cml.element.CMLCml;
import org.xmlcml.cml.element.CMLConditionList;
import org.xmlcml.cml.element.CMLFormula;
import org.xmlcml.cml.element.CMLLink;
import org.xmlcml.cml.element.CMLModule;
import org.xmlcml.cml.element.CMLMolecule;
import org.xmlcml.cml.element.CMLMoleculeList;
import org.xmlcml.cml.element.CMLName;
import org.xmlcml.cml.element.CMLObject;
import org.xmlcml.cml.element.CMLProduct;
import org.xmlcml.cml.element.CMLProductList;
import org.xmlcml.cml.element.CMLProperty;
import org.xmlcml.cml.element.CMLPropertyList;
import org.xmlcml.cml.element.CMLReaction;
import org.xmlcml.cml.element.CMLReactionScheme;
import org.xmlcml.cml.element.CMLScalar;
import org.xmlcml.cml.element.CMLSpectrum;
import org.xmlcml.cml.element.CMLSubstance;
import org.xmlcml.euclid.Util;

/**
 * tool to support reactions. not fully developed
 * 
 * @author pmr
 * 
 */
@SuppressWarnings("unchecked")

public class OscarTool implements CMLConstants {
	private static Logger LOG = Logger.getLogger(OscarTool.class);

    static String OSCAR_CONTAINER = "container";
    static String OSCAR_DATASECTION = "datasection";
    static String OSCAR_NSP = "osc";
    static String OSCAR_NSPUNIT = "oscUnits";
    static String OSCAR_ROLE = "role";
    static String OSCAR_SENTENCE_END = "sentenceEnd";
    
    // this should be read from file...
    static List badFormulas = new ArrayList();
    // strings that are not chemical formulae
    static {
        badFormulas.add("CC");
        badFormulas.add("CD");
        badFormulas.add("SCC");
        badFormulas.add("VIS");
    };
    
    // mappings
    static String OSCAR_DIR = "org/xmlcml/cml/tools/oscar";
    static Map actionMap = null;
    static Map charactersMap = null;
    static Map conditionMap = null;
    static Map mergeableMoleculeMap = null;
    static Map objectMap = null;
    static Map powerUnitsMap = null;
    static Map rawUnitTypeMap = null;
    static Map statePropertyMap = null;
    static Map substanceMap = null;
    static Map temperatureMap = null;
    static Map unitsMap = null;
    static Map unitTypeMap = null;
    static Map verbMap = null;
    static {
//    
//    
//        
        actionMap = getLexicalMap(OSCAR_DIR+U_S+"action.xml");
        charactersMap = getLexicalMap(OSCAR_DIR+U_S+"characters.xml");
        conditionMap = getLexicalMap(OSCAR_DIR+U_S+"condition.xml");
        mergeableMoleculeMap = getLexicalMap(OSCAR_DIR+U_S+"mergeableMolecule.xml");
        objectMap = getLexicalMap(OSCAR_DIR+U_S+"object.xml");
        powerUnitsMap = getLexicalMap(OSCAR_DIR+U_S+"powerUnits.xml");
        rawUnitTypeMap = getLexicalMap(OSCAR_DIR+U_S+"rawUnitType.xml");
        statePropertyMap = getLexicalMap(OSCAR_DIR+U_S+"state.xml");
        substanceMap = getLexicalMap(OSCAR_DIR+U_S+"substance.xml");
        temperatureMap = getLexicalMap(OSCAR_DIR+U_S+"temperature.xml");
        unitsMap = getLexicalMap(OSCAR_DIR+U_S+"units.xml");
        unitTypeMap = getLexicalMap(OSCAR_DIR+U_S+"unitType.xml");
        verbMap = getLexicalMap(OSCAR_DIR+U_S+"verb.xml");
    };

    static Map getLexicalMap(String file) {
        Map map = null;
        try {
            InputStream is = Util.getInputStreamFromResource(file);
            Document doc = new Builder().build(is);
            Element root = doc.getRootElement();
            Elements entrys = root.getChildElements();
            String key = root.getAttributeValue("key");
            boolean regex = "regex".equals(key);
            boolean intKey = "intValue".equals(key);
            String value = root.getAttributeValue("value");
            if (intKey) {
                map = new LinkedHashMap();
            } else if (regex){
                map = new LinkedHashMap();
            } else {
                map = new LinkedHashMap();
            }
            for (int i = 0; i < entrys.size(); i++) {
                Element entry = (Element) entrys.get(i);
                if (intKey) {
                    map.put(new Integer(entry.getAttributeValue(key)), entry.getAttributeValue(value));
                } else if (regex){
                    Pattern pattern = Pattern.compile(entry.getAttributeValue("regex"),
                            Pattern.CASE_INSENSITIVE);
                    map.put(pattern, entry.getAttributeValue(value));
                } else {
                    map.put(entry.getAttributeValue(key), entry.getAttributeValue(value));
                }
            }
        } catch (Exception e) {
            Util.BUG("Error in static for entryMap: ", e);
        }
        return map;
    };
    
    Document doc = null;
    
    CMLCml cml;
    
    CMLModule abstractM;
    CMLModule authorListM;
    CMLModule bodyM;
    CMLModule conclusionsM;
    CMLModule experimentalM;
    CMLModule introductionM;
    CMLModule metadataM;
    CMLModule resultsM;
    CMLModule discussionM;
    CMLModule titleM;
    
    int ncontain = 0;
    /** constructor.
     * 
     * @param doc
     */
    public OscarTool(Document doc) {
        this.doc = doc;
    }


    /** get reaction schemes from Experimental.
     * 
      */
    public void convertToCML() {
        Element elem;
        cml = new CMLCml();
        Element root = doc.getRootElement();
        doc.replaceChild(root, cml);
        CMLUtil.transferChildren(root, cml);
        

        tidyParseBugs();
//        flattenInlineMarkup();
        
        processNamedEntities();
//        processNamedEntities();
        
        processSpectra();
        
        additionalMarkup();
        
        Nodes metadatas = doc.query("/*/METADATA");
        elem = (metadatas.size() == 1) ? (Element) metadatas.get(0) : null;
        processMetadata(elem);
        
        Nodes titles = doc.query("/*/TITLE");
        elem = (titles.size() == 1) ? (Element) titles.get(0) : null;
        processTitle(elem);
        
        Nodes authorLists = doc.query("/*/AUTHORLIST");
        elem = (authorLists.size() == 1) ? (Element) authorLists.get(0) : null;
        processAuthorList(elem);
        
        Nodes abstracts = doc.query("/*/ABSTRACT");
        elem = (abstracts.size() == 1) ? (Element) abstracts.get(0) : null;
        processAbstract(elem);
        
        Nodes bodys = doc.query("/*/BODY");
        elem = (bodys.size() == 1) ? (Element) bodys.get(0) : null;
        processBody(elem);
        
        // tidy containers ending with ( or ending with )
        processBrackets();

        processConjunctions();
        processSentences();
        
        aggregatePropertyAndMolecules();
        aggregateMolecules();

//        tidyIsolatedUnits();
        findSolvents();

        processAmounts();
        
        processDataSections();
        
        removeEmptyContainers();
        flattenModules();
        
        
//        Nodes nodes = cml.query(".//*[local-name()='module']");
//        Nodes mods = cml.query(".//"+CMLModule.NS, CMLConstants.CML_XPATH);
//        if (nodes.size() != mods.size()) {
//            LOG.debug("MODULES....."+nodes.size()+S_SLASH+mods.size());
//        }
    }

    private void tidyParseBugs() {
//        5 hours
//        Bad units for time
//        
//        30 minutes
        Nodes units = cml.query(".//units");
        int nn = units.size();
        for (int i = nn - 1; i >= 0; i--) {
            Element unit = (Element) units.get(i);
            if (unit == null) continue;
            // hours
            detach(unit, "h", "our", "s");
            // minutes
            detach(unit, "min", "ute", "s");
        }

//        
//        K 
//        
        units = cml.query(".//units");
        nn = units.size();
        for (int i = nn - 1; i >= 0; i--) {
            Element unit = (Element) units.get(i);
            if (unit == null) continue;
            Element ne = unit.getFirstChildElement("ne");
            if (ne != null && "K".equals(ne.getValue())) {
                ne.detach();
                unit.appendChild(new Text("K"));
            }
        }
        
        // "No", "As" are stopwords...
        remove2LetterElements();
        
        redeemVerbsFromAdjectives();
    }

    // not yet used
    @SuppressWarnings("unused")
    private void tidyIsolatedUnits() {
        List units = CMLUtil.getQueryNodes(cml, ".//unit");
        for (Node node : units) {
            Element unit = (Element) node;
            CMLUtil.debug(unit, "OSCAR");
        }
    }
    
    @SuppressWarnings("unused")
    private void identifyReactants() {
        // To [molecule]
        
    }

    // not used
    @SuppressWarnings("unused")
    private void flattenInlineMarkup() {
        List nodeList = CMLUtil.getQueryNodes(doc, ".//SB | .//SP | .//IT");
        for (Node node : nodeList) {
            String name = ((Element)node).getLocalName();
            String value = "__"+name+node.getValue()+name+"__";
            ParentNode parent = node.getParent();
            int idx = parent.indexOf(node);
            parent.replaceChild(node, new Text(value));
        }
    }
    
    private void remove2LetterElements() {
        //In 
        Nodes nodes = cml.query(".//ne");
        remove2LetterElements(nodes, "Element");
        
        //As 
        remove2LetterElements(nodes, "EL");
    }

    private void redeemVerbsFromAdjectives() {
//        was  dried        
        List propList = CMLUtil.getQueryNodes(doc, 
                ".//property[@type='state']");
        for (Node node : propList) {
            Element prop = (Element) node;
            List pSibs = CMLUtil.getQueryNodes(prop, 
                    "./preceding-sibling::node()[" +
                    "position()=1 and " +
                    "self::text()]");
            if (pSibs.size() > 0 ) {
                Text pSib = (Text) pSibs.get(0);
                String s = pSib.getValue().trim();
                if (s.endsWith(" was") ||
                        s.endsWith(S_COMMA) ||
                        s.endsWith("and")) {
                    for (String verb : verbMap.keySet()) {
                        if (verb.equalsIgnoreCase(prop.getValue())) {
                            CMLAction action = new CMLAction();
                            action.setTitle(verb);
                            prop.getParent().replaceChild(prop, action);
//                            LOG.debug("replaced state by verb: "+verb);
                            break;
                        }
                    }
                }
            }
        }
    }
    
    private void remove2LetterElements(Nodes nodes, String elem) {
        for (int i = 0; i < nodes.size(); i++) {
            
            Element molecule = (Element) nodes.get(i);
            String title = (elem.equals("EL")) ? molecule.getValue() :
                molecule.getAttributeValue(elem);
            if (
                "As".equals(title) || 
                "At".equals(title) || 
                "Be".equals(title) || 
                "He".equals(title) || 
                "In".equals(title) || 
                "No".equals(title) || 
                false) {
                Text text = new Text(title);
                ParentNode parent = molecule.getParent();
                if (parent != null) {
                    parent.replaceChild(molecule, text);
                }
            }
        }
    }
    
    private void processBrackets() {
        Nodes nodes = bodyM.query(".//"+CMLModule.NS+"[@role='container']", CMLConstants.CML_XPATH);
        List moduleList = new ArrayList();
        for (int i = 0; i < nodes.size(); i++) {
            CMLModule module = (CMLModule) nodes.get(i);
            moduleList.add(module);
        }
        for (CMLModule module : moduleList) {
            // value may contain inline markup so only take first bit
            if (module.getChildCount() > 0) {
                processLeadingRightBracket(module);
                processTrailingLeftBracket(module);
            }
        }
        groupBrackets();

//        removeEmptyContainers();
        
        conflateMoleculeAndFollowingBalancedBrackets();
        conflatePropertyAndFollowingMolecule();
        
        removeBalancedBracketParentFromMolecule();
        removeBalancedBracketChildFromMolecule();
        
    }
    
    private void groupBrackets() {
        // this is not yet fully developed as some non-semantic RBRACKETS are picked up
        // now group brackets
        List lbrakList = CMLUtil.getQueryNodes(bodyM, ".//"+CMLModule.NS+"[@role='lbracket']", CMLConstants.CML_XPATH);
        for (Node lbrak : lbrakList) {
            Nodes nodes = lbrak.query("./following-sibling::"+CMLModule.NS+"[@role='rbracket']",
                    CMLConstants.CML_XPATH);
            if (nodes.size() == 0) {
                // debug stuff
                Nodes nn = lbrak.query("./preceding-sibling::*[1]");
                if (nn.size() > 0) {
                } else {
                    LOG.debug("no preceding siblling");
                }
                nn = lbrak.query("./following-sibling::*[1]");
                if (nn.size() > 0) {
                } else {
                }
                continue;
            }
            CMLModule rbrak = (CMLModule) nodes.get(0);
            CMLElement parent = (CMLElement) lbrak.getParent();
            CMLModule module1 = new CMLModule();
            module1.setRole("balancedBrackets");
            int idx0 = parent.indexOf(lbrak);
            int idx1 = parent.indexOf(rbrak);
            // transfer brackets nodes to new container
            for (int i = idx1-1; i > idx0; i--) {
                CMLElement elem = (CMLElement) parent.getChild(i);
                elem.detach();
                module1.insertChild(elem, 0);
            }
            parent.replaceChild(lbrak, module1);
            rbrak.detach();
        }
    }
    
    private void removeEmptyContainers() {
        // remove empty containers
        List emptyList = CMLUtil.getQueryNodes(
                doc, ".//"+CMLModule.NS+"[@role='container']", CMLConstants.CML_XPATH);
        for (Node empty : emptyList) {
            CMLModule container = (CMLModule) empty;
            if (container.getChildCMLElements().size() == 0 &&
                    container.getValue().trim().equals(S_EMPTY)) {
                empty.detach();
//                LOG.debug("DETACH.............");
            }
        }
    }
    
    private void flattenModules() {
        List modList = CMLUtil.getQueryNodes(
                doc, ".//"+CMLModule.NS+"[@role='container']",
                CMLConstants.CML_XPATH);
        for (Node mod : modList) {
            CMLModule module = (CMLModule) mod;
            Nodes childs = module.query(CMLModule.NS, CMLConstants.CML_XPATH);
            List texts = CMLUtil.getQueryNodes(module, "./text()");
            // has at least one module child
            if (childs.size() > 0) {
                if (texts.size() > 0) {
                    wrapTextNodes(module, texts);
                    childs = module.query(CMLModule.NS, CMLConstants.CML_XPATH);
                }
                // all are modules
                if (childs.size() == module.getChildCount()) {
//                    for (int i = 0; i < module.getChildCount(); i++) {
//                        Util.print((CMLModule)module.getChild(i)).getId()+S_SPACE);
//                    }
//                    LOG.debug("\nREPLACEBYCHILDREN............"+module.getId());
                    module.replaceByChildren();
                } else {
                    // non-module child
                    for (int i = 0; i < module.getChildCount(); i++) {
                        Node child = module.getChild(i);
                        if (child instanceof Text) {
//                            System.err.println("UNWRAPPED "+child.getValue());
                        } else if (child instanceof CMLModule) {
                        } else if (child instanceof CMLElement) {
                            System.err.println("CMLELEMENT "+((Element)child).getLocalName());
                        } else if (child instanceof Element) {
                            // normally IT, SB, SP
//                            System.err.println("ELEMENT "+((Element)child).getLocalName());
                        } else {
                            System.err.println("UNEXPECTED "+child.getClass());
                        }
                    }
                }
            }
        }
    }
    
    private void wrapTextNodes(CMLModule module, List texts) {
        for (Node n : texts) {
            CMLModule mod = new CMLModule();
            mod.setRole(OSCAR_CONTAINER);
            mod.setId(getContainerId());
            module.replaceChild(n, mod);
            mod.appendChild(n);
        }
    }

    private void processLeadingRightBracket(CMLModule module) {
        Node firstChild = module.getChild(0);
        if (firstChild instanceof Text) {
            String value = firstChild.getValue().trim();
            // add new module to represent bracket
            if (value.startsWith(S_RBRAK)) {
                CMLModule rbrak = new CMLModule();
                rbrak.setRole("rbracket");
                ((Text)firstChild).setValue(value.substring(1));
                ParentNode parent = module.getParent();
                parent.insertChild(rbrak, parent.indexOf(module));
            }
        }
    }
    
    private void processTrailingLeftBracket(CMLModule module) {
        Node lastChild = module.getChild(module.getChildCount() - 1);
        if (lastChild instanceof Text) {
            String value = lastChild.getValue().trim();
            if (value.endsWith(S_LBRAK)) {
                CMLModule lbrak = new CMLModule();
                lbrak.setRole("lbracket");
                ((Text)lastChild).setValue(value.substring(0, value.length()-1));
                ParentNode parent = module.getParent();
                parent.insertChild(lbrak, parent.indexOf(module)+1);
            }
        }
    }

    /** turns balancedBrackets after molecule into child 
     */
    private void conflateMoleculeAndFollowingBalancedBrackets() {
        List molList = CMLUtil.getQueryNodes(
                doc, ".//"+CMLMolecule.NS+"[following-sibling::"+CMLModule.NS+"[@role='balancedBrackets']]",
                CMLConstants.CML_XPATH);
        for (Node node : molList) {
            CMLMolecule mol = (CMLMolecule) node;
            @SuppressWarnings("unused")
            String title = mol.getTitle();
            List mods = CMLUtil.getQueryNodes(mol, 
                    "./following-sibling::"+CMLModule.NS+"[@role='balancedBrackets'][1]", CMLConstants.CML_XPATH);
            if (mods.size() > 0) {
                CMLModule module = (CMLModule) mods.get(0);
                module.detach();
                mol.appendChild(module);
            } else {
//                System.err.println("Cannot find sibling for ..."+title);
            }
        }
    }

    /** turns state preceding molecule into child 
     * requires cml:property@state as immediately preceding sibling
     */
    private void conflatePropertyAndFollowingMolecule() {
        String subquery = "preceding-sibling::node()[" +
                "position()=1 and " +
                "self::"+CMLProperty.NS+" and " +
                "@state]";
        List molList = CMLUtil.getQueryNodes(
                doc, ".//"+CMLMolecule.NS+"["+subquery+S_RSQUARE, CMLConstants.CML_XPATH);
        for (Node node : molList) {
            CMLMolecule mol = (CMLMolecule) node;
            @SuppressWarnings("unused")
            String title = mol.getTitle();
//            LOG.debug("PPPPP "+title);
            List props = CMLUtil.getQueryNodes(mol, subquery, CMLConstants.CML_XPATH);
            if (props.size() > 0) {
                CMLProperty property = (CMLProperty) props.get(0);
                property.detach();
                mol.appendChild(property);
            } else {
//                System.err.println("Cannot find state sibling for ..."+title);
            }
        }
    }

    private void removeBalancedBracketParentFromMolecule() {
        List brackMolList = CMLUtil.getQueryNodes(
            doc, ".//"+CMLModule.NS+"[@role='balancedBrackets' and " +
            "count(*) = 1 and count("+CMLMolecule.NS+") = 1]", CMLConstants.CML_XPATH);
        for (Node brackMol : brackMolList) {
            ((CMLElement)brackMol).replaceByChildren();
        }
    }
    
    private void removeBalancedBracketChildFromMolecule() {
        List brackMolList = CMLUtil.getQueryNodes(
            doc, ".//"+CMLMolecule.NS+S_SLASH+CMLModule.NS+"[@role='balancedBrackets']", CMLConstants.CML_XPATH);
        for (Node brackMol : brackMolList) {
            ((CMLElement)brackMol).replaceByChildren();
        }
    }

    private void processConjunctions() {
        List textList = CMLUtil.getQueryNodes(doc, ".//"+CMLModule.NS+"[@role='container']", CMLConstants.CML_XPATH);
        for (Node node : textList) {
            CMLModule module = (CMLModule) node;
            String value = module.getValue().trim();
            if (value.equals("with") ||
                    value.equals("and") ||
                    value.equals(S_COMMA)
                    ) {
                module.setRole("conjunction");
                ((Text)module.getChild(0)).setValue(value);
            }
        }
    }
    
    private void processDataSections() {
        List nodeList = CMLUtil.getQueryNodes(doc, ".//datasection");
        for (Node node : nodeList) {
            Element datasection = (Element) node;
            CMLModule module = new CMLModule();
            module.setRole(OSCAR_DATASECTION);
            datasection.getParent().replaceChild(datasection, module);
            CMLUtil.transferChildren(datasection, module);
        }
    }
    
    /**
     * detect yxzzy bar. Foo abc. Plugh
     * as containing sentence end. Change to 
     * yxzzy bar"
     * 
     * Foo abc"
     * 
     * Plugh
     * 
     * This is harder than it looks because the text may have embedded inline
     * markup and this causes problems with module structure.
     * 
     */
    private void processSentences() {
        // for all modules look for end of sentence
        List contList = CMLUtil.getQueryNodes(doc, ".//"+CMLModule.NS+"[@role='container']", CMLConstants.CML_XPATH);
        for (Node cont : contList) {
            CMLModule module = (CMLModule) cont;
            List textList = CMLUtil.getQueryNodes(module, "./text()");
            List splitList = new ArrayList();
            if (textList.size() > 1) {
//                System.err.println("SIZE "+textList.size());
            }
            for (Node t : textList) {
                Text text = (Text) t;
                int nsplit = splitSentence(module, text);
                if (nsplit >0) {
                    splitList.add(text);
                } else {
                }
            }
            // remove original text if split.
            for (Text t : splitList) {
                t.detach();
            }
        }
        // split modules at sentence end
        List sentenceEndList = CMLUtil.getQueryNodes(doc, 
                ".//"+CMLModule.NS+"[@role='container']/"+CMLModule.NS+"[@role='sentenceEnd']",
                CMLConstants.CML_XPATH);
        for (Node n : sentenceEndList) {
            CMLModule sentenceEnd = (CMLModule) n;
            splitParentModule(sentenceEnd);
        }
        wrapSentencesAsModules();
    }
    
    private int splitSentence(CMLModule textParent, Text text) {
        if (!(textParent.equals(text.getParent()))) {
            System.err.println("Bad parent: "+textParent.getClass());
            return 0;
        }
        ParentNode moduleParent = textParent.getParent();
        @SuppressWarnings("unused")
        int ipar = moduleParent.indexOf(textParent);
        int ipos = textParent.indexOf(text);
        String value = text.getValue().trim();
        // normalize string to end with space after period
        if (value.endsWith(S_PERIOD)) {
            value += S_SPACE;
        }
        int nsplit = 0;
        int ioff = 0;
        while (value.length() > 0) {
            value += S_SPACE;
            int idx = value.indexOf(S_PERIOD+S_SPACE);
            String value0 = null;
            if (idx == -1) {
                value0 = value;
                if (nsplit == 0) {
                    break;
                }
            } else {
                value0 = value.substring(0, idx);
                value = value.substring(idx+2).trim();
                nsplit++;
            }
            value0 = value0.trim();
            if (value0.trim().length() > 0) {
                CMLModule newModule = new CMLModule();
                newModule.setRole(OSCAR_CONTAINER);
                newModule.setId(getContainerId());
                newModule.appendChild(new Text(value0));
                textParent.insertChild(newModule, ipos+(++ioff));
            }
            if (idx != -1) {
                CMLModule stop = new CMLModule();
                stop.setRole(OSCAR_SENTENCE_END);
                textParent.insertChild(stop, ipos+(++ioff));
            } else {
                break;
            }
        }
        return nsplit;
    }
    
    private void wrapSentencesAsModules() {
        List sentenceEnds = CMLUtil.getQueryNodes(
                cml, ".//"+CMLModule.NS+"[@role='sentenceEnd']", CMLConstants.CML_XPATH);
        int ii = 0;
        for (Node node : sentenceEnds) {
            CMLModule sentenceEnd = (CMLModule) node;
            sentenceEnd.setId("S"+(++ii));
            ParentNode parent = sentenceEnd.getParent();
            List precedingSE = CMLUtil.getQueryNodes(sentenceEnd, 
                    "./preceding-sibling::"+CMLModule.NS+"[@role='sentenceEnd'][position()=1]", CMLConstants.CML_XPATH);
            int idx = (precedingSE.size()==0) ? -1 : 
                parent.indexOf(precedingSE.get(0));
            wrapSiblingsInSentence(sentenceEnd, idx);
        }
        for (Node node : sentenceEnds) {
            ((CMLModule)node).setRole("sentence");
        }
    }
    
    private void wrapSiblingsInSentence(
            CMLModule sentenceEnd, int indexOfPrecedingSE) {
        ParentNode parent = sentenceEnd.getParent();
        int thisIndex = parent.indexOf(sentenceEnd);
        List siblingList = new ArrayList();
        for (int i = indexOfPrecedingSE+1; i < thisIndex; i++) {
            Element sibling = (Element) parent.getChild(i);
            if (sibling instanceof CMLModule && 
                    "sentenceEnd".equals(sibling.getAttribute("role"))) {
                LOG.debug("SEEEEEEEEEEEEEEEEEEEEEEEEEEEEE");
                continue;
            }
            siblingList.add(sibling);
        }
        for (Element sibling : siblingList) {
            sibling.detach();
            if (sibling instanceof CMLModule && 
                    "sentenceEnd".equals(sibling.getAttribute("role"))) {
                LOG.debug("SXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
                continue;
            }
            sentenceEnd.appendChild(sibling);
        }
    }
    
    private String getContainerId() {
        return "c"+(++ncontain);
    }

    private void splitParentModule(CMLModule sentenceEnd) {
        CMLModule parentModule = (CMLModule) sentenceEnd.getParent();
        int ipos = parentModule.indexOf(sentenceEnd);
        ParentNode grandparent = parentModule.getParent();
        int ipar = grandparent.indexOf(parentModule);
        sentenceEnd.detach();
        grandparent.insertChild(sentenceEnd, ipar+1);
        CMLModule newModule = new CMLModule();
        newModule.setRole(OSCAR_CONTAINER);
        grandparent.insertChild(newModule, ipar+2);
        
        // transfer trailing children to new module
        // ipos is where the sentenceEnd was, but now closed up
        int nchild = parentModule.getChildCount();
        for (int i = nchild-1; i >= ipos; i--) {
            Node node = parentModule.getChild(i);
            node.detach();
            newModule.insertChild(node, 0);
        }
    }
    
    private void aggregateMolecules() {
        // find molecule conjunction molecule
        List molList = CMLUtil.getQueryNodes(doc, ".//"+CMLMolecule.NS, CMLConstants.CML_XPATH);
        for (Node molNode : molList) {
            Node conj = CMLUtil.getPrecedingSibling(molNode);
            if (conj instanceof CMLModule) {
                CMLModule conjMod = (CMLModule) conj;
                if ("conjunction".equals(conjMod.getRole())) {
                    Node molSib = CMLUtil.getPrecedingSibling(conjMod);
                    if (molSib instanceof CMLMolecule) {
                        ParentNode parent = conj.getParent();
                        CMLMoleculeList moleculeList = new CMLMoleculeList();
                        molSib.detach();
                        moleculeList.addMolecule((CMLMolecule)molSib);
                        molNode.detach();
                        moleculeList.addMolecule((CMLMolecule)molNode);
                        parent.replaceChild(conj, moleculeList);
                    } else if (molSib instanceof CMLMoleculeList) {
                        ((CMLMoleculeList)molSib).addMolecule((CMLMolecule)molNode);
                        conjMod.detach();
                    }
                }
            }
        }
    }
    
    private void findSolvents() {
        // wrap possible molecules in substance
        List moleculeList = 
            CMLUtil.getQueryNodes(doc, ".//"+CMLMolecule.NS, CMLConstants.CML_XPATH);
        for (Node node : moleculeList) {
            if (node instanceof CMLMolecule) {
                CMLMolecule molecule = (CMLMolecule) node;
                CMLModule module = getPrecedingModuleContainer(molecule, " in");
                if (module != null) {
                    CMLSubstance substance = new CMLSubstance();
                    molecule.getParent().replaceChild(molecule, substance);
                    substance.appendChild(molecule);
                }
            }
        }
        // set role=solvent
        List substanceList = 
            CMLUtil.getQueryNodes(doc, ".//"+CMLSubstance.NS, CMLConstants.CML_XPATH);
        for (Node node : substanceList) {
            CMLSubstance substance = (CMLSubstance) node;
            @SuppressWarnings("unused")
            CMLModule module = getPrecedingModuleContainer(substance, " in");
            substance.setRole(C_A+"solvent");
        }
    }
    
    private CMLModule getPrecedingModuleContainer(
            CMLElement element, String endString) {
        CMLModule module = null;
        Node sib = CMLUtil.getPrecedingSibling(element);
        boolean ok = false;
        if (sib != null && sib instanceof CMLModule) {
            module = (CMLModule) sib;
            if (OSCAR_CONTAINER.equals(module.getAttributeValue(OSCAR_ROLE))) {
                String value = module.getValue().trim();
                if (value.endsWith(endString)) {
                    ok = true;
                }
            }
        }
        return ok ? module : null;
    }

    /** property + molecule => molecule/property
     */
    private void aggregatePropertyAndMolecules() {
        List nodeList = CMLUtil.getQueryNodes(doc,
                CMLMolecule.NS+"[preceding-sibling::"+CMLProperty.NS+"]", CMLConstants.CML_XPATH);
        for (Node node : nodeList) {
            CMLMolecule molecule = (CMLMolecule) node;
            Node propSib = CMLUtil.getPrecedingSibling(molecule);
            if (propSib != null) {
                propSib.detach();
                molecule.appendChild(propSib);
            }
        }
    }
    
    /** tidy this...
    
     (
        
    0.132
    
    , total amount 
        
    0.372
    
    , 45%),
    ...*/
    private void processAmounts() {
        Nodes nodes = cml.query(
                ".//"+CMLMolecule.NS+"[following-sibling::"+CMLModule.NS+"[@role='container' and position()=1]]",
                CMLConstants.CML_XPATH);
//        System.err.println("NODES "+nodes.size());
        for (int i = 0; i < nodes.size(); i++) {
            CMLMolecule molecule = (CMLMolecule) nodes.get(i);
            CMLModule module = (CMLModule) molecule.query("following-sibling::"+CMLModule.NS, CMLConstants.CML_XPATH).get(0);
            String value = module.getValue().trim();
            if (value.startsWith(S_LBRAK)) {
//                System.err.println("VALUE "+value);
                Nodes properties = module.query(CMLProperty.NS, CMLConstants.CML_XPATH);
                if (properties.size() != 2) {
                    @SuppressWarnings("unused")
                    Element fs0 = (Element)module.query("./preceding-sibling::*[1]").get(0);
                    Nodes fs = module.query("./following-sibling::*");
                    if (fs.size() >= 1) {
                        @SuppressWarnings("unused")
                    Element fs1 = (Element)module.query("./following-sibling::*[1]").get(0);
                    }
                    if (fs.size() >= 2) {
                        @SuppressWarnings("unused")
                    Element fs2 = (Element)module.query("./following-sibling::*[2]").get(0);
                    }
                    if (fs.size() >= 3) {
                        @SuppressWarnings("unused")
                    Element fs3 = (Element)module.query("./following-sibling::*[3]").get(0);
                    }
                } else {
                    LOG.debug("OK PROPRTY");
                    CMLPropertyList propertyList = new CMLPropertyList();
                    CMLUtil.transferChildren(module, propertyList);
                    module.getParent().replaceChild(module, propertyList);
                }
            }
        }
    }
    private void processSpectra() {
        Nodes ne = doc.query(".//spectrum");
        for (int i = 0; i < ne.size(); i++) {
            processSpectrum((Element) ne.get(i));
        }
    }
    
    private void processSpectrum(Element el) {
        String type = el.getAttributeValue("type");
        if (
                "cnmr".equals(type) ||
                "hnmr".equals(type) ||
                "ir".equals(type) ||
                "massSpec".equals(type) ||
                "uv".equals(type) ||
            false) {
            CMLSpectrum spectrum = new CMLSpectrum();
            spectrum.setType(type);
            el.getParent().replaceChild(el, spectrum);
            CMLUtil.transferChildren(el, spectrum);
            
            List childs = CMLUtil.getChildNodes(el);
            // some children may get removed so ignore them
            for (Node child : childs) {
                if (child.getParent() != null) {
                    processSpectrumDescendant(child);
                }
            }
        } else {
            System.err.println("UNKNOWN SPECTRUM "+type);
        }
    }
    
    private void processSpectrumDescendant(Node node) {
        if (node instanceof Text) {
            // leave as is
        } else if (node instanceof Element) {
            Element element = (Element) node;
            String name = element.getLocalName();
            if ("SB".equals(name)) {
                node.getParent().replaceChild(node, new Text(S_UNDER+element.getValue()+S_UNDER));
            } else if ("SP".equals(name)) {
                node.getParent().replaceChild(node, new Text("^"+element.getValue()+"^"));
            } else if ("peaks".equals(name)) {
                processPeaks(element);
            } else {
                System.err.println("FAILED TO PROCESS: "+name);
            }
        }
    }
    
    private void processPeaks(Element element) {
        
//        for ()
    }

    private void processNamedEntities() {
        Nodes ne = doc.query("//ne");
        List neList = new ArrayList();
        for (int i = 0; i < ne.size(); i++) {
            neList.add((Element) ne.get(i));
        }
        for (Element namedEntity : neList) {
            processNamedEntity(namedEntity);
        }
        List nodeList = CMLUtil.getQueryNodes(doc, ".//"+CMLMolecule.NS, CMLConstants.CML_XPATH);
        @SuppressWarnings("unused")
        List molList = new ArrayList();
        for (Node node : nodeList) {
            LOG.debug("NNNNNNN"+node);
//            CMLMolecule molecule = (CMLMolecule) node;
//            tryMergeWithPreviousSibling(molecule);
        }
    }
    
    private CMLElement processNamedEntity(Element el) {
        String type = el.getAttributeValue("type");
        CMLElement cmlElement = null;
        if ("CM".equals(type) ||
            "CMS".equals(type) ||
            "OX".equals(type) ||
            "CJ".equals(type) ||
            "RN".equals(type) ||
            "ASES".equals(type) ||
            false) {
            CMLMolecule molecule = new CMLMolecule();
            molecule.setTitle(el.getValue());
            molecule.setRole(type);
            el.getParent().replaceChild(el, molecule);
            cmlElement = molecule;
        } else {
            System.err.println("UNKNOWN NE "+type);
        }
        return cmlElement;
    }
    
    @SuppressWarnings("unused")
    private void tryMergeWithPreviousSibling(CMLMolecule molecule) {
        String title = molecule.getTitle();
        for (Pattern pattern : mergeableMoleculeMap.keySet()) {
            Matcher matcher = pattern.matcher(title);
            if (matcher.matches()) {
                ParentNode parent = molecule.getParent();
                int idx = parent.indexOf(molecule);
                List nodes = CMLUtil.getQueryNodes(molecule, 
                    "./preceding-sibling::"+CMLMolecule.NS, CMLConstants.CML_XPATH);
                if (nodes.size() > 0) {
                    CMLMolecule preceding = (CMLMolecule) nodes.get(0);
                    int prevIdx = parent.indexOf(preceding);
                    if (idx == prevIdx + 1) {
                        preceding.setTitle(preceding.getTitle()+"++"+title);
//                        String tValue = text.getValue();
//                        LOG.debug("XXXXXXXXXXXXXX "+tValue);
//                        text.setValue(tValue+"++"+title);
//                        molecule.setRole("detach");
                        LOG.debug("============== "+preceding.getTitle());
                        break;
                    }
                }
            }
        }
    }

    /** adds markup that OSCAR missed.
     */
    private void additionalMarkup() {
      List texts = CMLUtil.getQueryNodes(cml, ".//text()");
      for (Node n : texts) {
          markup(actionMap, (Text) n, CMLAction.class);
      }
      texts = CMLUtil.getQueryNodes(cml, ".//text()");
      for (Node n : texts) {
          markup(conditionMap, (Text) n, CMLConditionList.class);
      }
      // rather crude to do it this way, but safe
      texts = CMLUtil.getQueryNodes(cml, ".//text()");
      for (Node n : texts) {
          markup(objectMap, (Text) n, CMLObject.class);
      }
      texts = CMLUtil.getQueryNodes(cml, ".//text()");
      for (Node n : texts) {
          markup(substanceMap, (Text) n, CMLSubstance.class);
      }
      // markup missed units (e.g. cm3
      markupUnits();
      markupPropertiesWithUnits();
  }
    
    /** find keys and uinsert new marked elements into text.
     * Example: if text value is: "this is a foo action"
     * and foo is a key, might create:
     * Text("this is a ") foo Text(" action");
     * recurses through the next texts so that all instances of keys are 
     * found and marked
     * @param parent
     * @param text
     */
    
    private void markup(Map map, Text text, Class cmlClass) {
        ParentNode parent = text.getParent();
        String textS = text.getValue();
        int pos = parent.indexOf(text);
        for (Pattern pattern : map.keySet()) {
            Matcher matcher = pattern.matcher(textS);
            int start = 0;
            int end = 0;
            List textList = new ArrayList();
            while (matcher.find()) {
                if (end == 0) {
                    text.detach();
                }
                start = matcher.start();
                if (start > end) {
                    Text tt = new Text(textS.substring(end, start));
                    textList.add(tt);
                    parent.insertChild(tt, pos++);
                }
                end = matcher.end();
                if (end > start) {
                    String dictRef = map.get(pattern);
                    CMLElement cmlElement = null;
                    try {
                        cmlElement = (CMLElement) cmlClass.newInstance();
                    } catch (Exception e) {
                        e.printStackTrace();
                        Util.BUG("cannot create CML object", e);
                    }
                    String title = textS.substring(start, end);
                    if (!(cmlClass.equals(CMLAction.class))) {
//                        LOG.debug("Made "+cmlClass+S_SLASH+title);
                    }
                    cmlElement.addAttribute(new Attribute("title", title));
                    cmlElement.appendChild(new Text(title));
                    cmlElement.addAttribute(new Attribute(OSCAR_ROLE, dictRef));
                    parent.insertChild(cmlElement, pos++);
                }
            }
            if (end > 0) {
                String endS = textS.substring(end, textS.length());
                Text tt = new Text(endS);
                textList.add(tt);
                parent.insertChild(tt, pos++);
                for (Text t : textList) {
                    markup(map, t, cmlClass);
                }
                break;
            }
        }
    }

    private void markupUnits() {
        List spNodes = CMLUtil.getQueryNodes(doc, ".//SP");
        for (Node node : spNodes) {
            Element sp = (Element) node;
            String s = sp.getValue().trim();
            try {
                new Integer(s);
            } catch (NumberFormatException nfe) {
                continue;
            }
            Node psib = CMLUtil.getPrecedingSibling(node);
            if (psib != null) {
                String v = psib.getValue().trim();
                for (String u : powerUnitsMap.keySet()) {
                    if (v.endsWith(u)) {
                        Text t = CMLUtil.getLastTextDescendant(psib);
                        String vv = t.getValue();
                        if (!(v.endsWith(u))) {
                            throw new RuntimeException("Bad units");
                        }
                        t.setValue(vv.substring(0, vv.length()-u.length()));
                        Element unit = new Element("units");
                        unit.appendChild(new Text((u + s).trim()));
                        sp.getParent().replaceChild(sp, unit);
                    }
                }
            }
        }
    }


    
//  from
//  30min
// form     
//    
//      
//        30
//        min
//      
//    
    private void markupPropertiesWithUnits() {
        List unitNodes = CMLUtil.getQueryNodes(doc, ".//units");
        for (Node node : unitNodes) {
            Element unit = (Element) node;
            ParentNode parent = unit.getParent();
            if (parent != null || 
                !(parent instanceof Element) ||
                !(((Element)parent).getLocalName().equals("quantity"))) {
                    markupIsolatedUnit(parent, unit);
            }
            standardizeUnits(parent, unit);
        }
    }
    
    private void markupIsolatedUnit(ParentNode parent, Element unit) {
        int idx = parent.indexOf(unit);
        Node psib = CMLUtil.getPrecedingSibling(unit);
        Text t = CMLUtil.getLastTextDescendant(psib);
        if (t != null) {
            String vvv = t.getValue().trim();
            String[] vv = vvv.split(S_SPACE);
            String v = vv[vv.length-1].trim();
            Double d = null;
            if (v.length() > 0) {
                try {
                    d = new Double(v);
                } catch (NumberFormatException e) {
    //                LOG.debug("Isolated unit: Cannot parse as double: "+vector);
                }
            }
            if (d != null) {
                t.setValue(vvv.substring(0, vvv.length()-v.length()));
                @SuppressWarnings("unused")
                String u = unit.getValue();
                Element point = new Element("point");
                point.appendChild(new Text(v));
                Element value = new Element("value");
                value.appendChild(point);
                Element quantity = new Element("quantity");
                quantity.appendChild(value);
                unit.detach();
                quantity.appendChild(unit);
                Element property = new Element("property");
                property.appendChild(quantity);
                property.addAttribute(new Attribute("type", "quantity"));
                parent.insertChild(property, idx);
            }
        }
    }
    
    private void standardizeUnits(ParentNode parent, Element unit) {
        parent = unit.getParent();
        if (parent != null && 
                parent instanceof Element &&
                ((Element)parent).getLocalName().equals("quantity")) {
            Element quantity = (Element) parent;
            Text text = CMLUtil.getFirstTextDescendant(unit);
            if (text != null) {
                String u = text.getValue();
                String standardUnit = unitsMap.get(u);
                if (standardUnit == null) {
                    CMLUtil.debug(unit, "OSCAR1");
                    LOG.debug("\nCannot find unit for ["+u+S_RSQUARE);
                } else {
                    text.setValue(standardUnit);
                    String uType = unitTypeMap.get(standardUnit);
                    if (uType == null) {
                        LOG.debug("Cannot find type for: "+standardUnit);
                    } else {
                        Attribute type = quantity.getAttribute("type");
                        if (type == null) {
                            quantity.addAttribute(new Attribute("type", uType));
                        } else {
                            String t = type.getValue();
                            String tt = rawUnitTypeMap.get(t);
                            if (tt == null) {
                                LOG.debug("Unknown raw unit type: "+t);
                            } else {
                                if (tt.equals(uType)) {
                                } else {
                                    LOG.debug("original type ("+tt+") incomaptible with ("+uType+S_RBRAK);
                                }
                            }
                        }
                    }
                }
            }
        }
    }
        
    private void detach(Node unit, String s1, String s2, String s3) {
        Element parent = (Element) unit.getParent();
        if (parent != null) {
            int idx = parent.indexOf(unit);
            if (s3.equals(unit.getValue())) {
                Node n1 = parent.getChild(idx-1);
                Node n2 = (idx > 1) ? parent.getChild(idx-2) : null;
                if (n1 != null && s2.equals(n1.getValue()) &&
                    n2 != null && s1.equals(n2.getValue())
                        ) {
                    unit.detach();
                    n1.detach();
                }
            }
        }
    }
    private void processMetadata(Element metadata) {
        if (metadata != null) {
            metadataM = new CMLModule();
            metadataM.setRole("metadata");
            CMLUtil.transferChildren(metadata, metadataM);
            metadata.getParent().replaceChild(metadata, metadataM);
            processElements(metadataM);
        } else {
            error("No metadata");
        }
    }
    
    private void processTitle(Element title) {
        if (title != null) {
            titleM = new CMLModule();
            titleM.setRole("title");
            CMLUtil.transferChildren(title, titleM);
            title.getParent().replaceChild(title, titleM);
            processElements(titleM);
        } else {
            error("No title");
        }
    }
    
    private void processAuthorList(Element authorList) {
        if (authorList != null) {
            authorListM = new CMLModule();
            authorListM.setRole("authorList");
            CMLUtil.transferChildren(authorList, authorListM);
            authorList.getParent().replaceChild(authorList, authorListM);
            processElements(authorListM);
        } else {
            error("No authorList");
        }
    }
    
    private void processAbstract(Element abstractx) {
        if (abstractx != null) {
            abstractM = new CMLModule();
            abstractM.setRole("abstract");
            CMLUtil.transferChildren(abstractx, abstractM);
            abstractx.getParent().replaceChild(abstractx, abstractM);
            processElements(abstractM);
            processParagraphs(abstractM);
        } else {
            error("No abstract");
        }
    }
    
    private void processBody(Element body) {
        if (body != null) {
            bodyM = new CMLModule();
            bodyM.setRole("body");
            CMLUtil.transferChildren(body, bodyM);
            body.getParent().replaceChild(body, bodyM);
            processElements(bodyM);
            processParagraphs(bodyM);
            processBodySections();
            markSynthesizedCompounds();
        } else {
            error("No body");
        }
    }
    
    private void processExperimental(Element experimental) {
        if (experimental != null) {
            experimentalM = new CMLModule();
            experimentalM.setRole("experimental");
            CMLUtil.transferChildren(experimental, experimentalM);
            experimental.getParent().replaceChild(experimental, experimentalM);
            processElements(experimentalM);
            removeHeader(experimentalM);
        } else {
            error("No experimental");
        }
    }
    
    private void processConclusions(Element conclusions) {
        if (conclusions != null) {
            conclusionsM = new CMLModule();
            conclusionsM.setRole("conclusions");
            CMLUtil.transferChildren(conclusions, conclusionsM);
            conclusions.getParent().replaceChild(conclusions, conclusionsM);
            processElements(conclusionsM);
            removeHeader(conclusionsM);
        } else {
//            error("No conclusions");
        }
    }
    
    private void processDiscussion(Element discussion) {
        if (discussion != null) {
            discussionM = new CMLModule();
            discussionM.setRole("discussion");
            CMLUtil.transferChildren(discussion, discussionM);
            discussion.getParent().replaceChild(discussion, discussionM);
            processElements(discussionM);
            removeHeader(discussionM);
        } else {
//            error("No discussions");
        }
    }
    
    private void processIntroduction(Element introduction) {
        if (introduction != null) {
            introductionM = new CMLModule();
            introductionM.setRole("introduction");
            CMLUtil.transferChildren(introduction, introductionM);
            introduction.getParent().replaceChild(introduction, introductionM);
            processElements(introductionM);
            removeHeader(introductionM);
        } else {
            error("No introduction");
        }
    }
    
    private void processResults(Element results) {
        if (results != null) {
            resultsM = new CMLModule();
            resultsM.setRole("results");
//            System.err.println("RES "+results.getChildCount());
            CMLUtil.transferChildren(results, resultsM);
            results.getParent().replaceChild(results, resultsM);
            processElements(resultsM);
            removeHeader(resultsM);
//            System.err.println("RES "+resultsM.getChildCount());
        } else {
            error("No results");
        }
    }
    
    private void processParagraphs(CMLModule module) {
        Nodes paras = module.query(".//P");
        for (int i = 0; i < paras.size(); i++) {
            Element para = (Element) paras.get(i);
            CMLModule paraM = new CMLModule();
            paraM.setRole("para");
            CMLUtil.transferChildren(para, paraM);
            para.getParent().replaceChild(para, paraM);
            processParagraph(paraM);
        }
    }
    
    private void processParagraph(CMLModule paraM) {
        int nn = paraM.getChildCount();
        CMLModule container = null;
        List childList = new ArrayList();
        for (int i = 0; i < nn; i++) {
            childList.add((Node)paraM.getChild(i));
        }
        int ipos = 0;
        for (Node child : childList) {
            if (child instanceof CMLElement) {
                if (container != null) {
                    paraM.insertChild(container, ipos++);
                }
                child.detach();
                paraM.insertChild(child, ipos++);
                container = null;
            } else {
                child.detach();
                container = ensure(container);
                container.appendChild(child);
            }
        }
        if (container != null) {
            paraM.insertChild(container, ipos++);
        }
    }

    private CMLModule ensure(CMLModule container) {
        if (container == null) {
            container = new CMLModule();
            container.setRole(OSCAR_CONTAINER);
            container.setId(getContainerId());
        }
        return container;
    }
    private void processElements(Element elem) {

        // try to deal with encodings
        tidyCharacters(elem);
        
        badChemicals(elem);
        
        //4 to 
        xRef2Mol(elem);
        
        // formula[chemical] or formula[text(elem)]
        formula2Mol(elem);

        processProperty(elem);
        
        processChemical(elem);
        
        // 'chemical followed by XREF
        chemicalXref(elem);
        
        // molecule/REF
        moleculeREF(elem);
        
        // tidy things OSCAR has missed
        markText(elem);
        
    }
    
    // remove SB, SP, IT, etc.
    private static void unmark(Element div, String name) {
        Nodes inline = div.query(".//"+name);
        for (int i = 0; i < inline.size(); i++) {
            unmark((Element) inline.get(i));
        }
    }
    
    private static void unmark(Element elem) {
        ParentNode parent = elem.getParent();
        int idx = parent.indexOf(elem);
        elem.detach();
        for (int i = 0; i < elem.getChildCount(); i++) {
            Node node = elem.getChild(i);
            node.detach();
            parent.insertChild(node, idx + i);
        }
    }

    // tidy encodings
    // FIXME move to CMLUtil
    private void tidyCharacters(Element elem) {
        Nodes texts = elem.query(".//text()");
        for (int i = 0; i < texts.size(); i++) {
            Text text = (Text) texts.get(i);
            StringBuilder sb = new StringBuilder(text.getValue());
            boolean change = false;
            for (int j = 0; j < sb.length(); j++) {
                char c = sb.charAt(j);
                int ii = (int) c;
                if (ii > 255) {
                    String s = charactersMap.get(new Integer(ii));
                    if (s != null) {
                        sb.replace(j, j+1, s);
                        change = true;
                    } else {
                        LOG.debug(">unknown>"+ii+">>"+c);
                        LOG.debug(">>>"+text.getParent().getValue());
                    }
                }
            }
            if (change) {
                text.setValue(sb.toString());
            }
        }
    }
    
    private void processBodySections() {
        Nodes divs = bodyM.query("DIV[HEADER]");
        for (int i = 0; i < divs.size(); i++) {
            Element div = (Element) divs.get(i);
            Element header = (Element) div.getFirstChildElement("HEADER");
            String title = header.getValue();
            if (title == null || title.trim().equals(S_EMPTY)) {
                try {
//                CMLUtil.debug(div, System.err);
                } catch (Exception e) {}
                error("HEADER must have child text");
            } else if (title.equals("Conclusions") || 
                    title.equals("Conclusion")) {
                processConclusions(div);
            } else if (title.equals("Discussion")) {
                processDiscussion(div);
            } else if (title.equals("Experimental")) {
                processExperimental(div);
            } else if (title.equals("Introduction")) {
                processIntroduction(div);
            } else if (title.indexOf("Results") != -1) {
                processResults(div);
            } else {
                error("unknown section: "+title);
            }
        }
    }
    
    private void removeHeader(Element div) {
        Element header = div.getFirstChildElement("HEADER");
        header.detach();
    }

    
    //4 to 
    private void xRef2Mol(Element elem) {
        Nodes xrefs = elem.query(".//XREF[@TYPE='COMPOUND']");
        for (int i = 0; i < xrefs.size(); i++) {
            Element xref = (Element) xrefs.get(i);
            ParentNode parent = xref.getParent();
            CMLMolecule mol = new CMLMolecule();
            parent.replaceChild(xref, mol);
            mol.setRef(xref.getAttributeValue("ID"));
            mol.setTitle(xref.getValue());
        }
    }
    
//    - 
//    - 
//      NaHCO 
//      3 
//      
//      
    // or
//    
//    SCC 
//        
    // or
//  VIS 
    private void formula2Mol(Element elem) {
        Nodes formulas = elem.query(".//formula");
        for (int i = 0; i < formulas.size(); i++) {
            Element formula = (Element) formulas.get(i);
            String value = formula.getValue();
            ParentNode parent = formula.getParent();
            int idx = parent.indexOf(formula);
            formula.detach();
            if (badFormulas.contains(value)) {
    //            LOG.debug("Bad: "+value);
                parent.insertChild(new Text(value), idx);
    //            CMLUtil.debug((Element)parent);
            } else {
                CMLMolecule mol = new CMLMolecule();
                parent.insertChild(mol, idx);
                Elements chemicals = formula.getChildElements("chemical");
                if (chemicals.size() == 0) {
                    mol.setTitle(formula.getValue());
                } else {
                    Element chemical = (Element) chemicals.get(0);
                    mol.setTitle(chemical.getValue());
                    String smiles = formula.getAttributeValue("SMILES");
                    if (smiles != null) {
                        CMLScalar scalar = new CMLScalar();
                        scalar.setValue(smiles);
                        scalar.setDictRef(OSCAR_NSP+S_COLON+"smiles");
                        mol.appendChild(scalar);
                    }
                    String inchi = formula.getAttributeValue("InChI");
                    if (smiles != null) {
                        CMLScalar scalar = new CMLScalar();
                        scalar.setValue(inchi);
                        scalar.setDictRef(OSCAR_NSP+S_COLON+"inchi");
                        mol.appendChild(scalar);
                    }
                }
            }
        }
    }

    // deals with chemical/formula/text()
//    private static void badChemicalFormulas(Element chemicalFormula) {
//        if (chemicalFormula.getChildElements().size() == 0) {
//            String value = chemicalFormula.getValue();
//            if (badFormulas.contains(value)) {
//                ParentNode parent = chemicalFormula.getParent();
//                parent.replaceChild(chemicalFormula, new Text(chemicalFormula.getValue()));
//            }
//        }
//    }
    private void badChemicals(Element elem) {
        Nodes chemicals = elem.query(".//chemical");
        for (int i = 0; i < chemicals.size(); i++) {
            Element chemical = (Element) chemicals.get(i);
            if (chemical.getChildElements().size() == 0) {
                String value = chemical.getValue();
                if (badFormulas.contains(value)) {
                    ParentNode parent = chemical.getParent();
                    parent.replaceChild(chemical, new Text(chemical.getValue()));
                }
            }
        }
    }
    
    private void processChemical(Element elem) {
        Nodes chemicals = elem.query(".//chemical");
        for (int i = 0; i < chemicals.size(); i++) {
            Element chemical = (Element) chemicals.get(i);
            unmark(chemical, "SB");
            unmark(chemical, "SP");
            unmark(chemical, "IT");
            Elements elems = chemical.getChildElements();
            if (elems.size() > 0) {
    //            CMLUtil.debug((Element)chemical);
    //            LOG.debug(((CMLMolecule)elems.get(0)).getTitle());
    //            error("unexpected chemical child: "+elems.get(0).getLocalName());
                error("unexpected chemical child: "+elems.get(0).getLocalName());
            }
            String value = chemical.getValue();
            CMLMolecule mol = new CMLMolecule();
            mol.setTitle(value);
            mol.setRole("chemical");
            ParentNode parent = chemical.getParent();
            parent.replaceChild(chemical, mol);
    //        LOG.debug(">>>chemical>>>"+value);
        }
    }
    
    // ".//CMLMolecule.NS+"[@role='chemical']/following-sibling::CMLMolecule.NS+"[position()=1 and@ref]", CMLConstants.CML_XPATH);
    private void chemicalXref(Element elem) {
        Nodes molecules = elem.query(
                ".//"+CMLMolecule.NS+"[@role='chemical']/following-sibling::"+CMLMolecule.NS+"[position()=1 and @ref]", CMLConstants.CML_XPATH);
        for (int i = 0; i < molecules.size(); i++) {
            Element molecule = (CMLMolecule) molecules.get(i);
            CMLMolecule previous = (CMLMolecule) molecule.query(
                    "./preceding-sibling::"+CMLMolecule.NS+"[position()=1 and @role='chemical']", CMLConstants.CML_XPATH).get(0);
            CMLName name = new CMLName();
            name.setXMLContent(previous.getTitle());
            previous.detach();
            molecule.insertChild(name, 0);
        }
    }

    // molecule followed by REF
    private void moleculeREF(Element elem) {
        Nodes nodes = elem.query(".//"+CMLMolecule.NS+"[following-sibling::*[position()=1" +
                " and self::REF]]", CMLConstants.CML_XPATH);
        for (int i = 0; i < nodes.size(); i++) {
            CMLMolecule molecule = (CMLMolecule) nodes.get(i);
            Element ref = (Element) molecule.query("./following-sibling::REF").get(0);
            @SuppressWarnings("unused")
            String type = ref.getAttributeValue("TYPE");
            String id = ref.getAttributeValue("ID");
            String value = ref.getValue();
            CMLLink link = new CMLLink();
            link.setTitle(value);
            link.setTo(id);
            molecule.appendChild(link);
            ref.detach();
        }
    }
    
//    saturated 
    private void processProperty(Element elem) {
        Nodes propertys = elem.query(".//property");
        for (int i = 0; i < propertys.size(); i++) {
            Element prop = (Element) propertys.get(i);
            String type = prop.getAttributeValue("type");
            if (type == null) {
                error("null type");
            }
            if (false) {
            } else if (type.equals("hrms")) {
                hrms(prop);
            } else if (type.equals("nature")) {
                nature(prop);
            } else if (type.equals("state")) {
                state(prop);
            } else if (type.equals("quantity")) {
                quantity(prop);
            } else if (type.equals("yield")) {
                yield(prop);
            } else if (type.equals("elemAnal")) {
                elementalAnalysis(prop);
            } else if (type.equals("mp")) {
                meltingPoint(prop);
            } else if (type.equals("rf")) {
                rf(prop);
            } else if (type.equals("bp")) {
                boilingPoint(prop);
            } else if (type.equals("refractiveindex")) {
                refractiveIndex(prop);
            } else if (type.equals("optRot")) {
                optRot(prop);
            } else {
                error("Type not found "+type);
            }
            prop.detach();
        }
    }

//  mark up text that OSCAR has missed
    private void markText(Element elem) {
        Nodes texts = elem.query(".//text()");
        boolean change = true;
        while (change) {
            change = false;
            for (int i = 0; i < texts.size(); i++) {
                if (markText((Text)texts.get(i))) {
                    change = true;
                }
            }
        }
    }
    
    private boolean markText(Text text) {
        boolean change = false;
        @SuppressWarnings("unused")
        String s = text.getValue();
        
        return change;
    }

//    
// -
// - // (2E,4E,6E,8E,10E,12E)-12-Hydroxy-14-[(1R,4S)-4-hydroxy-1,2,2-trimethylcyclopentyl]-2,7,11-trimethyl-14-oxotetradeca-2,4,6,8,10,12-hexaenal // //
//

... private void markSynthesizedCompounds() { Nodes divs = experimentalM.query(".//DIV[HEADER[count("+CMLMolecule.NS+") = 1" + "and count(text()[string-length(normalize-space()) > 0]) = 0]]", CMLConstants.CML_XPATH); for (int i = 0; i < divs.size(); i++) { Element div = (Element) divs.get(i); Element header = (Element) div.getFirstChildElement("HEADER"); CMLMolecule molecule = (CMLMolecule) header.getFirstChildElement("molecule", CMLConstants.CML_NS); CMLReactionScheme reactionScheme = createReactionScheme(div); addProduct(reactionScheme, molecule); } divs = experimentalM.query(".//DIV[HEADER[count("+CMLMolecule.NS+") = 2]]", CMLConstants.CML_XPATH); for (int i = 0; i < divs.size(); i++) { Element div = (Element) divs.get(i); Element header = (Element) div.getFirstChildElement("HEADER"); Nodes molecules = header.query(CMLMolecule.NS, CMLConstants.CML_XPATH); int idx0 = header.indexOf(molecules.get(0)); int idx1 = header.indexOf(molecules.get(1)); // LOG.debug("=============="); if (idx1 == idx0 + 1) { error("Cannot interpret header "+header.getValue()); } else if (idx1 == idx0 + 2) { Node node = header.getChild(idx0+1); String s = node.getValue().trim(); if ("and".equals(s)) { CMLReactionScheme reactionScheme = createReactionScheme(div); addProduct(reactionScheme, (CMLMolecule) molecules.get(0)); addProduct(reactionScheme, (CMLMolecule) molecules.get(1)); } else { // LOG.debug("NODE "+s); } } else { for (int j = idx0 + 1; j < idx1; j++) { @SuppressWarnings("unused") Node node = header.getChild(j); // LOG.debug("NODE.. "+node.getValue()); } } } } private CMLReactionScheme createReactionScheme(Element div) { CMLReactionScheme reactionScheme = new CMLReactionScheme(); CMLReaction reaction = new CMLReaction(); reaction.setRole("overallReaction"); reactionScheme.addReaction(reaction); CMLUtil.transferChildren(div, reactionScheme); div.getParent().replaceChild(div, reactionScheme); return reactionScheme; } private void addProduct(CMLReactionScheme reactionScheme, CMLMolecule molecule) { CMLReaction reaction = reactionScheme.getReactionElements().get(0); CMLProductList productList = reaction.getProductList(); if (productList == null) { productList = new CMLProductList(); reaction.addProductList(productList); } CMLProduct product = new CMLProduct(); productList.addProduct(product); molecule.detach(); product.addMolecule(molecule); } // (Found: M // + // , //- //- // 340.2416 // // // . //- // C // 19 // H // 36 // O // 3 // Si // // requires // M // , //- //- // 340.2435 // // // ) private static void hrms(Element prop) { // LOG.debug(">>>hrms"); Elements quantities = prop.getChildElements("quantity"); CMLScalar found = null; CMLFormula formula = null; CMLScalar required = null; List ionList = new ArrayList(); for (int i = 0; i < quantities.size(); i++) { Element quantity = quantities.get(i); int idx = prop.indexOf(quantity); String type = quantity.getAttributeValue("type"); if (type == null) { error("null type"); } else if (type.equals("formula")) { // C19H36O3Si String val = quantity.getValue(); try { formula = CMLFormula.createFormula(val); } catch (RuntimeException e) { error("Bad formula: "+e.getMessage()); } if (formula == null) { error("null formula"); } else { quantity.detach(); prop.insertChild(formula, idx); } } else if (type.equals("found")) { // 340.2416 found = getScalar(quantity, "found"); quantity.detach(); } else if (type.equals("required")) { required = getScalar(quantity, "required"); quantity.detach(); } else if (type.equals("ion")) { // [Found: ( // M ? Ac // ) + , 497.3463. //... // requires M Ac, 497.3485] CMLScalar ion = new CMLScalar(); ion.setValue(quantity.getValue()); ionList.add(ion); if (ionList.size() == 1) { ion.setDictRef(OSCAR_NSP+S_COLON+"ionFound"); } else if (ionList.size() == 2) { ion.setDictRef(OSCAR_NSP+S_COLON+"ionRequired"); } else { error("Too many ion children"); } quantity.detach(); } else { LOG.debug("????????"+type); } } unmark(prop, "SB"); unmark(prop, "SP"); unmark(prop, "IT"); if (formula == null) { // CMLUtil.debug((Element)prop.getParent()); // error("no 'formula' field in hrms "); return; } if (ionList.size() == 0) { if (found == null) { error("no 'found' field in hrms "); return; } if (required == null) { Nodes texts = prop.query("./text()"); // might have something like: ', 252.0569, found' if (texts.size() > 0) { String value = texts.get(texts.size()-1).getValue(); value = value.replaceAll("[)(:, ;]", S_EMPTY); value = value.replaceAll("(\\.)?(F|f)ound", S_EMPTY); try { double d = new Double(value).doubleValue(); required = new CMLScalar(d); required.setDictRef(OSCAR_NSP+S_COLON+"required"); // LOG.debug("req "+d); } catch (NumberFormatException e) { // CMLUtil.debug(prop); LOG.debug("Couldn't interpret as hrms required value: "+value); } } if (required == null) { error(" no 'required' field in hrms "); return; } } } else if (ionList.size() == 2) { found = ionList.get(0); required = ionList.get(1); } else { // CMLUtil.debug(prop); // error("Wrong number of ions: "+ionList.size()); return; } formula.appendChild(found); formula.appendChild(required); int idx = prop.indexOf(formula); // tidy text nodes String ss = S_EMPTY; for (int i = 0; i < idx; i++) { Node node = prop.getChild(i); if (node instanceof Text) { ss += node.getValue(); } } CMLScalar foundSc = new CMLScalar(); foundSc.setDictRef(OSCAR_NSP+S_COLON+"foundString"); foundSc.setValue(ss); formula.appendChild(foundSc); ss = S_EMPTY; for (int i = idx; i < prop.getChildCount(); i++) { Node node = prop.getChild(i); if (node instanceof Text) { ss += node.getValue(); } } CMLScalar requiredSc = new CMLScalar(); requiredSc.setDictRef(OSCAR_NSP+S_COLON+"requiredString"); requiredSc.setValue(ss); formula.appendChild(requiredSc); int nc = prop.getChildCount(); for (int i = nc-1; i >= 0; i--) { if (i != idx) { Node node = prop.getChild(i); if (node instanceof Text) { node.detach(); } } } } private static CMLScalar getScalar(Element quantity, String name) { CMLScalar scalar = new CMLScalar(); scalar.setDictRef(OSCAR_NSP+S_COLON+name); double d = getValuePoint(quantity, name); scalar.setValue(d); return scalar; } // // - // - // (71 // // mg // // , // - // - // 36 // // // %) // private static void yield(Element div) { CMLProperty yield = new CMLProperty(); yield.setDictRef(OSCAR_NSP+S_COLON+"yield"); Elements quantities = div.getChildElements("quantity"); CMLScalar amount = null; CMLScalar mass = null; CMLScalar percent = null; for (int i = 0; i < quantities.size(); i++) { Element quantity = quantities.get(i); String type = quantity.getAttributeValue("type"); if (type == null) { error("Must give type on yield"); } else if (type.equals("mass")) { mass = new CMLScalar(); double d = getValuePoint(quantity, "mass"); mass.setValue(d); mass.setDictRef(OSCAR_NSP+S_COLON+"mass"); String units = getUnits(quantity, "mass"); mass.setUnits(OSCAR_NSPUNIT+S_COLON+units); yield.appendChild(mass); } else if (type.equals("percent")) { percent = new CMLScalar(); double d = getValuePoint(quantity, "percent"); percent.setValue(d); percent.setDictRef(OSCAR_NSP+S_COLON+"percent"); String units = "percent"; percent.setUnits(OSCAR_NSPUNIT+S_COLON+units); yield.appendChild(percent); } else if (type.equals("amount")) { amount = new CMLScalar(); double d = getValuePoint(quantity, "amount"); amount.setValue(d); amount.setDictRef(OSCAR_NSP+S_COLON+"amount"); String units = getUnits(quantity, "amount"); amount.setUnits(OSCAR_NSPUNIT+S_COLON+units); yield.appendChild(amount); } else if (type.equals("quantity")) { error("yield/amount not yet implemented"); } else { error("Unknown type on yield: "+type); } } if (mass == null) { // maybe the div/preceding-sibling is mass Nodes nodes = div.query("./preceding-sibling::*"); if (nodes.size() > 0 && nodes.get(0) instanceof Element) { Element div0 = (Element) nodes.get(0); if (div0 instanceof CMLMolecule) { // CMLMolecule molecule = (CMLMolecule) div0; } else if (div0.getLocalName().equals("property")) { Element quantity = div0.getFirstChildElement("quantity"); if (quantity != null) { mass = getScalar(quantity, "mass"); } } } // this may not be an error after all... if (mass == null) { // try { // CMLUtil.debug(div, System.err); // } catch (Exception e) {} // CMLUtil.debug(div); // error("no mass given in yield"); } } if (percent == null) { error("*no percent given in yield"); } } private static void elementalAnalysis(Element div) { // error("No element analysis"); } private static void meltingPoint(Element div) { // error("No melting point"); } private static void boilingPoint(Element div) { // error("No boiling point"); } private static void refractiveIndex(Element div) { // error("No refractive index"); } private static void rf(Element div) { // error("No rf"); } private static void optRot(Element div) { // error("No optRot"); } static double getValuePoint(Element elem, String ss) { double d = Double.NaN; Nodes points = elem.query("value/point"); Nodes texts = elem.query("value/text()"); // some constructs 3 ? 100 ... if (points.size() == 2 && texts.size() == 1 && texts.get(0).getValue().trim().equals("?")) { try { d = new Double(points.get(1).getValue()).doubleValue(); } catch (NumberFormatException nfe) { error("Bad double "+nfe+" for "+ss); } elem.addAttribute(new Attribute("count", points.get(0).getValue())); } else if (points.size() == 1){ String s = ((Element) points.get(0)).getValue(); // sometimes starts with ( if (s.startsWith(S_LBRAK)) { s = s.substring(1); } try { d = new Double(s).doubleValue(); } catch (NumberFormatException nfe) { error("Bad double "+nfe+" for "+ss); } } else if (elem.getParent() == null) { LOG.debug("null parent"); } else { CMLUtil.debug((Element)elem.getParent(), "OSCARTOOL2"); error("Bad value/point for "+ss+S_SLASH+elem.getParent().getValue()); } if (Double.isNaN(d)) { throw new RuntimeException("Unexpected NaN"); } return d; } private static String getUnits(Element elem, String ss) { Nodes units = elem.query("units"); if (units.size() != 1) { try { // CMLUtil.debug(elem, System.err); } catch (Exception e) {} error("Bad units for "+ss); } String s = null; if (!ss.equals(elem.getAttributeValue("type"))) { LOG.debug("no quantity of type found: "+ss); } else { s = ((Element) units.get(0)).getValue(); // trim bracket if (s.startsWith(S_LBRAK) || s.startsWith(S_MINUS)) { s = s.substring(1); } } return s; } // // colorless // oil // private static void nature(Element div) { // LOG.debug(">>>nature"); CMLProperty nature = new CMLProperty(); nature.setDictRef(OSCAR_NSP+S_COLON+"nature"); Elements quantities = div.getChildElements("quantity"); for (int i = 0; i < quantities.size(); i++) { Element quantity = quantities.get(i); CMLScalar scalar = new CMLScalar(); String type = quantity.getAttributeValue("type"); String value = quantity.getValue(); if (type == null) { error("Must give type on nature"); } else if (type.equals("colour")) { scalar.setDictRef(OSCAR_NSP+S_COLON+type); scalar.setValue(value); } else if (type.equals("nonsolidstate")) { scalar.setDictRef(OSCAR_NSP+S_COLON+type); scalar.setValue(value); } else if (type.equals("solidstate")) { scalar.setDictRef(OSCAR_NSP+S_COLON+type); scalar.setValue(value); } else if (type.equals("statemodifier")) { scalar.setDictRef(OSCAR_NSP+S_COLON+type); scalar.setValue(value); } else { error("Unknown type on nature: "+type); } nature.appendChild(scalar); } ParentNode parent = div.getParent(); parent.replaceChild(div, nature); } private void state(Element prop) { CMLProperty property = null; if (prop.getChildElements().size() == 0) { String val = prop.getValue().toLowerCase(); for (Pattern pattern : statePropertyMap.keySet()) { if (pattern.matcher(val).matches()) { String propertyS = statePropertyMap.get(pattern); if (propertyS == null) { error("Unknown property: "+val); } else { property = new CMLProperty(); property.setState(propertyS); property.appendChild(new Text(val)); } break; } } if (property == null) { LOG.debug("Unknown property: "+prop.getValue()); } } else { // LOG.debug("TYPE "+type); // property.setDictRef(OSCAR_NSP+S_COLON+propertyS); } if (property != null) { ParentNode parent = prop.getParent(); int idx = parent.indexOf(prop); parent.insertChild(property, idx); } else { // CMLUtil.debug(prop); // LOG.debug("Cannot add state: "); } } // // - // - // 11.9 // // g // // , // - // - // 0.04 // // mol // // static void quantity(Element property) { // LOG.debug(">>>quantity>>>"); CMLProperty cmlProperty = new CMLProperty(); cmlProperty.setDictRef(OSCAR_NSP+S_COLON+"quantity"); Elements quantities = property.getChildElements("quantity"); CMLScalar temperature = null; int nQuant = quantities.size(); for (int i = 0; i < nQuant; i++) { String units = null; Element oldQuantity = quantities.get(i); String oldValue = oldQuantity.getValue(); String type = oldQuantity.getAttributeValue("type"); Element unit = oldQuantity.getFirstChildElement("units"); if ("temperature".equals(type)) { unit = parseNonNumericTemperatures(type, unit, oldQuantity); } if (unit == null) { Node node = CMLUtil.getLastTextDescendant(oldQuantity); unit = makeUnitFrom(node); if (unit != null) { // LOG.debug("Interpreted text as unit: "+unit.getValue()); node.getParent().replaceChild(node, unit); } else { LOG.debug("No units given: "+property.getValue()+" :: "); } } if (unit != null) { units = unit.getValue(); if (type == null) { error("Must give type on quantity"); } else if ( type.equals("amount") || type.equals("conc") || type.equals("equiv") || type.equals("integral") || type.equals("mass") || type.equals("percent") || type.equals("time") || type.equals("volume") || false) { try { createScalarAndAppendAsChildOfQuantity(cmlProperty, oldQuantity, type, units); } catch (RuntimeException e) { error("unpexcted NaN: "+oldValue); } } else if (type.equals("temperature")) { temperature = new CMLScalar(); temperature.setDictRef(OSCAR_NSP+S_COLON+"temperature"); if ("rt".equalsIgnoreCase(oldQuantity.getValue()) || "ambient temperature".equals(oldQuantity.getValue()) || "room temperature".equals(oldQuantity.getValue())) { units = "k"; temperature.setUnits(OSCAR_NSPUNIT+S_COLON+units); temperature.setValue(298.15); } else { double d = getValuePoint(oldQuantity, "temperature"); temperature.setValue(d); temperature.setDictRef(OSCAR_NSP+S_COLON+"temperature"); } temperature.setUnits(OSCAR_NSPUNIT+S_COLON+units); cmlProperty.appendChild(temperature); } else { error("Unknown type on quantity: "+type); } } } ParentNode parent = property.getParent(); parent.replaceChild(property,cmlProperty); } private static Element parseNonNumericTemperatures( String type, Element unit, Element quant) { Node node = CMLUtil.getFirstTextDescendant(quant); if (node == null) { LOG.debug("no text in quantity:"); CMLUtil.debug(quant, "OSCAR4"); } else { String s = node.getValue(); for (Pattern temperaturePattern : temperatureMap.keySet()) { Matcher matcher = temperaturePattern.matcher(s); if (matcher.matches()) { node.detach(); Element value = new Element("value"); Element point = new Element("point"); value.appendChild(point); point.appendChild(new Text(temperatureMap.get(temperaturePattern))); unit = new Element("units"); unit.appendChild(new Text("K")); quant.appendChild(value); quant.appendChild(unit); break; } } } return unit; } /** create new CMLScalar and append as child to quantity. * * @param quantity * @param name * @param units */ static void createScalarAndAppendAsChildOfQuantity(CMLProperty property, Element quantity, String name, String units) { CMLScalar scalar = new CMLScalar(); double d = getValuePoint(quantity, name); if (Double.isNaN(d)) { throw new RuntimeException("Bad quantity: "); } scalar.setValue(d); scalar.setDictRef(OSCAR_NSP+S_COLON+name); scalar.setUnits(OSCAR_NSPUNIT+S_COLON+units); // quantity.appendChild(scalar); property.appendChild(scalar); } private static Element makeUnitFrom(Node node) { Element unit = null; if (node == null) { LOG.debug("Null units text"); } else if (node instanceof Text) { String s = node.getValue().trim(); if (s.equals(S_EMPTY)) { // CMLUtil.debug((Element) node.getParent()); LOG.debug("No trailing units given"); } else { String type = unitsMap.get(s); if (type == null) { LOG.debug("Cannot interpret text as unit:"+s+S_COLON); } else { unit = new Element("units"); unit.appendChild(new Text(s)); } } } else { CMLUtil.debug((Element) node, "OSCAR5"); } return unit; } private static void error(String s) { LOG.debug("***ERROR**>>>"+s); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy