// com.ibm.spss.hive.serde2.xml.processor.java.JavaXmlProcessor

/**
 * (c) Copyright IBM Corp. 2013. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License").
 * You may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package com.ibm.spss.hive.serde2.xml.processor.java;

import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import com.ibm.spss.hive.serde2.xml.processor.XmlMapEntry;
import com.ibm.spss.hive.serde2.xml.processor.XmlMapFacet;
import com.ibm.spss.hive.serde2.xml.processor.XmlProcessor;
import com.ibm.spss.hive.serde2.xml.processor.XmlProcessorContext;
import com.ibm.spss.hive.serde2.xml.processor.XmlQuery;
import com.ibm.spss.hive.serde2.xml.processor.XmlUtils;

/**
 * The XML processor implementation based on {@link javax.xml.xpath.XPath}.
 * <p>
 * {@link #initialize(XmlProcessorContext)} compiles the configured queries once; {@link #parse(String)} then
 * evaluates every compiled query against a single XML record and returns the matching nodes keyed by query
 * name. The remaining methods convert those nodes into the field, primitive, map and list values requested
 * by the caller.
 * <p>
 * NOTE(review): each instance keeps a single {@link DocumentBuilder}, which JAXP does not promise to be
 * thread-safe; presumably one processor instance is used by one thread at a time - confirm against callers.
 */
public class JavaXmlProcessor implements XmlProcessor {

    /** Standard SAX features switched off so that a record cannot pull in external entities (XXE). */
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities"
    };

    private static final TransformerFactory TRANSFORMER_FACTORY = TransformerFactory.newInstance();
    protected static DocumentBuilderFactory DOCUMENT_BUILDER_FACTORY = null;
    private static final XPathFactory XPATH_FACTORY;

    static {
        DOCUMENT_BUILDER_FACTORY = DocumentBuilderFactory.newInstance();
        DOCUMENT_BUILDER_FACTORY.setNamespaceAware(true);
        DOCUMENT_BUILDER_FACTORY.setIgnoringComments(true);
        // Theoretically we could use setIgnoringElementContentWhitespace(true)
        // but that would require a validating parser and the schema which we do not always have.
        // As a workaround we'll use custom solution to trim the whitespace from the text nodes
        // and drop them if all the text is just whitespace.
        // See also http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6545684
        for (String feature : EXTERNAL_ENTITY_FEATURES) {
            try {
                DOCUMENT_BUILDER_FACTORY.setFeature(feature, false);
            } catch (ParserConfigurationException ignored) {
                // The underlying parser does not recognise this feature; keep its defaults
                // rather than failing class initialisation.
            }
        }
        XPATH_FACTORY = XPathFactory.newInstance();
    }

    private DocumentBuilder builder = null;
    private List<JavaXmlQuery> queries = new ArrayList<JavaXmlQuery>();
    private Map<String, XmlMapEntry> mapSpecification = null;

    /**
     * Creates the document builder, compiles every query of the context and stores the map specification.
     * The new state is committed only after all queries compiled, so a failed or repeated call never leaves
     * partially built or duplicated queries behind.
     *
     * @param xmlProcessorContext
     *            the context supplying the queries and the map specification
     * @throws RuntimeException
     *             wrapping any failure raised while creating the builder or compiling a query
     * @see com.ibm.spss.hive.serde2.xml.processor.XmlProcessor#initialize(com.ibm.spss.hive.serde2.xml.processor.XmlProcessorContext)
     */
    @Override
    public void initialize(XmlProcessorContext xmlProcessorContext) {
        try {
            DocumentBuilder documentBuilder = DOCUMENT_BUILDER_FACTORY.newDocumentBuilder();
            XPath xpath = XPATH_FACTORY.newXPath();
            List<JavaXmlQuery> compiledQueries = new ArrayList<JavaXmlQuery>();
            for (XmlQuery xmlQuery : xmlProcessorContext.getXmlQueries()) {
                compiledQueries.add(new JavaXmlQuery(xmlQuery).compile(xpath));
            }
            Map<String, XmlMapEntry> specification = xmlProcessorContext.getXmlMapSpecification();
            this.builder = documentBuilder;
            this.queries = compiledQueries;
            this.mapSpecification = specification;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Parses one XML record and evaluates every compiled query against it.
     * <p>
     * Matched text nodes are trimmed and dropped when blank; every other matched node has the whitespace of
     * its subtree trimmed before it is collected. A query without an expression yields an empty array.
     * The return type is left raw because the interface declaration is not visible from this file; the
     * returned map is keyed by query name (String) with {@code NodeArray} values.
     *
     * @param value
     *            the XML text of the record
     * @return a map from query name to the array of nodes the query selected
     * @throws IllegalStateException
     *             if the processor has not been initialized
     * @throws RuntimeException
     *             wrapping any parsing or XPath evaluation failure
     * @see com.ibm.spss.hive.serde2.xml.processor.XmlProcessor#parse(java.lang.String)
     */
    @SuppressWarnings("rawtypes")
    @Override
    public Map parse(String value) {
        if (this.builder == null) {
            throw new IllegalStateException("initialize() must be called before parse()");
        }
        Map<String, NodeArray> result = new HashMap<String, NodeArray>();
        try {
            Document document = this.builder.parse(new InputSource(new StringReader(value)));
            for (JavaXmlQuery query : this.queries) {
                String name = query.getName();
                NodeArray nodeArray = new NodeArray().withName(name);
                XPathExpression expression = query.getExpression();
                if (expression != null) {
                    NodeList nodeList = (NodeList) expression.evaluate(document, XPathConstants.NODESET);
                    for (int nodeIndex = 0; nodeIndex < nodeList.getLength(); ++nodeIndex) {
                        Node node = nodeList.item(nodeIndex);
                        if (node.getNodeType() == Node.TEXT_NODE) {
                            // Blank text nodes carry no data and are left out of the result
                            if (trimTextNode(node) != null) {
                                nodeArray.add(node);
                            }
                        } else {
                            trimWhitespace(node);
                            nodeArray.add(node);
                        }
                    }
                }
                result.put(name, nodeArray);
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        return result;
    }

    /**
     * Looks up the value of a field inside a parsed record, a single node or a node array.
     * <ul>
     * <li>For a map (the result of {@link #parse(String)}) the node array stored under the field name is
     * returned, or null when it is absent or empty.</li>
     * <li>For a node the matching element content or attribute is returned.</li>
     * <li>For a node array the matches of all its nodes are flattened into one new array, or null when
     * nothing matched.</li>
     * </ul>
     *
     * @param o
     *            the object to look into
     * @param fieldName
     *            the field name
     * @return the field value or null if there is none
     * @see com.ibm.spss.hive.serde2.xml.processor.XmlProcessor
     */
    @SuppressWarnings("rawtypes")
    @Override
    public Object getObjectValue(Object o, String fieldName) {
        if (o instanceof Map) {
            NodeArray nodeArray = (NodeArray) ((Map) o).get(fieldName);
            // A field that no query produced is reported the same way as an empty one
            return nodeArray == null || nodeArray.isEmpty() ? null : nodeArray;
        } else if (o instanceof Node) {
            return getObjectValue((Node) o, fieldName);
        } else if (o instanceof NodeArray) {
            List<Node> nodes = new ArrayList<Node>();
            for (Node node : (NodeArray) o) {
                Object value = getObjectValue(node, fieldName);
                if (value instanceof Node) {
                    nodes.add((Node) value);
                } else if (value instanceof NodeArray) {
                    nodes.addAll((NodeArray) value);
                }
            }
            return nodes.isEmpty() ? null : new NodeArray(nodes);
        }
        return null;
    }

    /**
     * Returns the object value for the given field name and node. An attribute node is returned itself when
     * its name matches; an element yields its child nodes when its own name matches, otherwise its first
     * attribute with a matching name.
     *
     * @param node
     *            the node
     * @param fieldName
     *            the field name
     * @return the object value for the given field name and node or null if nothing matches
     */
    private Object getObjectValue(Node node, String fieldName) {
        if (node == null) {
            return null;
        }
        switch (node.getNodeType()) {
            case Node.ATTRIBUTE_NODE:
                return matchesFieldName(node, fieldName) ? node : null;
            case Node.ELEMENT_NODE: {
                if (matchesFieldName(node, fieldName)) {
                    return new NodeArray(node.getChildNodes());
                }
                NamedNodeMap attributes = node.getAttributes();
                for (int attributeIndex = 0; attributeIndex < attributes.getLength(); ++attributeIndex) {
                    Node attribute = attributes.item(attributeIndex);
                    if (matchesFieldName(attribute, fieldName)) {
                        return attribute;
                    }
                }
                return null;
            }
            default:
                return null;
        }
    }

    /**
     * Tells whether the local name of the node equals the field name ignoring the case.
     *
     * @param node
     *            the node
     * @param fieldName
     *            the field name
     * @return true if the names match; false otherwise, including when the node has no local name
     */
    private static boolean matchesFieldName(Node node, String fieldName) {
        // we have to take into account the fact that fieldName will be in the lower case;
        // getLocalName() may return null, which must not blow up the comparison
        String name = node.getLocalName();
        return name != null && name.equalsIgnoreCase(fieldName);
    }

    /**
     * Returns the string value for the object
     *
     * @param o
     *            the object: a string, a node or a node array
     * @return the string value for the object or null for an empty array or an unsupported type
     */
    private String getStringValue(Object o) {
        if (o instanceof String) {
            return (String) o;
        } else if (o instanceof NodeArray) {
            NodeArray array = (NodeArray) o;
            switch (array.size()) {
                case 0:
                    return null;
                case 1:
                    // A single node is rendered as is, without any wrapper element
                    return getStringValue(array.get(0));
                default:
                    return getStringValue(array);
            }
        } else if (o instanceof Node) {
            return getStringValue((Node) o);
        }
        return null;
    }

    /**
     * Returns the string value for the node: the node value for attributes and text, the serialized XML
     * (without the XML declaration) for anything else.
     *
     * @param node
     *            the node
     * @return the string value for the node or null if the node could not be serialized
     */
    private String getStringValue(Node node) {
        switch (node.getNodeType()) {
            case Node.ATTRIBUTE_NODE:
            case Node.TEXT_NODE:
                return node.getNodeValue();
            default: {
                try {
                    Transformer transformer = TRANSFORMER_FACTORY.newTransformer();
                    StringWriter buffer = new StringWriter();
                    transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
                    transformer.transform(new DOMSource(node), new StreamResult(buffer));
                    return buffer.toString();
                } catch (Exception ignored) {
                    // Deliberately best effort: a node that cannot be serialized becomes a null value
                    // instead of failing the whole record.
                    return null;
                }
            }
        }
    }

    /**
     * Returns the string value for the node array: the concatenated string values of its nodes wrapped in a
     * {@code <string>} element.
     *
     * @param nodes
     *            the node array
     * @return the string value for the node array
     */
    private String getStringValue(NodeArray nodes) {
        StringBuilder stringBuilder = new StringBuilder();
        // If all we have is just a bunch of nodes and the user wants a string
        // we'll use a parent element called <string> to have a valid XML document
        // NOTE(review): the wrapper tag was empty in the reviewed text; confirm <string> is the intended name
        stringBuilder.append("<string>");
        for (Node node : nodes) {
            String text = getStringValue(node);
            // A node that failed to serialize must not leak the literal text "null" into the document
            if (text != null) {
                stringBuilder.append(text);
            }
        }
        stringBuilder.append("</string>");
        return stringBuilder.toString();
    }

    /**
     * Converts the object to its string value and then to the requested primitive category.
     *
     * @param o
     *            the object: a string, a node or a node array
     * @param primitiveCategory
     *            the primitive category to convert to
     * @return the value produced by {@link XmlUtils#getPrimitiveValue} for the string value of the object
     * @see com.ibm.spss.hive.serde2.xml.processor.XmlProcessor
     */
    @Override
    public Object getPrimitiveObjectValue(Object o, PrimitiveCategory primitiveCategory) {
        return XmlUtils.getPrimitiveValue(getStringValue(o), primitiveCategory);
    }

    /**
     * The given node returns a map entry or null.
     * <p>
     * An attribute maps its name to its value. An element without a map specification maps its name to its
     * child nodes; with a specification the key and the value are taken from wherever the respective facet
     * points (element name, content or a named attribute).
     *
     * @param node
     *            the node
     * @return a read-only map entry for the given node or null if no key could be determined
     */
    private Map.Entry<Object, Object> getMapEntry(Node node) {
        String elementName = node.getLocalName();
        String keyName = elementName;
        Object value = null;
        switch (node.getNodeType()) {
            case Node.ATTRIBUTE_NODE:
                value = node.getNodeValue();
                break;
            case Node.ELEMENT_NODE: {
                XmlMapEntry xmlMapEntry = this.mapSpecification == null ? null : this.mapSpecification.get(elementName);
                if (xmlMapEntry == null) {
                    value = new NodeArray(node.getChildNodes());
                } else {
                    // The value is resolved first so that it still sees the element name as the key
                    value = resolveMapValue(node, xmlMapEntry.getValue(), elementName);
                    keyName = resolveMapKey(node, xmlMapEntry.getKey(), elementName);
                }
                break;
            }
            default:
                break;
        }
        if (keyName == null) {
            return null;
        }
        final Object entryKey = keyName;
        final Object entryValue = value;
        return new Map.Entry<Object, Object>() {

            @Override
            public Object getKey() {
                return entryKey;
            }

            @Override
            public Object getValue() {
                return entryValue;
            }

            @Override
            public Object setValue(Object object) {
                throw new UnsupportedOperationException("The map entry is read-only");
            }

        };
    }

    /**
     * Resolves the map value of an element according to the value facet.
     *
     * @param node
     *            the element
     * @param valueFacet
     *            the facet describing where the value comes from
     * @param elementName
     *            the local name of the element
     * @return the element name, the child nodes (null if there are none) or the attribute value (null if the
     *         attribute is missing)
     */
    private Object resolveMapValue(Node node, XmlMapFacet valueFacet, String elementName) {
        switch (valueFacet.getType()) {
            case ELEMENT:
                return elementName;
            case CONTENT: {
                NodeList children = node.getChildNodes();
                return children.getLength() > 0 ? new NodeArray(children) : null;
            }
            case ATTRIBUTE: {
                Node attribute = node.getAttributes().getNamedItem(valueFacet.getName());
                return attribute != null ? attribute.getNodeValue() : null;
            }
            default:
                throw new IllegalStateException("Unsupported map value facet type: " + valueFacet.getType());
        }
    }

    /**
     * Resolves the map key of an element according to the key facet.
     *
     * @param node
     *            the element
     * @param keyFacet
     *            the facet describing where the key comes from
     * @param elementName
     *            the local name of the element
     * @return the element name, the string value of the first child or the attribute value; null if the
     *         element has no children or the attribute is missing
     */
    private String resolveMapKey(Node node, XmlMapFacet keyFacet, String elementName) {
        switch (keyFacet.getType()) {
            case ELEMENT:
                return elementName;
            case CONTENT: {
                NodeList children = node.getChildNodes();
                return children.getLength() > 0 ? getStringValue(children.item(0)) : null;
            }
            case ATTRIBUTE: {
                Node attribute = node.getAttributes().getNamedItem(keyFacet.getName());
                return attribute != null ? attribute.getNodeValue() : null;
            }
            default:
                throw new IllegalStateException("Unsupported map key facet type: " + keyFacet.getType());
        }
    }

    /**
     * Given the node populates the map. Nodes for which no entry can be built are skipped.
     *
     * @param map
     *            the map
     * @param node
     *            the node
     */
    private void populateMap(Map<Object, Object> map, Node node) {
        Map.Entry<Object, Object> entry = getMapEntry(node);
        if (entry != null) {
            map.put(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Recursively trims the text nodes below the given node, removing the ones that are blank as well as
     * any comment nodes.
     *
     * @param node
     *            the node whose subtree is cleaned up in place
     */
    protected void trimWhitespace(Node node) {
        // getChildNodes() is a live list: removing a child while walking it by index would skip the
        // following sibling, so every removal is postponed until the walk is over.
        List<Node> doomedChildren = new ArrayList<Node>();
        NodeList children = node.getChildNodes();
        for (int childIndex = 0; childIndex < children.getLength(); ++childIndex) {
            Node child = children.item(childIndex);
            switch (child.getNodeType()) {
                case Node.ELEMENT_NODE:
                    trimWhitespace(child);
                    break;
                case Node.TEXT_NODE: {
                    String trimmedValue = child.getNodeValue().trim();
                    if (trimmedValue.length() == 0) {
                        doomedChildren.add(child);
                    } else {
                        child.setNodeValue(trimmedValue);
                    }
                    break;
                }
                case Node.COMMENT_NODE:
                    doomedChildren.add(child);
                    break;
                default:
                    break;
            }
        }
        for (Node doomed : doomedChildren) {
            node.removeChild(doomed);
        }
    }

    /**
     * Trims the value of the text node in place.
     *
     * @param node
     *            the text node
     * @return the same node with its value trimmed or null if the value is blank
     */
    private Node trimTextNode(Node node) {
        String trimmedValue = node.getNodeValue().trim();
        if (trimmedValue.length() == 0) {
            return null;
        }
        node.setNodeValue(trimmedValue);
        return node;
    }

    /**
     * Builds a map out of a node or a node array, one entry per node for which a key can be determined.
     *
     * @param o
     *            a node, a node array or null
     * @return the map (empty for unsupported object types) or null if the object is null
     * @see com.ibm.spss.hive.serde2.xml.processor.XmlProcessor#getMap(java.lang.Object)
     */
    @SuppressWarnings("rawtypes")
    @Override
    public Map getMap(Object o) {
        if (o == null) {
            return null;
        }
        Map<Object, Object> map = new HashMap<Object, Object>();
        if (o instanceof Node) {
            populateMap(map, (Node) o);
        } else if (o instanceof NodeArray) {
            for (Node node : (NodeArray) o) {
                populateMap(map, node);
            }
        }
        return map;
    }

    /**
     * Returns the object as a list when it is a node array.
     *
     * @param o
     *            the object
     * @return the node array itself or null for null and any other type
     * @see com.ibm.spss.hive.serde2.xml.processor.XmlProcessor#getList(java.lang.Object)
     */
    @SuppressWarnings("rawtypes")
    @Override
    public List getList(Object o) {
        return o instanceof NodeArray ? (List) o : null;
    }
}