org.beanio.stream.xml.XmlReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of beanio Show documentation
A Java un/marshalling library for CSV, XML, delimited and fixed length stream formats.
There is a newer version: 2.1.0
/*
 * Copyright 2011 Kevin Seim
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.beanio.stream.xml;

import static javax.xml.stream.XMLStreamConstants.CHARACTERS;
import static javax.xml.stream.XMLStreamConstants.END_DOCUMENT;
import static javax.xml.stream.XMLStreamConstants.END_ELEMENT;
import static javax.xml.stream.XMLStreamConstants.START_ELEMENT;

import java.io.*;

import javax.xml.stream.*;

import org.beanio.internal.util.DomUtil;
import org.beanio.stream.*;
import org.w3c.dom.*;

/**
 * An XmlReader is used to read records from an XML input stream.  Each XML
 * record read from the input stream is parsed into a Document Object Model (DOM).  
 * An XmlReader is configured using a base DOM object to define the group
 * structure of the XML.  When an XML element is read from the input stream that
 * is not found in the base document, the element and its children are appended
 * to the base document to form the record.  The base document object model
 * will be modified as the input stream is read and should therefore not be
 * shared across multiple streams.
 * 
 * <p>An XmlReader makes use of the DOM user data feature to pass additional
 * information to and from the parser.  The {@code GROUP_COUNT} is an Integer 
 * value added to elements in the base document to indicate the number of times an
 * element was read from the input stream.  The {@code IS_NAMESPACE_IGNORED} is a
 * Boolean value set on elements in the base document where the XML namespace
 * should not be used to match nodes read from the input stream.
 * 
 * <p>The method {@code getRecordText()} is not currently supported.
 * 
 * @author Kevin Seim
 * @since 1.1
 */
public class XmlReader implements RecordReader {

    /**
     * The DOM user data key to obtain the number of times a group element was
     * read in the base document as a java.lang.Integer.
     */
    public static final String GROUP_COUNT = "count";
    
    /**
     * The DOM user data key to indicate whether the namespace of an element in 
     * the base document is ignored when matching nodes read from an input stream.
     * The value must be a java.lang.Boolean. 
     */
    public static final String IS_NAMESPACE_IGNORED = "namespaceIgnored";
    
    /* 
     * The StAX input factory used to create the stream reader for every XmlReader
     * instance.  DTD support is switched off once, when the class is initialized.
     */
    private static final XMLInputFactory xmlInputFactory;
    static {
        xmlInputFactory = XMLInputFactory.newInstance();
        xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
    }
    
    /* the StAX stream reader wrapping the input stream to read from */
    private XMLStreamReader in;
    /* the base document used to define the group structure of the XML read from the input stream;
     * records are appended to (and later removed from) this document */
    private Document document;
    /* the parent node is the record node's parent in the base document; it is null until the
     * first record is found, unless the document is being read fully */
    private Node parentNode;
    /* the "root" element of the last record read, removed from the parent node before the 
     * next record is read */
    private Node recordNode;
    /* set to true if the base document was null (or had no document element) during construction 
     * and the XML input stream will be fully read */
    private boolean readFully = false;
    
    /* the line number where the last record started, or -1 if no record start was recorded */
    private transient int recordLineNumber = -1;
    /* set to true once the end of the XML input stream has been reached */
    private transient boolean eof = false;

    /**
     * Constructs a new XmlReader without a base document.  This delegates
     * to the two argument constructor with a null base document, in which case
     * the XML input stream is read fully as a single record.
     * @param reader the input stream to read from
     * @throws IllegalArgumentException if the reader is null or a XML stream
     *   reader cannot be created for it
     */
    public XmlReader(Reader reader) {
        this(reader, (Document) null);
    }
    
    /**
     * Constructs a new XmlReader.
     * @param reader the input stream to read from
     * @param base the base document object model (DOM) that defines the
     *   group structure of the XML.  May be null if fully reading 
     *   the XML document.
     */
    public XmlReader(Reader reader, Document base) {
        if (reader == null) {
            throw new IllegalArgumentException("reader is null");
        }
        
        try {
            this.in = xmlInputFactory.createXMLStreamReader(reader);
        }
        catch (XMLStreamException ex) {
            throw new IllegalArgumentException("Failed to create XMLStreamReader: " + ex.getMessage(), ex);
        }
        
        if (base == null) {
            base = DomUtil.newDocument();
        }
        this.document = base;
        
        if (base.getDocumentElement() == null) {
            this.readFully = true;
            this.parentNode = base;
        }
        else {
            this.readFully = false;
            this.parentNode = null;
        }
    }
    
    /**
     * Reads the next record from the XML input stream.  The record read by the
     * previous call, if any, is first removed from the base document, and the 
     * next record is then appended in its place.
     * @return the base document holding the record read, or null if
     *   the end of the stream was reached
     * @throws IOException declared by the reader contract
     * @throws RecordIOException if the XML stream reader fails while reading
     *   the record
     * @see org.beanio.stream.RecordReader#read()
     */
    public Document read() throws IOException, RecordIOException {
        if (eof) {
            return null;
        }
        
        try {
            // detach the previous record so the base document only ever holds one record
            if (parentNode != null && recordNode != null) {
                parentNode.removeChild(recordNode);
                recordNode = null;
            }

            if (readRecord()) {
                return document;
            }
            return null;
        }
        catch (XMLStreamException ex) {
            throw new RecordIOException(ex.getMessage(), ex);
        }
    }
    
    /**
     * Appends the next record read from the XML stream reader to the base document object model.
     * @return true if a record was found, or false if the end of the
     *   stream was reached
     * @throws XMLStreamException
     */
    private boolean readRecord() throws XMLStreamException {
        
        // the record position stores the number of elements deep in the record, or -1 if a
        // record has not been found yet
        int recordPosition = readFully ? 0 : -1;
        
        // the parent element to the node we are reading
        Node node = parentNode;
        
        while (in.hasNext()) {
            int event = in.next();
            
            switch (event) {
            case START_ELEMENT:
                if (recordPosition < 0) {
                    // handle the root element of the document
                    if (node == null) {
                        node = document.getDocumentElement();
                        if (isNode(node, in.getNamespaceURI(), in.getLocalName())) {
                            node.setUserData(GROUP_COUNT, 1, null);
                            continue;
                        }
                    }
                    else {
                        // try to find a child in the base document that matches the element we just read
                        Element baseElement = findChild((Element)node, in.getNamespaceURI(), in.getLocalName());
                        if (baseElement != null) {
                            // if found, increment its counter and continue
                            Integer count = (Integer) baseElement.getUserData(GROUP_COUNT);
                            baseElement.setUserData(GROUP_COUNT, count == null ? 1 : 1 + count, null);
                            node = baseElement;
                            continue;
                        }
                    }
                
                    // if we find an element not included in the base document, this is the beginning of our record
                    recordLineNumber = in.getLocation().getLineNumber();
                    parentNode = node;
                }
                
                // create and append the new element to our Document
                Element e = document.createElementNS(in.getNamespaceURI(), in.getLocalName());
                for (int i=0,j=in.getAttributeCount(); i= 0) {
                    node.appendChild(document.createTextNode(in.getText()));
                }
                break;
                
            case END_ELEMENT:
                Node parent = node.getParentNode();
                if (parent.getNodeType() == Node.ELEMENT_NODE) {
                    node = (Element) parent;
                }
                else {
                    node = null;
                }
                
                if (recordPosition < 0) {
                    continue;
                }
                
                // if the record position reaches 0, the record is complete
                if (recordPosition-- == 0) {
                    return true;
                }
                break;
                
            case END_DOCUMENT:
                break;
            }
        }
        
        eof = true;
        return readFully;
    }
    
    /**
     * Looks through the direct children of a DOM element for the first child 
     * element that matches the given XML namespace and local name.  Children 
     * that are not elements are skipped.
     * @param parent the parent DOM element whose children are searched
     * @param namespace the XML namespace to match
     * @param name the XML local name to match
     * @return the first matching child element, or null if none matched
     */
    private Element findChild(Element parent, String namespace, String name) {
        for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            if (isNode(child, namespace, name)) {
                return (Element) child;
            }
        }
        return null;
    }
    
    /**
     * Returns whether a XML node matches a given namespace and local name.
     * 
     * <p>The local name must always match.  The namespace is then ignored if the 
     * node carries the IS_NAMESPACE_IGNORED user data set to Boolean.TRUE; otherwise
     * the node's namespace URI must equal the given namespace, where a node without 
     * a namespace only matches a null namespace.
     * 
     * @param node the Node to test
     * @param namespace the namespace to match, may be null for no namespace
     * @param name the local name to match
     * @return true if the Node matches the given XML namespace and
     *   local name
     */
    private boolean isNode(Node node, String namespace, String name) {
        // nodes created with a DOM Level 1 method (such as Document.createElement) have
        // a null local name, so fall back to the node name instead of failing
        String localName = node.getLocalName();
        if (localName == null) {
            localName = node.getNodeName();
        }
        if (!localName.equals(name)) {
            return false;
        }
        
        if (Boolean.TRUE.equals(node.getUserData(IS_NAMESPACE_IGNORED))) {
            return true;
        }
        
        String uri = node.getNamespaceURI();
        if (uri == null) {
            // a node without a namespace only matches when no namespace was read
            return namespace == null;
        }
        return uri.equals(namespace);
    }
    
    /**
     * Closes the XML stream reader used by this reader.  A failure reported by the
     * stream reader is rethrown as an IOException with the original exception as its cause.
     * 
     * <p>NOTE(review): the StAX documentation for XMLStreamReader.close() states that it 
     * does not close the underlying input source - confirm whether callers are expected 
     * to close the Reader passed to the constructor themselves.
     * 
     * @throws IOException if the XML stream reader could not be closed
     * @see org.beanio.stream.RecordReader#close()
     */
    public void close() throws IOException {
        try {
            in.close();
        }
        catch (XMLStreamException cause) {
            // initCause returns the exception it was called on, so the cast is safe
            throw (IOException) new IOException("XMLStreamException caught closing input stream").initCause(cause);
        }
    }

    /**
     * Returns the line number of the XML input stream where the last record began.  
     * The line number is stored when the first element not found in the base 
     * document is read.
     * @return the line number where the last record started, or -1 if no
     *   record start has been recorded
     * @see org.beanio.stream.RecordReader#getRecordLineNumber()
     */
    public int getRecordLineNumber() {
        return this.recordLineNumber;
    }

    /**
     * Returns the text of the last record read.  Record text is not supported
     * by this reader, so this implementation always returns null.
     * @return always null
     * @see org.beanio.stream.RecordReader#getRecordText()
     */
    public String getRecordText() {
        return null;
    }
}