es.rickyepoderi.wbxml.document.WbXmlParser Maven / Gradle / Ivy

Go to download
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *    
 * Linking this library statically or dynamically with other modules 
 * is making a combined work based on this library. Thus, the terms and
 * conditions of the GNU General Public License cover the whole
 * combination.
 *    
 * As a special exception, the copyright holders of this library give 
 * you permission to link this library with independent modules to 
 * produce an executable, regardless of the license terms of these 
 * independent modules, and to copy and distribute the resulting 
 * executable under terms of your choice, provided that you also meet, 
 * for each linked independent module, the terms and conditions of the 
 * license of that module.  An independent module is a module which 
 * is not derived from or based on this library.  If you modify this 
 * library, you may extend this exception to your version of the 
 * library, but you are not obligated to do so.  If you do not wish 
 * to do so, delete this exception statement from your version.
 *
 * Project: github.com/rickyepoderi/wbxml-stream
 * 
 */
package es.rickyepoderi.wbxml.document;

import es.rickyepoderi.wbxml.definition.IanaCharset;
import es.rickyepoderi.wbxml.definition.WbXmlAttributeDef;
import es.rickyepoderi.wbxml.definition.WbXmlAttributeValueDef;
import es.rickyepoderi.wbxml.definition.WbXmlDefinition;
import es.rickyepoderi.wbxml.definition.WbXmlExtensionDef;
import es.rickyepoderi.wbxml.definition.WbXmlInitialization;
import es.rickyepoderi.wbxml.definition.WbXmlTagDef;
import es.rickyepoderi.wbxml.definition.WbXmlToken;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

/**
 *
 * The parser is the class that is used to parse a WBXML document into a
 * WbXmlDocument object in this package. This class reads the document and
 * starts creating the different elements in this package in order to end up
 * with a complete document in Java/memory representation. All the different
 * parseXXX methods are called recursively to build the complete Java
 * object.
 * 
 * The parser follows the WBXML format specification explained in the 
 * open alliance document.
 * This class has different methods to parse all the objects in this
 * package, although it is intended that only the main method
 * parse() is called.
 * 
 * All the parsing methods suppose that the stream is just at the start 
 * of the element to parse. That means that the currentByte is the first byte
 * of the element. In order to do that sometimes a byte should be read 
 * backwards.
 * 
 * @author ricky
 */
public class WbXmlParser {
    
    /**
     * Input Stream where the WBXML document is going to be read.
     */
    private InputStream is;
    
    /**
     * The document the parser is creating during a parsing processing.
     */
    private WbXmlDocument doc;
    
    /**
     * the page code state for attributes. See chapter 5.8.1. Parser State Machine
     * of the specification.
     */
    private byte pageAttrState;
    
    /**
     * The page code for tags. See same chapter 5.8.1. Parser State Machine.
     */
    private byte pageTagState;
    
    /**
     * The current byte read from the stream.
     */
    private Byte currentByte;
    
    /**
     * Sometimes it is useful to read one byte backwards (to start when a
     * new element starts and so on). For that purpose a buffer of one byte is
     * maintained, which allows going back exactly one byte.
     */
    private Byte nextByte;
    
    /**
     * Constructor of the parser. Only the input stream is needed because the
     * rest is parsed from the document.
     * @param is The input stream to read the WBXML document from
     */
    public WbXmlParser(InputStream is) {
        // no byte has been read and nothing has been pushed back yet
        this.currentByte = null;
        this.nextByte = null;
        // the document is created later, during parsing
        this.doc = null;
        // both page states start at page 0x00
        this.pageTagState = 0x00;
        this.pageAttrState = 0x00;
        // stream the WBXML bytes are read from
        this.is = is;
    }
    
    /**
     * Method that reads the next byte in the stream. Cos the parser lets one
     * read back the nextByte property is checked (in that case no byte
     * is really read and the nextByte is the current byte).
     * @return true if there is a new byte read, false if the document is finished
     * @throws IOException Some error reading from the stream
     */
    public boolean read() throws IOException {
        if (nextByte == null) {
            // nothing was pushed back: take a fresh byte from the stream
            int value = this.is.read();
            // a negative value from the stream means end of input
            currentByte = (value >= 0) ? Byte.valueOf((byte) value) : null;
        } else {
            // consume the byte stored by readBackwards() instead of reading
            currentByte = nextByte;
            nextByte = null;
        }
        return currentByte != null;
    }
    
    /**
     * As explained in the class it is sometimes useful to read one byte 
     * backwards (usually to position the stream in the start of another 
     * element or attribute). This method does exactly that, the current byte
     * is set to null and the nextByte is filled with the current. The method 
     * is intended to immediately call the read() method.
     * @throws IOException A exception is thrown if the the current byte is null
     * (that means the readBackwards has been called twice).
     */
    public void readBackwards() throws IOException {
        // without a current byte there is nothing to push back
        if (currentByte == null) {
            throw new IOException("Only one readBackwards is permitted!");
        }
        // store the current byte so the next read() returns it again
        nextByte = currentByte;
        currentByte = null;
    }
    
    /**
     * Method that lets us read a complete byte array buffer. This is used when
     * strings or other fixed arrays are presented (STR_I, OPAQUE,...). This
     * method is intended to read known fixed-size arrays (WBXML presents
     * this kind of data always knowing the array length). The current byte is 
     * set to the last byte read.
     * @param b The buffer to read
     * @return The same buffer that is passed as argument
     * @throws IOException Some error reading from the stream or if the
     * end of the file is reached before filling all the array
     */
    public byte[] read(byte[] b) throws IOException {
        // an empty buffer needs no reading and leaves the current byte untouched
        if (b.length == 0) {
            return b;
        }
        int filled = 0;
        if (nextByte != null) {
            // a byte was pushed back by readBackwards(): it is the first one of the array
            b[0] = nextByte;
            nextByte = null;
            filled = 1;
        }
        // InputStream.read(byte[], int, int) is allowed to return fewer bytes
        // than requested without being at the end of the stream, so keep
        // reading until the array is complete or the stream really ends
        while (filled < b.length) {
            int count = this.is.read(b, filled, b.length - filled);
            if (count < 0) {
                // end of file => IOException
                throw new IOException(String.format("End of file reading a byte[] of size %d", b.length));
            }
            filled += count;
        }
        // the current byte is always the last byte of the array
        currentByte = b[b.length - 1];
        return b;
    }
    
    /**
     * The WBXML specification presents mb_u_int32 integers in a rather unusual
     * way in order to save space. The way they are encoded is explained in
     * the chapter 5.1. Multi-byte Integers of the specification.
     * 
     * A multi-byte integer consists of a series of octets, where the most 
     * significant bit is the continuation flag and the remaining seven bits 
     * are a scalar value. The continuation flag indicates that an octet is not 
     * the end of the multi-byte sequence. A single integer value is encoded 
     * into a sequence of N octets. The first N-1 octets have the continuation 
     * flag set to a value of one (1). The final octet in the series has a 
     * continuation flag value of zero (0). The remaining seven bits in each 
     * octet are encoded in a big-endian order, e.g., most significant bit 
     * first. The octets are arranged in a big-endian order, e.g., the most 
     * significant seven bits are transmitted first. In the situation where the 
     * initial octet has less than seven bits of value, all unused bits must be 
     * set to zero (0). For example, the integer value 0xA0 would be encoded 
     * with the two-byte sequence 0x81 0x20. The integer value 0x60 would be 
     * encoded with the one-byte sequence 0x60.
     * 
     * @return The read integer in long format
     * @throws IOException Some error reading the integer
     */
    public long readUnsignedInteger() throws IOException {
        long res = 0;
        // number of octets consumed so far for this integer
        int octets = 0;
        do {
            // refuse to consume a sixth octet, as the error message states
            if (octets == 5) {
                throw new IOException("An unsigned integer should not be longer than 5 bytes!");
            }
            // a truncated stream is reported as an IOException instead of
            // failing with a NullPointerException when unboxing currentByte
            if (!read()) {
                throw new IOException("Unexpected end of stream reading an unsigned integer!");
            }
            octets++;
            // the lower seven bits are the payload, most significant group first
            res = (res << 7) | (currentByte & 0x7F);
            // the highest bit is the continuation flag
        } while ((currentByte & 0x80) != 0);
        return res;
    }
    
    /**
     * A inline string (STR_I) is a string that is appended to the document
     * (as part of an attribute value or part of a content string). It is just 
     * defined as follows:
     * 
     * <pre>
     * inline = STR_I termstr
     * termstr = charset-dependent string with termination
     * </pre>
     * 
     * 
     * The inline string is just a STR_I token follow by the string
     * (charset dependent) terminated in 0x00 token. The strings format are 
     * specified in the chapter 5.8.4.1. Strings of the specification.
     * So this method read a byte until 0x00 token is found.
     * 
     * @return The string inlined
     * @throws IOException 
     */
    public String readInlineString() throws IOException {
        // in-memory buffer, it holds no resource that needs to be closed
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        // accumulate bytes until the 0x00 terminator (which is not stored)
        while (read() && currentByte != 0x0) {
            bos.write(currentByte);
        }
        // read() returned false: the stream ended before the terminator,
        // report it instead of failing with a NullPointerException
        if (currentByte == null) {
            throw new IOException("Unexpected end of stream reading an inline string!");
        }
        // decode the bytes using the charset of the document
        return new String(bos.toByteArray(), doc.getCharset().getCharset());
    }
    
    /**
     * The WBXML format specification defines two states for a parser/encoder
     * machine. The states deal with the page code: one state is used for tags
     * and the other for attributes. When a tag or attribute comes and it
     * is from the same page code as the state, no switch page is written.
     * Nevertheless, if the tag is from another page a switch page token should 
     * be written in order to change the state (there are two states and they
     * are independent, that is why the parser has two page codes, one for tags
     * and one for attributes). These states are explained in the chapter
     * 5.8.1. Parser State Machine of the specification.
     * 
     * This method is called when a new switch page can come (in any
     * tag, attribute or attribute value element), if it is found the 
     * switch page token the correspondent state is updated and the next
     * token is read in current byte.
     * 
     * This method reads possible switch page for the attribute state
     * parser.
     * 
     * @throws IOException Some error reading the stream
     */
    public void readSwitchPageAttribute() throws IOException {
        if (WbXmlLiterals.SWTICH_PAGE != currentByte) {
            // no switch page at this position: the attribute state is kept
            return;
        }
        // the byte that follows the switch page token is the new page code;
        // a stream that ends here is reported as an IOException instead of
        // failing with a NullPointerException in the assignment below
        if (!read()) {
            throw new IOException("Unexpected end of stream reading the attribute page code!");
        }
        this.pageAttrState = currentByte;
        // leave the token that follows the switch page in the current byte
        read();
    }
    
    /**
     * The WBXML format specification defines two states for a parser/encoder
     * machine. The states deal with the page code: one state is used for tags
     * and the other for attributes. When a tag or attribute comes and it
     * is from the same page code as the state, no switch page is written.
     * Nevertheless, if the tag is from another page a switch page token should 
     * be written in order to change the state (there are two states and they
     * are independent, that is why the parser has two page codes, one for tags
     * and one for attributes). These states are explained in the chapter
     * 5.8.1. Parser State Machine of the specification.
     * 
     * This method is called when a new switch page can come (in any
     * tag, attribute or attribute value element), if it is found the 
     * switch page token the correspondent state is updated and the next
     * token is read in current byte.
     * 
     * This method reads possible switch page for the TAG state
     * parser.
     * 
     * @throws IOException Some error reading the stream
     */
    public void readSwitchPageTag() throws IOException {
        if (WbXmlLiterals.SWTICH_PAGE != currentByte) {
            // no switch page at this position: the tag state is kept
            return;
        }
        // the byte that follows the switch page token is the new page code;
        // a stream that ends here is reported as an IOException instead of
        // failing with a NullPointerException in the assignment below
        if (!read()) {
            throw new IOException("Unexpected end of stream reading the tag page code!");
        }
        this.pageTagState = currentByte;
        // leave the token that follows the switch page in the current byte
        read();
    }
    
    /**
     * Method that parses the version of a WBXML document. The version
     * is defined in the specification:
     * 
     *      * version = u_int8 // WBXML version number
     * 
     * 
     * The version encoding/parsing chapter is the 5.4. Version Number:
     * All WBXML documents contain a version number in their initial byte. This 
     * version specifies the WBXML specification version. The version byte 
     * contains the major version minus one in the upper four bits and the minor
     * version in the lower four bits. For example, the version number 1.3 would
     * be encoded as 0x03, and version number 2.7 as 0x17.
     * 
     * @return The enumeration version that corresponds to the byte read
     * @throws IOException Some error reading the version or a unknown version
     */
    public WbXmlVersion parseVersion() throws IOException {
        // an empty stream is reported as an IOException instead of failing
        // with a NullPointerException when unboxing currentByte
        if (!read()) {
            throw new IOException("Unexpected end of stream reading the version!");
        }
        // work on the unsigned value of the octet: shifting the signed byte
        // would sign-extend and corrupt the major number for octets >= 0x80
        int octet = currentByte & 0xFF;
        // upper four bits: major version minus one
        byte major = (byte) ((octet >> 4) + 1);
        // lower four bits: minor version
        byte minor = (byte) (octet & 0x0F);
        WbXmlVersion v = WbXmlVersion.locateVersion(major, minor);
        if (v == null) {
            throw new IOException(String.format("Invalid version (%d,%d)", major, minor));
        }
        return v;
    }
    
    /**
     * Method that parses the WBXML public ID of a document. The public id
     * is defined in the specification as follows:
     * 
     *      * publicid = mb_u_int32 | ( zero index )
     * zero = u_int8        // with a 0x0 value
     * index = mb_u_int32   // integer index into string table.
     * 
     * 
     * The chapter 5.5. Document Public Identifier defines how the
     * public id is parser/encoded. The public id can be parsed using 
     * directly the standard ID of the language (mb_u_int32) or using
     * the XML formal public id. In the last case a 0x00 byte is used and
     * the the String Table is used to locate the string. The public ID is
     * used to locate the language definition of the parsed document and, 
     * in case normal mb_u_int32 id used, it is set in the document.
     * 
     * @return The publicId, the index in the table in case string 
     * representation or -1 if unknown. In case of string representation StrTbl 
     * has not been read yet, so no language definition is still associated 
     * to the parser.
     * @throws IOException Error reading the stream or unknown language definition
     */
    public long parsePublicId() throws IOException {
        // first value: the mb_u_int32 public id or the zero marker
        long id = readUnsignedInteger();
        if (id == WbXmlDefinition.PUBLIC_ID_STR_T) {
            // string representation: what follows is the index in the strtbl
            return readUnsignedInteger();
        }
        if (id == WbXmlDefinition.PUBLIC_ID_UNKNOWN) {
            // unknown public id, no definition can be associated
            return -1;
        }
        // numeric public id: locate the definition and set it in the document
        doc.setDefinition(WbXmlInitialization.getDefinitionByPublicId(id));
        if (doc.getDefinition() == null) {
            throw new IOException(String.format("Unknown definition public id (%d)", id));
        }
        return id;
    }
    
    /**
     * Method that parses the charset of the WBXML document. In the specification
     * the charset is defined as follows:
     * 
     *      * charset = mb_u_int32
     * 
     * 
     * The chapter 5.6. Charset of the specification explains how
     * the charset should be handled. it is just the MIB numeric identifier
     * of the IANA charset.The charset is set in the document.
     * 
     * @return The IANA charset that corresponds to the MIB found
     * @throws IOException Some error reading the stream or unknown IANA charset
     */
    public IanaCharset parseCharset() throws IOException {
        final long mibEnum = readUnsignedInteger();
        final IanaCharset charset = IanaCharset.getIanaCharset(mibEnum);
        // a MIB of 0 is accepted even when it maps to the UNKNOWN charset
        if (mibEnum != 0) {
            if (charset.equals(IanaCharset.UNKNOWN)) {
                throw new IOException(String.format("Unknown character encoding '%d'", mibEnum));
            }
        }
        // the charset found is stored in the document being parsed
        doc.setCharset(charset);
        return charset;
    }
    
    /**
     * Method that parses the string table of the WBXML document. The string
     * table is defined as follows:
     * 
     *      * strtbl = length *byte
     * 
     * 
     * And the chapter 5.7. String Table explains how the string
     * table is used and encoded. The table is just the length of itself and
     * a byte array with all the strings defined in the strtbl. The strings
     * are just charset dependent byte arrays 0x00 terminated. Later references
     * in the document to the strings in the table are done using the
     * relative starting index of the string in the table. Besides being 
     * returned the strtbl is set in the parsed document.
     * 
     * @return The strtbl read
     * @throws IOException Some error in the stream or reading the table
     */
    public WbXmlStrtbl parseStrtbl() throws IOException {
        WbXmlStrtbl strtbl = new WbXmlStrtbl();
        long length = readUnsignedInteger();
        // the table is loaded in a Java array, so a length that does not fit
        // in an int is rejected instead of being silently truncated (or made
        // negative) by the cast below
        if (length > Integer.MAX_VALUE) {
            throw new IOException(String.format("Invalid string table length (%d)", length));
        }
        strtbl.setSize(length);
        byte[] b = new byte[(int) length];
        read(b);
        // start offset of the string currently being scanned
        int idx = 0;
        for (int i = 0; i < b.length; i++) {
            if (b[i] == 0x0) {
                // terminator found: the string goes from idx to i (exclusive)
                // and is registered under its starting offset in the table
                String s = new String(b, idx, i - idx, doc.getCharset().getCharset());
                strtbl.internalAddString(idx, s);
                idx = i + 1;
            }
        }
        // bytes after the last 0x00 terminator (if any) are not added
        doc.setStrtbl(strtbl);
        return strtbl;
    }
    
    /**
     * Method that parses an TAG opaque token. The WBXML specification let
     * languages to encode any TAG using a opaque byte array. Languages use
     * this feature to encode/parse particular data (datetime formats, binary
     * data,...), so it is clearly language definition dependent. This 
     * library let define plugins to encode/parse an opaque data.
     * 
     *  The opaque is defined in the WBXML specification as follows:
     * 
     *      * opaque = OPAQUE length *byte
     * 
     * 
     * Just a OPAQUE token, length of the byte array and the bytes that
     * compound the opaque data. This method search if the definition defines
     * a plugin for this tag and calls it to retrieve the content 
     * associated to the element.
     * 
     * NOTE: Right now a exception is throws if no plugin is found, libwbxml
     * just parses it as a string charset dependent!!!
     * 
     * @param tagName The name of the tag to locate the plugin
     * @return The content after calling the associated plugin
     * @throws IOException Some error reading the stream or locating/executing the plugin
     */
    public WbXmlContent parseOpaqueTag(String tagName) throws IOException {
        // first get the plugin for the tag
        OpaqueContentPlugin plugin = doc.getDefinition().locateTagPlugin(tagName);
        if (plugin == null) {
            // without a plugin the opaque cannot be interpreted
            throw new IOException(String.format("No plugin defined for tag (%s)", tagName));
        }
        // read the OPAQUE token; a truncated stream is reported as an
        // IOException instead of failing with a NullPointerException
        if (!read()) {
            throw new IOException("Unexpected end of stream reading an opaque!");
        }
        if (WbXmlLiterals.OPAQUE != currentByte) {
            throw new IOException("Opaque must start with OPAQUE tag!");
        }
        // read the length and reject values that do not fit in a Java array
        // (the cast below would silently truncate them or make them negative)
        long length = readUnsignedInteger();
        if (length > Integer.MAX_VALUE) {
            throw new IOException(String.format("Invalid opaque length (%d)", length));
        }
        // read the opaque bytes
        byte[] b = new byte[(int) length];
        read(b);
        // parse the opaque data using the plugin
        return plugin.parse(this, b);
    }
    
    /**
     * Method that parses an attribute opaque token. The WBXML specification let
     * languages to encode any attribute value using a opaque byte array. Languages 
     * use this feature to encode/parse particular data (datetime formats, binary
     * data,...), so it is clearly language definition dependent. This 
     * library let define plugins to encode/parse an opaque data.
     * 
     *  The opaque is defined in the WBXML specification as follows:
     * 
     *      * opaque = OPAQUE length *byte
     * 
     * 
     * Just a OPAQUE token, length of the byte array and the bytes that
     * compound the opaque data. This method search if the definition defines
     * a plugin for this tag and calls it to retrieve the content 
     * associated to the element. In case of an attribute only string 
     * contents can be returned (as it is said in several points maybe two
     * interfaces would have been a better idea)
     * 
     * NOTE: Right now a exception is throws if no plugin is found, libwbxml
     * just parses it as a string charset dependent!!!
     * 
     * @param attrName The name of the attribute to locate the plugin
     * @return The String of the attr
     * @throws IOException Some error reading the stream or locating/executing the plugin
     */
    public String parseOpaqueAttr(String attrName) throws IOException {
        // first get the plugin for the attr
        OpaqueAttributePlugin plugin = doc.getDefinition().locateAttrPlugin(attrName);
        if (plugin == null) {
            // without a plugin the opaque cannot be interpreted
            throw new IOException(String.format("No plugin defined for attr (%s)", attrName));
        }
        // read the OPAQUE token; a truncated stream is reported as an
        // IOException instead of failing with a NullPointerException
        if (!read()) {
            throw new IOException("Unexpected end of stream reading an opaque!");
        }
        if (WbXmlLiterals.OPAQUE != currentByte) {
            throw new IOException("Opaque must start with OPAQUE tag!");
        }
        // read the length and reject values that do not fit in a Java array
        // (the cast below would silently truncate them or make them negative)
        long length = readUnsignedInteger();
        if (length > Integer.MAX_VALUE) {
            throw new IOException(String.format("Invalid opaque length (%d)", length));
        }
        // read the opaque bytes
        byte[] b = new byte[(int) length];
        read(b);
        // parse the opaque data using the plugin
        return plugin.parse(this, b);
    }
    
    /**
     * Method that parses an numeric entity in the WBXML document. The 
     * entity is defined in the specification as follows:
     * 
     *      * entity = ENTITY entcode
     * entcode = mb_u_int32 // UCS-4 character code
     * 
     * 
     * The chapter 5.8.4.3. Character Entity explains how the 
     * entity should be understood: The character entity token (ENTITY) encodes 
     * a numeric character entity. This has the same semantics as an XML
     * numeric character entity (e.g., &amp;#32;). The mb_u_int32 refers to a 
     * character in the UCS-4 character encoding.
     * 
     * @return The string that represents the entity (i.e. "&amp;#32;") 
     * @throws IOException Some error reading the stream
     */
    public String parseEntity() throws IOException {
        // read the ENTITY token; a truncated stream is reported as an
        // IOException instead of failing with a NullPointerException
        if (!read()) {
            throw new IOException("Unexpected end of stream reading an entity!");
        }
        if (WbXmlLiterals.ENTITY != currentByte) {
            throw new IOException("Entity must start with the ENTITY tag!");
        }
        // read the numeric entity and construct the "&#" + num + ";"
        long entity = readUnsignedInteger();
        return "&#" + entity + ";";
    }
    
    /**
     * Method to parse the attribute values of attribute. In the specification
     * the attribute value is defined as follows:
     * 
     *      * attrValue = ([switchPage] ATTRVALUE) | string | extension | entity | opaque
     *
     * string = inline | tableref 
     * inline = STR_I termstr 
     * tableref = STR_T index
     * index= mb_u_int32 // index in the attr table
     *
     * extension = [switchPage] (( EXT_I termstr ) | ( EXT_T index ) | EXT)
     *
     * entity = ENTITY entcode 
     * entcode = mb_u_int32 // UCS-4 character code
     *
     * opaque = OPAQUE length *byte
     * 
     * 
     * The method keeps reading attributes and completing the values one
     * by one. It tries to differentiate the type (string, entity, opaque and
     * so on) and reads each value accordingly. The values are returned as a
     * list of strings (one per different element in the WBXML document).
     * 
     * @param attrName The name of the attribute being parsed (used for opaques)
     * @return The list of values read from the stream
     * @throws IOException Some error reading the values from the stream
     */
    public List parseAttributeValues(String attrName) throws IOException {
        // every value added below is a string, so the local list is typed
        List<String> values = new ArrayList<String>();
        // a truncated stream is reported as an IOException instead of failing
        // with a NullPointerException when the current byte is compared
        if (!read()) {
            throw new IOException("Unexpected end of stream reading attribute values!");
        }
        while (true) {
            // attr value can be switchPage => jump it if it is the case
            readSwitchPageAttribute();
            if (currentByte == null) {
                // the stream finished just after the switch page
                throw new IOException("Unexpected end of stream reading attribute values!");
            }
            // ATTRVALUE, STR_I, STR_T, EXT_I*, EXT_T*, EXT, ENTITY or OPAQUE
            if (WbXmlLiterals.STR_I == currentByte) {
                values.add(readInlineString());
            } else if (WbXmlLiterals.STR_T == currentByte) {
                // reference to a string in the string table
                long idx = readUnsignedInteger();
                values.add(doc.getStrtbl().getString(idx));
            } else if (WbXmlLiterals.EXT_I_0 == currentByte
                    || WbXmlLiterals.EXT_I_1 == currentByte
                    || WbXmlLiterals.EXT_I_2 == currentByte) {
                // read the string from termstr
                values.add(readInlineString());
            } else if (WbXmlLiterals.EXT_T_0 == currentByte
                    || WbXmlLiterals.EXT_T_1 == currentByte
                    || WbXmlLiterals.EXT_T_2 == currentByte) {
                // read the index and look the extension up in the definition
                long extToken = readUnsignedInteger();
                WbXmlExtensionDef ext = doc.getDefinition().locateExtension(extToken);
                if (ext == null) {
                    throw new IOException(String.format("Unknown extension (%d)", extToken));
                }
                values.add(ext.getValue());
            } else if (WbXmlLiterals.EXT_0 == currentByte
                    || WbXmlLiterals.EXT_1 == currentByte
                    || WbXmlLiterals.EXT_2 == currentByte) {
                throw new IOException("Implementation does not support EXT_0, EXT_1 or EXT_2 in attribute values");
            } else if (WbXmlLiterals.ENTITY == currentByte) {
                // read backwards to start with ENTITY tag
                readBackwards();
                values.add(parseEntity());
            } else if (WbXmlLiterals.OPAQUE == currentByte) {
                // read backwards to start at OPAQUE and parse it
                readBackwards();
                String v = parseOpaqueAttr(attrName);
                // null or empty results of the plugin are not added
                if (v != null && !v.isEmpty()) {
                    values.add(v);
                }
            } else if ((currentByte & 0x80) != 0) {
                // it is an ATTRVALUE (any attrvalue is >= 128)
                WbXmlAttributeValueDef attrValDef = doc.getDefinition().locateAttributeValue(pageAttrState, currentByte);
                if (attrValDef == null) {
                    throw new IOException(String.format("Unknown ATTRVALUE in the definition (%s)",
                            new WbXmlToken(pageAttrState, currentByte)));
                }
                values.add(attrValDef.getValue());
            } else {
                // start of the next attribute or END: no more values, the
                // token is not consumed and stays in the current byte
                return values;
            }
            // read next attribute value or END
            if (!read()) {
                throw new IOException("Unexpected end of stream reading attribute values!");
            }
        }
    }
    
    /**
     * Method that read a complete attribute from the WBXML stream. An
     * attribute is defined in the specifications as follows:
     * 
     *      * attribute = attrStart *attrValue 
     * 
     * attrStart = ([switchPage] ATTRSTART) | (LITERAL index )
     * 
     * attrValue = ...
     * 
     * 
     * So this method reads the attrStart and then call the parseAttributeValues
     * method to read all the values for the attribute.
     * 
     * @return The attribute read from the document
     * @throws IOException Some error reading the attribute from the stream
     */
    public WbXmlAttribute parseAttribute() throws IOException {
        WbXmlAttribute attr = new WbXmlAttribute();
        // read the first byte switchpage or attrstart; a truncated stream is
        // reported as an IOException instead of failing with a
        // NullPointerException when the current byte is compared
        if (!read()) {
            throw new IOException("Unexpected end of stream reading an attribute!");
        }
        // can be a switchPage => jump it
        readSwitchPageAttribute();
        if (currentByte == null) {
            // the stream finished just after the switch page
            throw new IOException("Unexpected end of stream reading an attribute!");
        }
        // now the current byte is the ATTRSTART or LITERAL
        if (WbXmlLiterals.LITERAL == currentByte) {
            // the attribute is a literal, its name is in the string table
            long idx = readUnsignedInteger();
            attr.setName(doc.getStrtbl().getString(idx));
        } else {
            // ATTRSTART, read the attribute definition
            WbXmlAttributeDef attrDef = doc.getDefinition().locateAttribute(pageAttrState, currentByte);
            if (attrDef == null) {
                throw new IOException(String.format("Unknown ATTRSTART in the definition (%s)", 
                        new WbXmlToken(pageAttrState, currentByte)));
            }
            attr.setName(attrDef.getNameWithPrefix());
            if (attrDef.getValue() != null) {
                // add the first part of the string
                attr.addValue(attrDef.getValue());
            }
        }
        // now the attribute value
        attr.addValues(parseAttributeValues(attr.getName()));
        // transform all the strings into one
        attr.normalize();
        return attr;
    }
    
    /**
     * Method that parses the different types of contents in an WBXML document.
     * The content is defined in the specification as follows:
     * 
     *      * content = element | string | extension | entity | pi | opaque
     * 
     * string = inline | tableref
     * inline = STR_I termstr
     * tableref = STR_T index
     * 
     * extension = [switchPage] (( EXT_I termstr ) | ( EXT_T index ) | EXT)
     * 
     * entity = ENTITY entcode
     * 
     * opaque = OPAQUE length *byte
     * 
     * pi = PI attrStart *attrValue END
     * 
     * 
     * So the method tries to differentiate the type (string, pi, opaque,...)
     * and create the content object with the value.
     * 
     * @param  tagName The tag of the element the content belongs to (used for opaques)
     * @return The content read from the stream
     * @throws IOException Some error reading the content from the stream
     */
    public WbXmlContent parseContent(String tagName) throws IOException {
        //System.err.println("parseContent");
        WbXmlContent content = new WbXmlContent();
        read();
        //System.err.println(WbXmlLiterals.formatUInt8Char(currentByte));
        // can be a switchPage => jump it
        readSwitchPageTag();
        // the first byte can be:
        //  -> string: STR_I, STR_T
        //  -> extension: EXT_I*, EXT_T*, EXT*
        //  -> entity: ENTITY
        //  -> opaque: OPAQUE
        //  -> pi: PI
        //  -> ELEMENT: TAG, LITERAL_*
        // ATTRVALUE, STR_I, STR_T, EXT_I*, EXT_T*, EXT, ENTITY or OPAQUE
        if (WbXmlLiterals.STR_I == currentByte) {
            content.setString(readInlineString());
        } else if (WbXmlLiterals.STR_T == currentByte) {
            long idx = readUnsignedInteger();
            content.setString(doc.getStrtbl().getString(idx));
        } else if (WbXmlLiterals.EXT_I_0 == currentByte
                || WbXmlLiterals.EXT_I_1 == currentByte
                || WbXmlLiterals.EXT_I_2 == currentByte) {
            // read the string from termstr
            content.setString(readInlineString());
        } else if (WbXmlLiterals.EXT_T_0 == currentByte
                || WbXmlLiterals.EXT_T_1 == currentByte
                || WbXmlLiterals.EXT_T_2 == currentByte) {
            // read the index
            long extToken = readUnsignedInteger();
            WbXmlExtensionDef ext = doc.getDefinition().locateExtension(extToken);
            if (ext == null) {
                throw new IOException(String.format("Unknown extension (%x)", extToken & 0xFF));
            }
            content.setString(ext.getValue());
        } else if (WbXmlLiterals.EXT_0 == currentByte
                || WbXmlLiterals.EXT_1 == currentByte
                || WbXmlLiterals.EXT_2 == currentByte) {
            throw new IOException("Implementation does not support EXT_0, EXT_1 or EXT_2 in contents");
        } else if (WbXmlLiterals.ENTITY == currentByte) {
            // read backwards to start with ENTITY tag
            readBackwards();
            content.setString(parseEntity());
        } else if (WbXmlLiterals.OPAQUE == currentByte) {
            // read backwars to start at OPAQUE and parse it
            readBackwards();
            content = parseOpaqueTag(tagName);
        } else if (WbXmlLiterals.PI == currentByte) {
            // read backwards to start with PI and parse;
            readBackwards();
            content.setPi(parsePi());
        } else if (WbXmlLiterals.END != currentByte) {
            // element => read backwards and call recursive
            readBackwards();
            content.setElement(parseElement());
        }
        // read the next element => END or another content
        read();
        if (content.isEmpty()) {
            // assign empty string
            content = null;
        }
        //System.err.println("parseContent: " + content);
        return content;
    }
    
    /**
     * Method that reads a complete element from the WBXML stream. An
     * element is defined in the specification as follows:
     *
     * <pre>
     * element = ([switchPage] stag) [ 1*attribute END ] [ *content END ]
     * stag = TAG | (literalTag index)
     * literalTag = LITERAL | LITERAL_A | LITERAL_C | LITERAL_AC
     * </pre>
     *
     * So the method reads the stag and then calls the previous methods to
     * read all the attributes and contents of the element.
     *
     * @return The element read from the stream
     * @throws IOException Some error reading the element from the stream,
     *         including the stream ending where the tag or an attribute
     *         terminator is expected
     */
    public WbXmlElement parseElement() throws IOException {
        WbXmlElement element = new WbXmlElement();
        final boolean hasAttributes;
        final boolean hasContent;
        read();
        if (currentByte == null) {
            // a null current byte cannot be compared against the tokens below
            throw new IOException("Unexpected end of stream while reading an element");
        }
        // read possible switch page
        readSwitchPageTag();
        if (currentByte == null) {
            throw new IOException("Unexpected end of stream while reading an element");
        }
        // the tag can be a LITERAL or a tag definition
        if (WbXmlLiterals.LITERAL == currentByte
                || WbXmlLiterals.LITERAL_A == currentByte
                || WbXmlLiterals.LITERAL_C == currentByte
                || WbXmlLiterals.LITERAL_AC == currentByte) {
            // literal => the flags come from the literal token itself, they
            // are calculated before the index is read
            hasAttributes = (WbXmlLiterals.LITERAL_A == currentByte)
                    || (WbXmlLiterals.LITERAL_AC == currentByte);
            hasContent = (WbXmlLiterals.LITERAL_C == currentByte)
                    || (WbXmlLiterals.LITERAL_AC == currentByte);
            // the name is in the strtbl
            long idx = readUnsignedInteger();
            String name = doc.getStrtbl().getString(idx);
            element.setTag(name);
        } else {
            // look for the tag in the definition, the two upper bits are
            // the attribute and content flags and not part of the token
            WbXmlTagDef tagDef = doc.getDefinition().locateTag(pageTagState, (byte) (currentByte & 0x3F));
            if (tagDef == null) {
                throw new IOException(String.format("Unknown TAG in the definition (%s)", 
                        new WbXmlToken(pageTagState, currentByte)));
            }
            element.setTag(tagDef.getNameWithPrefix());
            // calculate attributes and contents
            hasAttributes = ((currentByte & 0x80) != 0);
            hasContent = ((currentByte & 0x40) != 0);
        }
        if (hasAttributes) {
            boolean moreAttributes;
            do {
                // read one or more attributes
                element.addAttribute(parseAttribute());
                if (currentByte == null) {
                    throw new IOException(String.format(
                            "Unexpected end of stream while reading the attributes of an element (%s)",
                            element.getTag()));
                }
                moreAttributes = (WbXmlLiterals.END != currentByte);
                if (moreAttributes) {
                    // keep going but set the current at start
                    readBackwards();
                }
            } while (moreAttributes);
        }
        if (hasContent) {
            boolean moreContents;
            do {
                // read one or more contents, empty ones are returned as null
                WbXmlContent content = parseContent(element.getTag());
                if (content != null) {
                    element.addContent(content);
                }
                // the end of the stream also finishes the list of contents
                moreContents = (currentByte != null && WbXmlLiterals.END != currentByte);
                if (moreContents) {
                    // keep going but set the current at start
                    readBackwards();
                }
            } while (moreContents);
        }
        // normalize strings
        element.normalize();
        return element;
    }
    
    /**
     * Method that parses a PI element from the WBXML stream. A PI element
     * is very similar to an attribute, it is defined as follows:
     *
     * <pre>
     * pi = PI attrStart *attrValue END
     * </pre>
     *
     * So it reads the start PI token, calls the parseAttribute method and
     * finally checks the END token.
     *
     * @return The attribute of the PI
     * @throws IOException Some error reading the PI attribute from the
     *         stream, a missing PI or END token (also when the stream ends
     *         where the token is expected)
     */
    public WbXmlAttribute parsePi() throws IOException {
        read();
        // the null check avoids unboxing a null current byte at end of stream
        if (currentByte == null || WbXmlLiterals.PI != currentByte) {
            throw new IOException("PI must start with PI tag!");
        }
        WbXmlAttribute attr = parseAttribute();
        // NOTE(review): parseElement checks the current byte left by
        // parseAttribute for END without reading again; confirm that the
        // read() below does not skip the END token of the PI.
        read();
        if (currentByte == null || WbXmlLiterals.END != currentByte) {
            throw new IOException("PI must end with END tag!");
        }
        return attr;
    }
    
    /**
     * Method that parses the body of a WBXML document. The body is defined as
     * follows in the specification:
     *
     * <pre>
     * body = *pi element *pi
     * </pre>
     *
     * So the three parts are read in order: the PIs before the element, the
     * element itself and the PIs after it.
     *
     * @return The body read from the stream
     * @throws IOException Some error reading the body from the stream,
     *         including the stream ending before the element
     */
    public WbXmlBody parseBody() throws IOException {
        WbXmlBody body = new WbXmlBody();
        read();
        // the byte can be a PI, switchPage or stag, the first indicates
        // a pi element follows, the others an element
        while (currentByte != null && WbXmlLiterals.PI == currentByte) {
            readBackwards();
            body.addPrePi(parsePi());
            read();
        }
        if (currentByte == null) {
            // the element is mandatory in the body
            throw new IOException("Unexpected end of stream, the body has no element");
        }
        // we are at the start of an element => read backwards and parse
        readBackwards();
        body.setElement(parseElement());
        // we can be at the end of the file or before a PI
        read();
        while (currentByte != null && WbXmlLiterals.PI == currentByte) {
            readBackwards();
            body.addPostPi(parsePi());
            read();
        }
        return body;
    }
    
    /**
     * Main method of the class. It parses the complete WBXML document and
     * builds its Java representation calling the previous methods. This
     * variant lets the caller force the definition to use.
     * The document is defined by the specification as follows:
     *
     * <pre>
     * start = version publicid charset strtbl body
     * </pre>
     *
     * When no definition is forced and none was assigned while reading the
     * header, a public id other than -1 is used as an index in the strtbl
     * to obtain the formal public id and look the definition up.
     *
     * @param def The definition to be used (forced), it can be null
     * @return The document (java representation) of the stream
     * @throws IOException Some error reading the document from the stream
     *         or no definition can be determined
     */
    public WbXmlDocument parse(WbXmlDefinition def) throws IOException {
        doc = new WbXmlDocument();
        // header: version and public id
        doc.setVersion(parseVersion());
        final long publicId = parsePublicId();
        if (def != null) {
            // the forced definition replaces whatever is set at this point
            doc.setDefinition(def);
        }
        // header: charset and strtbl
        parseCharset();
        parseStrtbl();
        // resolve the definition if there is still none
        if (doc.getDefinition() == null) {
            if (publicId == -1) {
                throw new IOException("Unknown definition and no one specified");
            }
            // the public id is an index in the strtbl => read the FPI
            String fpi = doc.getStrtbl().getString(publicId);
            doc.setDefinition(WbXmlInitialization.getDefinitionByFPI(fpi));
            if (doc.getDefinition() == null) {
                throw new IOException(String.format("Unknown definition formal public id (%s)", fpi));
            }
        }
        // finally the body
        doc.setBody(parseBody());
        return doc;
    }
    
    /**
     * Main method of the class. It parses the complete WBXML document
     * without forcing any definition, it just delegates to the variant
     * that receives the definition passing null. The document is defined
     * by the specification as follows:
     *
     * <pre>
     * start = version publicid charset strtbl body
     * </pre>
     *
     * @return The document (java representation) of the stream
     * @throws IOException Some error reading the document from the stream
     */
    public WbXmlDocument parse() throws IOException {
        // no forced definition
        return parse((WbXmlDefinition) null);
    }
    
    /**
     * Getter for the document held by the parser, the one created by the
     * parse method.
     * @return The document
     */
    public WbXmlDocument getDocument() {
        return doc;
    }
    
    /**
     * Getter for the charset of the document held by the parser.
     * @return The charset of the document
     */
    public Charset getCharset() {
        return doc.getCharset().getCharset();
    }
    
    /**
     * Getter for the language definition of the document. The WBXML
     * specification defines one language per document, but some of them
     * (SyncML for instance) let another language be encoded as opaque data.
     * For that reason the parser provides a getdefinitionsUsed() method
     * to obtain all the languages used in the parsing and not only the
     * one of the document.
     * @return The language definition of the document
     */
    public WbXmlDefinition getDefinition() {
        return doc.getDefinition();
    }
    
}