// es.rickyepoderi.wbxml.document.WbXmlParser Maven / Gradle / Ivy
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Linking this library statically or dynamically with other modules
* is making a combined work based on this library. Thus, the terms and
* conditions of the GNU General Public License cover the whole
* combination.
*
* As a special exception, the copyright holders of this library give
* you permission to link this library with independent modules to
* produce an executable, regardless of the license terms of these
* independent modules, and to copy and distribute the resulting
* executable under terms of your choice, provided that you also meet,
* for each linked independent module, the terms and conditions of the
* license of that module. An independent module is a module which
* is not derived from or based on this library. If you modify this
* library, you may extend this exception to your version of the
* library, but you are not obligated to do so. If you do not wish
* to do so, delete this exception statement from your version.
*
* Project: github.com/rickyepoderi/wbxml-stream
*
*/
package es.rickyepoderi.wbxml.document;
import es.rickyepoderi.wbxml.definition.IanaCharset;
import es.rickyepoderi.wbxml.definition.WbXmlAttributeDef;
import es.rickyepoderi.wbxml.definition.WbXmlAttributeValueDef;
import es.rickyepoderi.wbxml.definition.WbXmlDefinition;
import es.rickyepoderi.wbxml.definition.WbXmlExtensionDef;
import es.rickyepoderi.wbxml.definition.WbXmlInitialization;
import es.rickyepoderi.wbxml.definition.WbXmlTagDef;
import es.rickyepoderi.wbxml.definition.WbXmlToken;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
/**
*
* The parser is the class that is used to parse a WBXML document into a
* WbXmlDocument object in this package. This class reads the document and
* starts creating the different elements of this package in order to end up
* with a complete document in Java/memory representation. All the different
* parseXXX methods are called recursively to build the complete Java
* object.
*
* The parser follows the WBXML format specification explained in the
* open alliance document.
* This class has different methods to parse all the objects in this
* package, although it is intended that only the main method
* parse() is called.
*
* All the parsing methods suppose that the stream is just at the start
* of the element to parse. That means that the currentByte is the first byte
* of the element. In order to do that sometimes a byte should be read
* backwards.
*
* @author ricky
*/
public class WbXmlParser {
/**
 * Stream the WBXML document is read from. It is assigned once, in the
 * constructor.
 */
private final InputStream is;

/**
 * Document being built by the parser. It stays {@code null} until
 * {@link #parse(WbXmlDefinition)} creates it.
 */
private WbXmlDocument doc = null;

/**
 * Current code page for attributes. See chapter 5.8.1. Parser State Machine
 * of the specification.
 */
private byte pageAttrState = 0x00;

/**
 * Current code page for tags. See the same chapter 5.8.1. Parser State
 * Machine.
 */
private byte pageTagState = 0x00;

/**
 * Last byte read from the stream, or {@code null} when nothing has been
 * read yet, the end of the stream was reached, or the byte has just been
 * pushed back with {@link #readBackwards()}.
 */
private Byte currentByte = null;

/**
 * One-byte push-back buffer. When it is not {@code null} the next read
 * returns this byte instead of touching the stream, which is how the
 * parser steps back to the first byte of an element.
 */
private Byte nextByte = null;

/**
 * Creates a parser over the given stream. Only the input stream is needed
 * because everything else is obtained from the document itself; both code
 * pages start at 0 and no byte is buffered.
 *
 * @param is The input stream to read the WBXML document from
 */
public WbXmlParser(InputStream is) {
    this.is = is;
}
/**
 * Advances the parser by one byte. If a byte was pushed back with
 * {@link #readBackwards()} it is handed out again and the stream is not
 * touched; otherwise one byte is taken from the underlying stream.
 *
 * @return true if a byte is now available as the current byte, false if
 *         the end of the stream was reached (the current byte is then null)
 * @throws IOException Some error reading from the stream
 */
public boolean read() throws IOException {
    if (nextByte == null) {
        // nothing buffered => really read from the stream (-1 means EOF)
        final int value = this.is.read();
        currentByte = (value < 0) ? null : Byte.valueOf((byte) value);
    } else {
        // hand out the pushed-back byte and empty the buffer
        currentByte = nextByte;
        nextByte = null;
    }
    return currentByte != null;
}
/**
 * Pushes the current byte back so that the following {@link #read()}
 * returns it again. This is how the parser positions itself at the first
 * byte of another element or attribute. The current byte becomes null, so
 * only one step back is possible between two reads.
 *
 * @throws IOException If there is no current byte to push back (for
 *         instance because readBackwards has been called twice in a row)
 */
public void readBackwards() throws IOException {
    if (currentByte == null) {
        throw new IOException("Only one readBackwards is permitted!");
    }
    nextByte = currentByte;
    currentByte = null;
}
/**
 * Fills a complete byte array from the stream. This is used when strings
 * or other fixed-size arrays are present (strtbl, OPAQUE,...), because
 * WBXML always announces the length of this kind of data. A pushed-back
 * byte, if any, becomes the first byte of the array. On success the
 * current byte is set to the last byte of the array. An empty array is
 * returned untouched and nothing is read.
 *
 * The underlying stream is allowed to deliver fewer bytes than requested
 * in one call, so the method keeps reading until the array is full and
 * only reports an error when the stream really ends.
 *
 * @param b The buffer to fill
 * @return The same buffer that is passed as argument
 * @throws IOException Some error reading from the stream or if the
 *         end of the stream is reached before filling the whole array
 */
public byte[] read(byte[] b) throws IOException {
    if (b.length == 0) {
        return b;
    }
    int filled = 0;
    if (nextByte != null) {
        // a byte was pushed back => it is the first byte of the array
        b[0] = nextByte;
        nextByte = null;
        filled = 1;
    }
    while (filled < b.length) {
        // a short read is not an error, only a negative count means EOF
        final int count = this.is.read(b, filled, b.length - filled);
        if (count < 0) {
            throw new IOException(String.format("End of file reading a byte[] of size %d", b.length));
        }
        filled += count;
    }
    currentByte = b[b.length - 1];
    return b;
}
/**
 * Reads a multi-byte unsigned integer (mb_u_int32). The encoding is
 * explained in chapter 5.1. Multi-byte Integers of the specification.
 *
 * A multi-byte integer consists of a series of octets, where the most
 * significant bit is the continuation flag and the remaining seven bits
 * are a scalar value. The continuation flag is one (1) in every octet
 * except the last one. The seven-bit groups are transmitted most
 * significant group first. For example, the integer value 0xA0 is encoded
 * with the two-byte sequence 0x81 0x20 and the value 0x60 with the
 * one-byte sequence 0x60.
 *
 * At most five octets are accepted; a continuation flag on the fifth octet
 * is rejected before any further byte is consumed.
 *
 * @return The read integer in long format
 * @throws IOException Some error reading the integer, the stream ends in
 *         the middle of it, or it is longer than five octets
 */
public long readUnsignedInteger() throws IOException {
    final int maxOctets = 5;
    long res = 0;
    int octets = 0;
    boolean more;
    do {
        if (octets == maxOctets) {
            throw new IOException("An unsigned integer should not be longer than 5 bytes!");
        }
        if (!read()) {
            // without this check a truncated stream would end in a NullPointerException
            throw new IOException("Unexpected end of stream reading an unsigned integer");
        }
        octets++;
        final int octet = currentByte & 0xFF;
        res = (res << 7) | (octet & 0x7F);
        more = (octet & 0x80) != 0;
    } while (more);
    return res;
}
/**
 * Reads an inline string (the termstr that follows a STR_I or EXT_I
 * token). It is defined as follows:
 *
 * <pre>
 * inline = STR_I termstr
 * termstr = charset-dependent string with termination
 * </pre>
 *
 * The string format is specified in chapter 5.8.4.1. Strings of the
 * specification. This method reads bytes one by one until a 0x00 byte is
 * found and decodes the bytes before it with the charset of the document.
 * The terminator is consumed and is the current byte on return.
 *
 * @return The string inlined
 * @throws IOException Some error reading the stream or the stream ends
 *         before the 0x00 terminator
 */
public String readInlineString() throws IOException {
    // closing a ByteArrayOutputStream has no effect, so no try/finally is needed
    final ByteArrayOutputStream bos = new ByteArrayOutputStream();
    while (true) {
        if (!read()) {
            // without this check a truncated stream would end in a NullPointerException
            throw new IOException("Unexpected end of stream reading an inline string");
        }
        if (currentByte == 0x0) {
            break;
        }
        bos.write(currentByte);
    }
    return new String(bos.toByteArray(), doc.getCharset().getCharset());
}
/**
 * Consumes an optional switch page for the ATTRIBUTE code page.
 *
 * The WBXML format specification defines two independent states for a
 * parser/encoder machine, one code page for tags and another one for
 * attributes (chapter 5.8.1. Parser State Machine). When a token of a
 * different page is needed a switch page token is written first.
 *
 * This method is called wherever a switch page may appear. If the current
 * byte is the switch page token, the next byte is stored as the new
 * attribute code page and the byte after it becomes the current byte (it
 * is null if the stream ends there). Otherwise nothing is read.
 *
 * @throws IOException Some error reading the stream, there is no current
 *         byte, or the stream ends right after the switch page token
 */
public void readSwitchPageAttribute() throws IOException {
    if (currentByte == null) {
        // comparing a null current byte would raise a NullPointerException
        throw new IOException("Unexpected end of stream where an attribute token was expected");
    }
    if (WbXmlLiterals.SWTICH_PAGE == currentByte) {
        if (!read()) {
            throw new IOException("Unexpected end of stream reading the attribute code page");
        }
        this.pageAttrState = currentByte;
        read();
    }
}
/**
 * Consumes an optional switch page for the TAG code page.
 *
 * The WBXML format specification defines two independent states for a
 * parser/encoder machine, one code page for tags and another one for
 * attributes (chapter 5.8.1. Parser State Machine). When a token of a
 * different page is needed a switch page token is written first.
 *
 * This method is called wherever a switch page may appear. If the current
 * byte is the switch page token, the next byte is stored as the new tag
 * code page and the byte after it becomes the current byte (it is null if
 * the stream ends there). Otherwise nothing is read.
 *
 * @throws IOException Some error reading the stream, there is no current
 *         byte, or the stream ends right after the switch page token
 */
public void readSwitchPageTag() throws IOException {
    if (currentByte == null) {
        // comparing a null current byte would raise a NullPointerException
        throw new IOException("Unexpected end of stream where a tag token was expected");
    }
    if (WbXmlLiterals.SWTICH_PAGE == currentByte) {
        if (!read()) {
            throw new IOException("Unexpected end of stream reading the tag code page");
        }
        this.pageTagState = currentByte;
        read();
    }
}
/**
 * Method that parses the version of a WBXML document. The version
 * is defined in the specification:
 *
 * <pre>
 * version = u_int8 // WBXML version number
 * </pre>
 *
 * Chapter 5.4. Version Number: the version byte contains the major
 * version minus one in the upper four bits and the minor version in the
 * lower four bits. For example, the version number 1.3 is encoded as 0x03
 * and the version number 2.7 as 0x17.
 *
 * @return The enumeration version that corresponds to the byte read
 * @throws IOException Some error reading the version, the stream is empty
 *         or the version is unknown
 */
public WbXmlVersion parseVersion() throws IOException {
    if (!read()) {
        throw new IOException("Unexpected end of stream reading the WBXML version");
    }
    // the byte is signed in Java: mask it so the shift does not drag the
    // sign bit into the major nibble for values >= 0x80
    final int raw = currentByte & 0xFF;
    final byte major = (byte) ((raw >> 4) + 1);
    final byte minor = (byte) (raw & 0x0F);
    final WbXmlVersion v = WbXmlVersion.locateVersion(major, minor);
    if (v == null) {
        throw new IOException(String.format("Invalid version (%d,%d)", major, minor));
    }
    return v;
}
/**
 * Method that parses the WBXML public ID of a document. The public id
 * is defined in the specification as follows:
 *
 * <pre>
 * publicid = mb_u_int32 | ( zero index )
 * zero = u_int8 // with a 0x0 value
 * index = mb_u_int32 // integer index into string table.
 * </pre>
 *
 * Chapter 5.5. Document Public Identifier. Three cases are handled:
 * the string-table marker (an index follows and is returned; the string
 * table has not been read yet so no definition can be assigned), the
 * "unknown" identifier (-1 is returned), and any other numeric identifier
 * (the matching definition is looked up and set in the document).
 *
 * @return The publicId, the index in the table in case of string
 *         representation or -1 if unknown
 * @throws IOException Error reading the stream or unknown language definition
 */
public long parsePublicId() throws IOException {
    final long first = readUnsignedInteger();
    if (first == WbXmlDefinition.PUBLIC_ID_STR_T) {
        // formal public id stored in the strtbl => hand back its index
        return readUnsignedInteger();
    }
    if (first == WbXmlDefinition.PUBLIC_ID_UNKNOWN) {
        return -1;
    }
    // a numeric public id: it must map to a known definition
    doc.setDefinition(WbXmlInitialization.getDefinitionByPublicId(first));
    if (doc.getDefinition() == null) {
        throw new IOException(String.format("Unknown definition public id (%d)", first));
    }
    return first;
}
/**
 * Method that parses the charset of the WBXML document. In the
 * specification the charset is defined as follows:
 *
 * <pre>
 * charset = mb_u_int32
 * </pre>
 *
 * Chapter 5.6. Charset of the specification explains that the value is
 * the MIB numeric identifier of the IANA charset. A MIB of 0 is accepted
 * even when it maps to the unknown charset; any other MIB that maps to the
 * unknown charset is an error. The charset is set in the document.
 *
 * @return The IANA charset that corresponds to the MIB found
 * @throws IOException Some error reading the stream or unknown IANA charset
 */
public IanaCharset parseCharset() throws IOException {
    final long mib = readUnsignedInteger();
    final IanaCharset charset = IanaCharset.getIanaCharset(mib);
    final boolean unsupported = (mib != 0) && charset.equals(IanaCharset.UNKNOWN);
    if (unsupported) {
        throw new IOException(String.format("Unknown character encoding '%d'", mib));
    }
    doc.setCharset(charset);
    return charset;
}
/**
 * Method that parses the string table of the WBXML document. The string
 * table is defined as follows:
 *
 * <pre>
 * strtbl = length *byte
 * </pre>
 *
 * Chapter 5.7. String Table: the table is its own length followed by a
 * byte array with all the strings, each one a charset-dependent byte
 * sequence terminated by 0x00. Later references use the starting index of
 * the string inside the table, so every string is registered under the
 * offset of its first byte. Bytes after the last 0x00 are not registered.
 * Besides being returned, the strtbl is set in the parsed document.
 *
 * @return The strtbl read
 * @throws IOException Some error in the stream, a table length that does
 *         not fit in a Java array, or the stream ends inside the table
 */
public WbXmlStrtbl parseStrtbl() throws IOException {
    final WbXmlStrtbl strtbl = new WbXmlStrtbl();
    final long length = readUnsignedInteger();
    if (length > Integer.MAX_VALUE) {
        // the length comes from the document: an unchecked cast to int would
        // truncate it or produce a negative array size
        throw new IOException(String.format("String table too long (%d bytes)", length));
    }
    strtbl.setSize(length);
    final byte[] b = new byte[(int) length];
    read(b);
    int start = 0;
    for (int i = 0; i < b.length; i++) {
        if (b[i] == 0x0) {
            // [start, i) is one string; it is indexed by its starting offset
            final String s = new String(b, start, i - start, doc.getCharset().getCharset());
            strtbl.internalAddString(start, s);
            start = i + 1;
        }
    }
    doc.setStrtbl(strtbl);
    return strtbl;
}
/**
 * Method that parses a TAG opaque token. The WBXML specification lets
 * languages encode the content of a tag as an opaque byte array (datetime
 * formats, binary data,...), so it is language definition dependent. This
 * library lets a definition declare plugins to encode/parse opaque data.
 *
 * The opaque is defined in the WBXML specification as follows:
 *
 * <pre>
 * opaque = OPAQUE length *byte
 * </pre>
 *
 * The plugin for the tag is located first; then the OPAQUE token, the
 * length and the bytes are read and handed to the plugin.
 *
 * NOTE: Right now an exception is thrown if no plugin is found, libwbxml
 * just parses it as a string charset dependent!!!
 *
 * @param tagName The name of the tag to locate the plugin
 * @return The content after calling the associated plugin
 * @throws IOException Some error reading the stream (including a
 *         truncated stream or an opaque length that does not fit in a Java
 *         array) or locating/executing the plugin
 */
public WbXmlContent parseOpaqueTag(String tagName) throws IOException {
    // first get the plugin for the tag
    final OpaqueContentPlugin plugin = doc.getDefinition().locateTagPlugin(tagName);
    if (plugin == null) {
        throw new IOException(String.format("No plugin defined for tag (%s)", tagName));
    }
    // read the OPAQUE token (the short-circuit avoids unboxing a null byte on EOF)
    if (!read() || WbXmlLiterals.OPAQUE != currentByte) {
        throw new IOException("Opaque must start with OPAQUE tag!");
    }
    // the length comes from the document => validate before casting to int
    final long length = readUnsignedInteger();
    if (length > Integer.MAX_VALUE) {
        throw new IOException(String.format("Opaque data too long (%d bytes)", length));
    }
    final byte[] b = new byte[(int) length];
    read(b);
    // parse the opaque data using the plugin
    return plugin.parse(this, b);
}
/**
 * Method that parses an attribute opaque token. The WBXML specification
 * lets languages encode an attribute value as an opaque byte array
 * (datetime formats, binary data,...), so it is language definition
 * dependent. This library lets a definition declare plugins to
 * encode/parse opaque data.
 *
 * The opaque is defined in the WBXML specification as follows:
 *
 * <pre>
 * opaque = OPAQUE length *byte
 * </pre>
 *
 * The plugin for the attribute is located first; then the OPAQUE token,
 * the length and the bytes are read and handed to the plugin. For an
 * attribute the plugin result is a string.
 *
 * NOTE: Right now an exception is thrown if no plugin is found, libwbxml
 * just parses it as a string charset dependent!!!
 *
 * @param attrName The name of the attribute to locate the plugin
 * @return The String of the attr
 * @throws IOException Some error reading the stream (including a
 *         truncated stream or an opaque length that does not fit in a Java
 *         array) or locating/executing the plugin
 */
public String parseOpaqueAttr(String attrName) throws IOException {
    // first get the plugin for the attr
    final OpaqueAttributePlugin plugin = doc.getDefinition().locateAttrPlugin(attrName);
    if (plugin == null) {
        throw new IOException(String.format("No plugin defined for attr (%s)", attrName));
    }
    // read the OPAQUE token (the short-circuit avoids unboxing a null byte on EOF)
    if (!read() || WbXmlLiterals.OPAQUE != currentByte) {
        throw new IOException("Opaque must start with OPAQUE tag!");
    }
    // the length comes from the document => validate before casting to int
    final long length = readUnsignedInteger();
    if (length > Integer.MAX_VALUE) {
        throw new IOException(String.format("Opaque data too long (%d bytes)", length));
    }
    final byte[] b = new byte[(int) length];
    read(b);
    // parse the opaque data using the plugin
    return plugin.parse(this, b);
}
/**
 * Method that parses a numeric entity in the WBXML document. The
 * entity is defined in the specification as follows:
 *
 * <pre>
 * entity = ENTITY entcode
 * entcode = mb_u_int32 // UCS-4 character code
 * </pre>
 *
 * Chapter 5.8.4.3. Character Entity: the character entity token (ENTITY)
 * encodes a numeric character entity. It has the same semantics as an XML
 * numeric character entity (e.g., {@code &#32;}). The mb_u_int32 refers to
 * a character in the UCS-4 character encoding.
 *
 * @return The XML numeric character reference for the entity, in decimal
 *         (i.e. {@code &#160;})
 * @throws IOException Some error reading the stream, the stream is
 *         truncated, or the first byte is not the ENTITY token
 */
public String parseEntity() throws IOException {
    // read the ENTITY token (the short-circuit avoids unboxing a null byte on EOF)
    if (!read() || WbXmlLiterals.ENTITY != currentByte) {
        throw new IOException("Entity must start with the ENTITY tag!");
    }
    // read the character code and build "&#" + num + ";"; without the
    // "&#" prefix the result is not a numeric character reference
    final long entity = readUnsignedInteger();
    return new StringBuilder("&#").append(entity).append(";").toString();
}
/**
 * Method to parse the values of an attribute. In the specification the
 * attribute value is defined as follows:
 *
 * <pre>
 * attrValue = ([switchPage] ATTRVALUE) | string | extension | entity | opaque
 *
 * string = inline | tableref
 * inline = STR_I termstr
 * tableref = STR_T index
 * index = mb_u_int32 // index in the string table
 *
 * extension = [switchPage] (( EXT_I termstr ) | ( EXT_T index ) | EXT)
 *
 * entity = ENTITY entcode
 * entcode = mb_u_int32 // UCS-4 character code
 *
 * opaque = OPAQUE length *byte
 * </pre>
 *
 * The method keeps reading values one by one, distinguishing the type of
 * each (string, entity, opaque and so on). It stops at the first byte that
 * is not the start of a value (the start of the next attribute or END);
 * that byte has been consumed and is the current byte on return. The
 * single-byte EXT_0, EXT_1 and EXT_2 tokens are not supported.
 *
 * @param attrName The name of the attribute being parsed (used for opaques)
 * @return The list of values read from the stream, one per element found
 * @throws IOException Some error reading the values from the stream or
 *         the stream ends before the values are terminated
 */
public List<String> parseAttributeValues(String attrName) throws IOException {
    final List<String> values = new ArrayList<String>();
    read();
    while (true) {
        if (currentByte == null) {
            // values must be followed by another attribute or END
            throw new IOException("Unexpected end of stream reading attribute values");
        }
        // attr value can be switchPage => jump it if it is the case
        readSwitchPageAttribute();
        if (currentByte == null) {
            throw new IOException("Unexpected end of stream reading attribute values");
        }
        // ATTRVALUE, STR_I, STR_T, EXT_I*, EXT_T*, EXT, ENTITY or OPAQUE
        if (WbXmlLiterals.STR_I == currentByte) {
            values.add(readInlineString());
        } else if (WbXmlLiterals.STR_T == currentByte) {
            final long idx = readUnsignedInteger();
            values.add(doc.getStrtbl().getString(idx));
        } else if (WbXmlLiterals.EXT_I_0 == currentByte
                || WbXmlLiterals.EXT_I_1 == currentByte
                || WbXmlLiterals.EXT_I_2 == currentByte) {
            // read the string from termstr
            values.add(readInlineString());
        } else if (WbXmlLiterals.EXT_T_0 == currentByte
                || WbXmlLiterals.EXT_T_1 == currentByte
                || WbXmlLiterals.EXT_T_2 == currentByte) {
            // read the index and resolve it as an extension of the definition
            final long extToken = readUnsignedInteger();
            final WbXmlExtensionDef ext = doc.getDefinition().locateExtension(extToken);
            if (ext == null) {
                throw new IOException(String.format("Unknown extension (%d)", extToken));
            }
            values.add(ext.getValue());
        } else if (WbXmlLiterals.EXT_0 == currentByte
                || WbXmlLiterals.EXT_1 == currentByte
                || WbXmlLiterals.EXT_2 == currentByte) {
            throw new IOException("Implementation does not support EXT_0, EXT_1 or EXT_2 in attribute values");
        } else if (WbXmlLiterals.ENTITY == currentByte) {
            // read backwards to start with ENTITY tag
            readBackwards();
            values.add(parseEntity());
        } else if (WbXmlLiterals.OPAQUE == currentByte) {
            // read backwards to start at OPAQUE and parse it
            readBackwards();
            final String v = parseOpaqueAttr(attrName);
            if (v != null && !v.isEmpty()) {
                values.add(v);
            }
        } else if ((currentByte & 0x80) != 0) {
            // it is an ATTRVALUE (any attrvalue is >= 128)
            final WbXmlAttributeValueDef attrValDef =
                    doc.getDefinition().locateAttributeValue(pageAttrState, currentByte);
            if (attrValDef == null) {
                throw new IOException(String.format("Unknown ATTRVALUE in the definition (%s)",
                        new WbXmlToken(pageAttrState, currentByte)));
            }
            values.add(attrValDef.getValue());
        } else {
            // start of the next attribute or END: leave it as current byte
            return values;
        }
        // read next attribute value, next attribute start or END
        read();
    }
}
/**
 * Method that reads a complete attribute from the WBXML stream. An
 * attribute is defined in the specification as follows:
 *
 * <pre>
 * attribute = attrStart *attrValue
 *
 * attrStart = ([switchPage] ATTRSTART) | (LITERAL index )
 *
 * attrValue = ...
 * </pre>
 *
 * The method reads the attrStart (a LITERAL name taken from the string
 * table, or an ATTRSTART token looked up in the definition, which may
 * already carry the first part of the value) and then calls
 * parseAttributeValues to read the rest of the value. On return the
 * current byte is the byte that terminated the values (next attribute
 * start or END).
 *
 * @return The attribute read from the document
 * @throws IOException Some error reading the attribute from the stream,
 *         including a stream that ends where the attribute should start
 */
public WbXmlAttribute parseAttribute() throws IOException {
    final WbXmlAttribute attr = new WbXmlAttribute();
    // read the first byte: switchPage, ATTRSTART or LITERAL
    if (!read()) {
        throw new IOException("Unexpected end of stream reading an attribute");
    }
    // can be a switchPage => jump it
    readSwitchPageAttribute();
    if (currentByte == null) {
        throw new IOException("Unexpected end of stream reading an attribute start");
    }
    if (WbXmlLiterals.LITERAL == currentByte) {
        // the attribute name is a literal stored in the strtbl
        final long idx = readUnsignedInteger();
        attr.setName(doc.getStrtbl().getString(idx));
    } else {
        // ATTRSTART, read the attribute definition
        final WbXmlAttributeDef attrDef = doc.getDefinition().locateAttribute(pageAttrState, currentByte);
        if (attrDef == null) {
            throw new IOException(String.format("Unknown ATTRSTART in the definition (%s)",
                    new WbXmlToken(pageAttrState, currentByte)));
        }
        attr.setName(attrDef.getNameWithPrefix());
        if (attrDef.getValue() != null) {
            // the ATTRSTART token already carries the first part of the value
            attr.addValue(attrDef.getValue());
        }
    }
    // now the attribute values
    attr.addValues(parseAttributeValues(attr.getName()));
    // transform all the strings into one
    attr.normalize();
    return attr;
}
/**
 * Method that parses the different types of contents in a WBXML document.
 * The content is defined in the specification as follows:
 *
 * <pre>
 * content = element | string | extension | entity | pi | opaque
 *
 * string = inline | tableref
 * inline = STR_I termstr
 * tableref = STR_T index
 *
 * extension = [switchPage] (( EXT_I termstr ) | ( EXT_T index ) | EXT)
 *
 * entity = ENTITY entcode
 *
 * opaque = OPAQUE length *byte
 *
 * pi = PI attrStart *attrValue END
 * </pre>
 *
 * The method distinguishes the type (string, pi, opaque, element,...) and
 * creates the content object with the value. After a content has been
 * read, the following byte is read too, so on return the current byte is
 * END, the first byte of the next content, or null at the end of the
 * stream.
 *
 * If the very first byte is END there is no content at all: null is
 * returned and END is left as the current byte, so the caller sees the
 * terminator of its own element and not the token that follows it.
 *
 * @param tagName The tag of the element the content belongs to (used for opaques)
 * @return The content read from the stream, or null if it is empty
 * @throws IOException Some error reading the content from the stream,
 *         including a stream that ends where a content or END is expected
 */
public WbXmlContent parseContent(String tagName) throws IOException {
    WbXmlContent content = new WbXmlContent();
    if (!read()) {
        throw new IOException("Unexpected end of stream reading a content");
    }
    // can be a switchPage => jump it
    readSwitchPageTag();
    if (currentByte == null) {
        throw new IOException("Unexpected end of stream reading a content");
    }
    if (WbXmlLiterals.END == currentByte) {
        // empty content list: do NOT read further, the END belongs to the caller
        return null;
    }
    // the first byte can be:
    // -> string: STR_I, STR_T
    // -> extension: EXT_I*, EXT_T*, EXT*
    // -> entity: ENTITY
    // -> opaque: OPAQUE
    // -> pi: PI
    // -> ELEMENT: TAG, LITERAL_*
    if (WbXmlLiterals.STR_I == currentByte) {
        content.setString(readInlineString());
    } else if (WbXmlLiterals.STR_T == currentByte) {
        final long idx = readUnsignedInteger();
        content.setString(doc.getStrtbl().getString(idx));
    } else if (WbXmlLiterals.EXT_I_0 == currentByte
            || WbXmlLiterals.EXT_I_1 == currentByte
            || WbXmlLiterals.EXT_I_2 == currentByte) {
        // read the string from termstr
        content.setString(readInlineString());
    } else if (WbXmlLiterals.EXT_T_0 == currentByte
            || WbXmlLiterals.EXT_T_1 == currentByte
            || WbXmlLiterals.EXT_T_2 == currentByte) {
        // read the index and resolve it as an extension of the definition
        final long extToken = readUnsignedInteger();
        final WbXmlExtensionDef ext = doc.getDefinition().locateExtension(extToken);
        if (ext == null) {
            throw new IOException(String.format("Unknown extension (%x)", extToken & 0xFF));
        }
        content.setString(ext.getValue());
    } else if (WbXmlLiterals.EXT_0 == currentByte
            || WbXmlLiterals.EXT_1 == currentByte
            || WbXmlLiterals.EXT_2 == currentByte) {
        throw new IOException("Implementation does not support EXT_0, EXT_1 or EXT_2 in contents");
    } else if (WbXmlLiterals.ENTITY == currentByte) {
        // read backwards to start with ENTITY tag
        readBackwards();
        content.setString(parseEntity());
    } else if (WbXmlLiterals.OPAQUE == currentByte) {
        // read backwards to start at OPAQUE and parse it
        readBackwards();
        content = parseOpaqueTag(tagName);
    } else if (WbXmlLiterals.PI == currentByte) {
        // read backwards to start with PI and parse
        readBackwards();
        content.setPi(parsePi());
    } else {
        // element => read backwards and call recursive
        readBackwards();
        content.setElement(parseElement());
    }
    // read the next byte => END or the start of another content
    read();
    if (content.isEmpty()) {
        // nothing useful was read => report no content
        content = null;
    }
    return content;
}
/**
 * Method that reads a complete element from the WBXML stream. An
 * element is defined in the specification as follows:
 *
 * <pre>
 * element = ([switchPage] stag) [ 1*attribute END ] [ *content END ]
 * stag = TAG | (literalTag index)
 * literalTag = LITERAL | LITERAL_A | LITERAL_C | LITERAL_AC
 * </pre>
 *
 * The method reads the stag and then calls the attribute and content
 * methods as many times as needed. For a literal tag the name comes from
 * the string table and the token itself says whether attributes and
 * content follow; for a defined tag the lower six bits are the tag code,
 * bit 0x80 announces attributes and bit 0x40 announces content.
 *
 * The content loop also stops if the stream ends, so a missing final END
 * is tolerated there.
 *
 * @return The element read from the stream
 * @throws IOException Some error reading the element from the stream,
 *         including a stream that ends where the tag is expected
 */
public WbXmlElement parseElement() throws IOException {
    final WbXmlElement element = new WbXmlElement();
    final boolean hasAttributes;
    final boolean hasContent;
    if (!read()) {
        throw new IOException("Unexpected end of stream reading an element");
    }
    // read possible switch page
    readSwitchPageTag();
    if (currentByte == null) {
        throw new IOException("Unexpected end of stream reading an element tag");
    }
    // the tag can be a LITERAL or a tag definition
    if (WbXmlLiterals.LITERAL == currentByte
            || WbXmlLiterals.LITERAL_A == currentByte
            || WbXmlLiterals.LITERAL_C == currentByte
            || WbXmlLiterals.LITERAL_AC == currentByte) {
        // decide the flags before the index is read (reading replaces the current byte)
        hasAttributes = (WbXmlLiterals.LITERAL_A == currentByte)
                || (WbXmlLiterals.LITERAL_AC == currentByte);
        hasContent = (WbXmlLiterals.LITERAL_C == currentByte)
                || (WbXmlLiterals.LITERAL_AC == currentByte);
        // literal => the name is in the strtbl
        final long idx = readUnsignedInteger();
        element.setTag(doc.getStrtbl().getString(idx));
    } else {
        // look for the tag in the definition (without the two flag bits)
        final WbXmlTagDef tagDef = doc.getDefinition().locateTag(pageTagState, (byte) (currentByte & 0x3F));
        if (tagDef == null) {
            throw new IOException(String.format("Unknown TAG in the definition (%s)",
                    new WbXmlToken(pageTagState, currentByte)));
        }
        element.setTag(tagDef.getNameWithPrefix());
        // calculate attributes and contents
        hasAttributes = ((currentByte & 0x80) != 0);
        hasContent = ((currentByte & 0x40) != 0);
    }
    if (hasAttributes) {
        boolean cont = true;
        while (cont) {
            // read one or more attributes
            element.addAttribute(parseAttribute());
            if (currentByte == null) {
                throw new IOException("Unexpected end of stream reading the attributes of an element");
            }
            if (WbXmlLiterals.END == currentByte) {
                cont = false;
            } else {
                // another attribute starts here => step back so it is read again
                readBackwards();
            }
        }
    }
    if (hasContent) {
        boolean cont = true;
        while (cont) {
            // read one or more contents
            final WbXmlContent content = parseContent(element.getTag());
            if (content != null) {
                element.addContent(content);
            }
            if (currentByte == null || WbXmlLiterals.END == currentByte) {
                cont = false;
            } else {
                // another content starts here => step back so it is read again
                readBackwards();
            }
        }
    }
    // normalize strings
    element.normalize();
    return element;
}
/**
 * Method that parses a PI element from the WBXML stream. A PI element
 * is very similar to an attribute, it is defined as follows:
 *
 * <pre>
 * pi = PI attrStart *attrValue END
 * </pre>
 *
 * The method reads the PI token, calls parseAttribute and then checks the
 * terminator. parseAttribute stops on the first byte that is not an
 * attribute value and leaves it as the current byte, so the END token has
 * already been consumed when it returns: no further byte is read here.
 * On return the current byte is the END of the PI.
 *
 * @return The attribute of the PI
 * @throws IOException Some error reading the PI attribute from the
 *         stream, or the PI does not start with PI / end with END
 */
public WbXmlAttribute parsePi() throws IOException {
    // the short-circuit avoids unboxing a null byte on EOF
    if (!read() || WbXmlLiterals.PI != currentByte) {
        throw new IOException("PI must start with PI tag!");
    }
    final WbXmlAttribute attr = parseAttribute();
    // the current byte is the one that terminated the attribute values
    if (currentByte == null || WbXmlLiterals.END != currentByte) {
        throw new IOException("PI must end with END tag!");
    }
    return attr;
}
/**
 * Method that parses the body of a WBXML document. The body is defined as
 * follows in the specification:
 *
 * <pre>
 * body = *pi element *pi
 * </pre>
 *
 * The leading PIs, the root element and the trailing PIs are read in that
 * order. The stream may end right after the root element or after any
 * trailing PI.
 *
 * @return The body read from the stream
 * @throws IOException Some error reading the body from the stream, or the
 *         stream ends before the root element
 */
public WbXmlBody parseBody() throws IOException {
    final WbXmlBody body = new WbXmlBody();
    if (!read()) {
        throw new IOException("Unexpected end of stream: the WBXML body is empty");
    }
    // the byte can be a PI, switchPage or stag: the first one means
    // a pi element follows, the others an element
    while (currentByte != null && WbXmlLiterals.PI == currentByte) {
        readBackwards();
        body.addPrePi(parsePi());
        read();
    }
    if (currentByte == null) {
        throw new IOException("Unexpected end of stream: the WBXML body has no root element");
    }
    // we are at the start of an element => read backwards and parse
    readBackwards();
    body.setElement(parseElement());
    // we can be at the end of the file or before a PI
    read();
    while (currentByte != null && WbXmlLiterals.PI == currentByte) {
        readBackwards();
        body.addPostPi(parsePi());
        read();
    }
    return body;
}
/**
 * Main method of the class. It runs the complete parsing of the WBXML
 * document, calling the previous methods to build the whole Java
 * representation, and lets the caller force a definition.
 * The document is defined by the specification as follows:
 *
 * <pre>
 * start = version publicid charset strtbl body
 * </pre>
 *
 * The definition is resolved in this order: the forced one if given, the
 * one found from a numeric public id, or the one found from the formal
 * public id stored in the string table. Note that the public id is parsed
 * before the forced definition is applied, so an unregistered numeric
 * public id is still reported as an error by parsePublicId.
 *
 * @param def The definition to be used (forced), it can be null
 * @return The document (java representation) of the stream
 * @throws IOException Some error reading the document from the stream or
 *         no definition could be determined
 */
public WbXmlDocument parse(WbXmlDefinition def) throws IOException {
    doc = new WbXmlDocument();
    // header: version and public id
    doc.setVersion(parseVersion());
    final long publicId = parsePublicId();
    if (def != null) {
        // the caller's definition overrides whatever the public id selected
        doc.setDefinition(def);
    }
    // header: charset and string table
    parseCharset();
    parseStrtbl();
    if (doc.getDefinition() == null) {
        if (publicId == -1) {
            throw new IOException("Unknown definition and no one specified");
        }
        // the public id is an index in the strtbl => resolve the formal public id
        final String fpi = doc.getStrtbl().getString(publicId);
        doc.setDefinition(WbXmlInitialization.getDefinitionByFPI(fpi));
        if (doc.getDefinition() == null) {
            throw new IOException(String.format("Unknown definition formal public id (%s)", fpi));
        }
    }
    // finally the body
    doc.setBody(parseBody());
    return doc;
}
/**
 * Main method of the class. It runs the complete parsing of the WBXML
 * document without forcing any definition, so the definition is taken
 * from the public id of the document. The document is defined by the
 * specification as follows:
 *
 * <pre>
 * start = version publicid charset strtbl body
 * </pre>
 *
 * @return The document (java representation) of the stream
 * @throws IOException Some error reading the document from the stream
 */
public WbXmlDocument parse() throws IOException {
    final WbXmlDefinition noForcedDefinition = null;
    return this.parse(noForcedDefinition);
}
/**
 * Getter for the document built by the parser.
 * @return The document, or null if no parsing has been started yet
 */
public WbXmlDocument getDocument() {
    final WbXmlDocument parsed = this.doc;
    return parsed;
}
/**
 * Getter for the Java charset of the document being parsed. It must only
 * be called once a parsing has been started, because it reads the charset
 * from the document.
 * @return The charset of the document
 */
public Charset getCharset() {
    final IanaCharset iana = this.doc.getCharset();
    return iana.getCharset();
}
/**
 * Getter for the language definition of the document being parsed. It
 * must only be called once a parsing has been started, because it reads
 * the definition from the document.
 * @return The language definition associated with the document
 */
public WbXmlDefinition getDefinition() {
    final WbXmlDocument current = this.doc;
    return current.getDefinition();
}
}