org.jivesoftware.openfire.net.MXParser Maven / Gradle / Ivy
The newest version!
/*
* Copyright (C) 2005-2008 Jive Software. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.jivesoftware.openfire.net;
import org.xmlpull.v1.XmlPullParserException;
import org.xmlpull.v1.XmlPullParser;
import java.io.IOException;
import java.io.Reader;
/**
* MXParser that returns an IGNORABLE_WHITESPACE event when a whitespace character or a
* line feed is received. This parser is useful when not validating documents.
*
* @author Gaston Dombiak
*/
public class MXParser extends org.xmlpull.mxp1.MXParser {
/**
* Last time a heartbeat was received. Hearbeats are represented as whitespaces
* or \n characters received when an XmlPullParser.END_TAG was parsed. Note that we
* can falsely detect heartbeats when parsing XHTML content but that is fine.
*/
private long lastHeartbeat = 0;
@Override
protected int nextImpl()
throws XmlPullParserException, IOException
{
text = null;
pcEnd = pcStart = 0;
usePC = false;
bufStart = posEnd;
if(pastEndTag) {
pastEndTag = false;
--depth;
namespaceEnd = elNamespaceCount[ depth ]; // less namespaces available
}
if(emptyElementTag) {
emptyElementTag = false;
pastEndTag = true;
return eventType = END_TAG;
}
// [1] document ::= prolog element Misc*
if(depth > 0) {
if(seenStartTag) {
seenStartTag = false;
return eventType = parseStartTag();
}
if(seenEndTag) {
seenEndTag = false;
return eventType = parseEndTag();
}
// ASSUMPTION: we are _on_ first character of content or markup!!!!
// [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
char ch;
if(seenMarkup) { // we have read ahead ...
seenMarkup = false;
ch = '<';
} else if(seenAmpersand) {
seenAmpersand = false;
ch = '&';
} else {
ch = more();
}
posStart = pos - 1; // VERY IMPORTANT: this is correct start of event!!!
// when true there is some potential event TEXT to return - keep gathering
boolean hadCharData = false;
// when true TEXT data is not continous (like ) and requires PC merging
boolean needsMerging = false;
MAIN_LOOP:
while(true) {
// work on MARKUP
if(ch == '<') {
if(hadCharData) {
//posEnd = pos - 1;
if(tokenize) {
seenMarkup = true;
return eventType = TEXT;
}
}
ch = more();
if(ch == '/') {
if(!tokenize && hadCharData) {
seenEndTag = true;
//posEnd = pos - 2;
return eventType = TEXT;
}
return eventType = parseEndTag();
} else if(ch == '!') {
ch = more();
if(ch == '-') {
// note: if(tokenize == false) posStart/End is NOT changed!!!!
parseComment();
if(tokenize) return eventType = COMMENT;
if( !usePC && hadCharData ) {
needsMerging = true;
} else {
posStart = pos; //completely ignore comment
}
} else if(ch == '[') {
//posEnd = pos - 3;
// must remeber previous posStart/End as it merges with content of CDATA
//int oldStart = posStart + bufAbsoluteStart;
//int oldEnd = posEnd + bufAbsoluteStart;
parseCDSect(hadCharData);
if(tokenize) return eventType = CDSECT;
final int cdStart = posStart;
final int cdEnd = posEnd;
final int cdLen = cdEnd - cdStart;
if(cdLen > 0) { // was there anything inside CDATA section?
hadCharData = true;
if(!usePC) {
needsMerging = true;
}
}
// posStart = oldStart;
// posEnd = oldEnd;
// if(cdLen > 0) { // was there anything inside CDATA section?
// if(hadCharData) {
// // do merging if there was anything in CDSect!!!!
// // if(!usePC) {
// // // posEnd is correct already!!!
// // if(posEnd > posStart) {
// // joinPC();
// // } else {
// // usePC = true;
// // pcStart = pcEnd = 0;
// // }
// // }
// // if(pcEnd + cdLen >= pc.length) ensurePC(pcEnd + cdLen);
// // // copy [cdStart..cdEnd) into PC
// // System.arraycopy(buf, cdStart, pc, pcEnd, cdLen);
// // pcEnd += cdLen;
// if(!usePC) {
// needsMerging = true;
// posStart = cdStart;
// posEnd = cdEnd;
// }
// } else {
// if(!usePC) {
// needsMerging = true;
// posStart = cdStart;
// posEnd = cdEnd;
// hadCharData = true;
// }
// }
// //hadCharData = true;
// } else {
// if( !usePC && hadCharData ) {
// needsMerging = true;
// }
// }
} else {
throw new XmlPullParserException(
"unexpected character in markup "+printable(ch), this, null);
}
} else if(ch == '?') {
parsePI();
if(tokenize) return eventType = PROCESSING_INSTRUCTION;
if( !usePC && hadCharData ) {
needsMerging = true;
} else {
posStart = pos; //completely ignore PI
}
} else if( isNameStartChar(ch) ) {
if(!tokenize && hadCharData) {
seenStartTag = true;
//posEnd = pos - 2;
return eventType = TEXT;
}
return eventType = parseStartTag();
} else {
throw new XmlPullParserException(
"unexpected character in markup "+printable(ch), this, null);
}
// do content comapctation if it makes sense!!!!
} else if(ch == '&') {
// work on ENTITTY
//posEnd = pos - 1;
if(tokenize && hadCharData) {
seenAmpersand = true;
return eventType = TEXT;
}
final int oldStart = posStart + bufAbsoluteStart;
final int oldEnd = posEnd + bufAbsoluteStart;
final char[] resolvedEntity = parseEntityRef();
if(tokenize) return eventType = ENTITY_REF;
// check if replacement text can be resolved !!!
if(resolvedEntity == null) {
if(entityRefName == null) {
entityRefName = newString(buf, posStart, posEnd - posStart);
}
throw new XmlPullParserException(
"could not resolve entity named '"+printable(entityRefName)+"'",
this, null);
}
//int entStart = posStart;
//int entEnd = posEnd;
posStart = oldStart - bufAbsoluteStart;
posEnd = oldEnd - bufAbsoluteStart;
if(!usePC) {
if(hadCharData) {
joinPC(); // posEnd is already set correctly!!!
needsMerging = false;
} else {
usePC = true;
pcStart = pcEnd = 0;
}
}
//assert usePC == true;
// write into PC replacement text - do merge for replacement text!!!!
for (int i = 0; i < resolvedEntity.length; i++)
{
if(pcEnd >= pc.length) ensurePC(pcEnd);
pc[pcEnd++] = resolvedEntity[ i ];
}
hadCharData = true;
//assert needsMerging == false;
} else {
if(needsMerging) {
//assert usePC == false;
joinPC(); // posEnd is already set correctly!!!
//posStart = pos - 1;
needsMerging = false;
}
//no MARKUP not ENTITIES so work on character data ...
// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
hadCharData = true;
boolean normalizedCR = false;
final boolean normalizeInput = tokenize == false || roundtripSupported == false;
// use loop locality here!!!!
boolean seenBracket = false;
boolean seenBracketBracket = false;
do {
// check that ]]> does not show in
if (eventType == XmlPullParser.END_TAG &&
(ch == ' ' || ch == '\n' || ch == '\t')) {
// ** ADDED CODE (INCLUDING IF STATEMENT)
lastHeartbeat = System.currentTimeMillis();
}
if(ch == ']') {
if(seenBracket) {
seenBracketBracket = true;
} else {
seenBracket = true;
}
} else if(seenBracketBracket && ch == '>') {
throw new XmlPullParserException(
"characters ]]> are not allowed in content", this, null);
} else {
if(seenBracket) {
seenBracketBracket = seenBracket = false;
}
// assert seenTwoBrackets == seenBracket == false;
}
if(normalizeInput) {
// deal with normalization issues ...
if(ch == '\r') {
normalizedCR = true;
posEnd = pos -1;
// posEnd is alreadys set
if(!usePC) {
if(posEnd > posStart) {
joinPC();
} else {
usePC = true;
pcStart = pcEnd = 0;
}
}
//assert usePC == true;
if(pcEnd >= pc.length) ensurePC(pcEnd);
pc[pcEnd++] = '\n';
} else if(ch == '\n') {
// if(!usePC) { joinPC(); } else { if(pcEnd >= pc.length) ensurePC(); }
if(!normalizedCR && usePC) {
if(pcEnd >= pc.length) ensurePC(pcEnd);
pc[pcEnd++] = '\n';
}
normalizedCR = false;
} else {
if(usePC) {
if(pcEnd >= pc.length) ensurePC(pcEnd);
pc[pcEnd++] = ch;
}
normalizedCR = false;
}
}
ch = more();
} while(ch != '<' && ch != '&');
posEnd = pos - 1;
continue MAIN_LOOP; // skip ch = more() from below - we are alreayd ahead ...
}
ch = more();
} // endless while(true)
} else {
if(seenRoot) {
return parseEpilog();
} else {
return parseProlog();
}
}
}
/**
* Returns the last time a heartbeat was received. Hearbeats are represented as whitespaces
* or \n characters received when an XmlPullParser.END_TAG was parsed. Note that we
* can falsely detect heartbeats when parsing XHTML content but that is fine.
*
* @return the time in milliseconds when a heartbeat was received.
*/
public long getLastHeartbeat() {
return lastHeartbeat;
}
public void resetInput() {
Reader oldReader = reader;
String oldEncoding = inputEncoding;
reset();
reader = oldReader;
inputEncoding = oldEncoding;
}
private boolean highSurrogateSeen = false;
/**
* Makes sure that each individual character is a valid XML character.
*
* Note that when MXParser is being modified to handle multibyte chars correctly, this method needs to change (as
* then, there are more codepoints to check).
*
*/
@Override
protected char more() throws IOException, XmlPullParserException {
final char codePoint = super.more(); // note - this does NOT return a codepoint now, but simply a (double byte) character!
boolean validCodepoint = false;
boolean isLowSurrogate = Character.isLowSurrogate(codePoint);
if ((codePoint == 0x0) || // 0x0 is not allowed, but flash clients insist on sending this as the very first character of a stream. We should stop allowing this codepoint after the first byte has been parsed.
(codePoint == 0x9) ||
(codePoint == 0xA) ||
(codePoint == 0xD) ||
((codePoint >= 0x20) && (codePoint <= 0xD7FF)) ||
((codePoint >= 0xE000) && (codePoint <= 0xFFFD))) {
validCodepoint = true;
}
else if (highSurrogateSeen) {
if (isLowSurrogate) {
validCodepoint = true;
} else {
throw new XmlPullParserException("High surrogate followed by non low surrogate '0x" + String.format("%x", (int) codePoint) + "'");
}
}
else if (isLowSurrogate) {
throw new XmlPullParserException("Low surrogate '0x " + String.format("%x", (int) codePoint) + " without preceeding high surrogate");
}
else if (Character.isHighSurrogate(codePoint)) {
highSurrogateSeen = true;
// Return here so that highSurrogateSeen is not reset
return codePoint;
}
// Always reset high surrogate seen
highSurrogateSeen = false;
if (validCodepoint)
return codePoint;
throw new XmlPullParserException("Illegal XML character '0x" + String.format("%x", (int) codePoint) + "'");
}
}