org.jivesoftware.openfire.nio.XMLLightweightParser Maven / Gradle / Ivy
The newest version!
/*
* Copyright (C) 2005-2008 Jive Software. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.jivesoftware.openfire.nio;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.mina.core.buffer.IoBuffer;
import org.apache.mina.filter.codec.ProtocolDecoderException;
import org.jivesoftware.util.JiveGlobals;
import org.jivesoftware.util.PropertyEventDispatcher;
import org.jivesoftware.util.PropertyEventListener;
/**
* This is a Light-Weight XML Parser.
* It read data from a channel and collect data until data are available in
* the channel.
* When a message is complete you can retrieve messages invoking the method
* getMsgs() and you can invoke the method areThereMsgs() to know if at least
* an message is presents.
*
* @author Daniele Piras
* @author Gaston Dombiak
*/
class XMLLightweightParser {
private static final Pattern XML_HAS_CHARREF = Pattern.compile("(0*([0-9]+)|[xX]0*([0-9a-fA-F]+));");
private static final String MAX_PROPERTY_NAME = "xmpp.parser.buffer.size";
private static int maxBufferSize;
// Chars that rappresent CDATA section start
protected static char[] CDATA_START = {'<', '!', '[', 'C', 'D', 'A', 'T', 'A', '['};
// Chars that rappresent CDATA section end
protected static char[] CDATA_END = {']', ']', '>'};
// Buffer with all data retrieved
protected StringBuilder buffer = new StringBuilder();
// ---- INTERNAL STATUS -------
// Initial status
protected static final int INIT = 0;
// Status used when the first tag name is retrieved
protected static final int HEAD = 2;
// Status used when robot is inside the xml and it looking for the tag conclusion
protected static final int INSIDE = 3;
// Status used when a '<' is found and try to find the conclusion tag.
protected static final int PRETAIL = 4;
// Status used when the ending tag is equal to the head tag
protected static final int TAIL = 5;
// Status used when robot is inside the main tag and found an '/' to check '/>'.
protected static final int VERIFY_CLOSE_TAG = 6;
// Status used when you are inside a parameter
protected static final int INSIDE_PARAM_VALUE = 7;
// Status used when you are inside a cdata section
protected static final int INSIDE_CDATA = 8;
// Status used when you are outside a tag/reading text
protected static final int OUTSIDE = 9;
final String[] sstatus = {"INIT", "", "HEAD", "INSIDE", "PRETAIL", "TAIL", "VERIFY", "INSIDE_PARAM", "INSIDE_CDATA", "OUTSIDE"};
// Current robot status
protected int status = XMLLightweightParser.INIT;
// Index to looking for a CDATA section start or end.
protected int cdataOffset = 0;
// Number of chars that machs with the head tag. If the tailCount is equal to
// the head length so a close tag is found.
protected int tailCount = 0;
// Indicate the starting point in the buffer for the next message.
protected int startLastMsg = 0;
// Flag used to discover tag in the form .
protected boolean insideRootTag = false;
// Object conteining the head tag
protected StringBuilder head = new StringBuilder(5);
// List with all finished messages found.
protected List msgs = new ArrayList<>();
private int depth = 0;
protected boolean insideChildrenTag = false;
CharsetDecoder encoder;
static {
// Set default max buffer size to 1MB. If limit is reached then close connection
maxBufferSize = JiveGlobals.getIntProperty(MAX_PROPERTY_NAME, 1048576);
// Listen for changes to this property
PropertyEventDispatcher.addListener(new PropertyListener());
}
public XMLLightweightParser(Charset charset) {
encoder = charset.newDecoder()
.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE);
}
/*
* true if the parser has found some complete xml message.
*/
public boolean areThereMsgs() {
return (msgs.size() > 0);
}
/*
* @return an array with all messages found
*/
public String[] getMsgs() {
String[] res = new String[msgs.size()];
for (int i = 0; i < res.length; i++) {
res[i] = msgs.get(i);
}
msgs.clear();
invalidateBuffer();
return res;
}
/*
* Method use to re-initialize the buffer
*/
protected void invalidateBuffer() {
if (buffer.length() > 0) {
String str = buffer.substring(startLastMsg);
buffer.delete(0, buffer.length());
buffer.append(str);
buffer.trimToSize();
}
startLastMsg = 0;
}
/*
* Method that add a message to the list and reinit parser.
*/
protected void foundMsg(String msg) throws XMLNotWellFormedException {
// Add message to the complete message list
if (msg != null) {
if (hasIllegalCharacterReferences(msg)) {
buffer = null;
throw new XMLNotWellFormedException("Illegal character reference found in: " + msg);
}
msgs.add(msg);
}
// Move the position into the buffer
status = XMLLightweightParser.INIT;
tailCount = 0;
cdataOffset = 0;
head.setLength(0);
insideRootTag = false;
insideChildrenTag = false;
depth = 0;
}
/*
* Main reading method
*/
public void read(IoBuffer byteBuffer) throws Exception {
if (buffer == null) {
// exception was thrown before, avoid duplicate exception(s)
// "read" and discard remaining data
byteBuffer.position(byteBuffer.limit());
return;
}
invalidateBuffer();
// Check that the buffer is not bigger than 1 Megabyte. For security reasons
// we will abort parsing when 1 Mega of queued chars was found.
if (buffer.length() > maxBufferSize) {
// purge the local buffer / free memory
buffer = null;
// processing the exception takes quite long
final ProtocolDecoderException ex = new ProtocolDecoderException("Stopped parsing never ending stanza");
ex.setHexdump("(redacted hex dump of never ending stanza)");
throw ex;
}
CharBuffer charBuffer = CharBuffer.allocate(byteBuffer.capacity());
encoder.reset();
encoder.decode(byteBuffer.buf(), charBuffer, false);
char[] buf = new char[charBuffer.position()];
charBuffer.flip();
charBuffer.get(buf);
int readChar = buf.length;
// Just return if nothing was read
if (readChar == 0) {
return;
}
buffer.append(buf);
// Robot.
char ch;
boolean isHighSurrogate = false;
for (int i = 0; i < readChar; i++) {
ch = buf[i];
if (ch < 0x20 && ch != 0x9 && ch != 0xA && ch != 0xD && ch != 0x0) {
//Unicode characters in the range 0x0000-0x001F other than 9, A, and D are not allowed in XML
//We need to allow the NULL character, however, for Flash XMLSocket clients to work.
buffer = null;
throw new XMLNotWellFormedException("Character is invalid in: " + ch);
}
if (isHighSurrogate) {
if (Character.isLowSurrogate(ch)) {
// Everything is fine. Clean up traces for surrogates
isHighSurrogate = false;
}
else {
// Trigger error. Found high surrogate not followed by low surrogate
buffer = null;
throw new Exception("Found high surrogate not followed by low surrogate");
}
}
else if (Character.isHighSurrogate(ch)) {
isHighSurrogate = true;
}
else if (Character.isLowSurrogate(ch)) {
// Trigger error. Found low surrogate char without a preceding high surrogate
buffer = null;
throw new Exception("Found low surrogate char without a preceding high surrogate");
}
if (status == XMLLightweightParser.TAIL) {
// Looking for the close tag
if (depth < 1 && ch == head.charAt(tailCount)) {
tailCount++;
if (tailCount == head.length()) {
// Close stanza found!
// Calculate the correct start,end position of the message into the buffer
int end = buffer.length() - readChar + (i + 1);
String msg = buffer.substring(startLastMsg, end);
// Add message to the list
foundMsg(msg);
startLastMsg = end;
}
} else {
tailCount = 0;
status = XMLLightweightParser.INSIDE;
}
} else if (status == XMLLightweightParser.PRETAIL) {
if (ch == XMLLightweightParser.CDATA_START[cdataOffset]) {
cdataOffset++;
if (cdataOffset == XMLLightweightParser.CDATA_START.length) {
status = XMLLightweightParser.INSIDE_CDATA;
cdataOffset = 0;
continue;
}
} else {
cdataOffset = 0;
status = XMLLightweightParser.INSIDE;
}
if (ch == '/') {
status = XMLLightweightParser.TAIL;
depth--;
}
else if (ch == '!') {
// This is a ') {
depth--;
status = XMLLightweightParser.OUTSIDE;
if (depth < 1) {
// Found a tag in the form
int end = buffer.length() - readChar + (i + 1);
String msg = buffer.substring(startLastMsg, end);
// Add message to the list
foundMsg(msg);
startLastMsg = end;
}
} else if (ch == '<') {
status = XMLLightweightParser.PRETAIL;
insideChildrenTag = true;
} else {
status = XMLLightweightParser.INSIDE;
}
} else if (status == XMLLightweightParser.INSIDE_PARAM_VALUE) {
if (ch == '"') {
status = XMLLightweightParser.INSIDE;
}
} else if (status == XMLLightweightParser.INSIDE_CDATA) {
if (ch == XMLLightweightParser.CDATA_END[cdataOffset]) {
cdataOffset++;
if (cdataOffset == XMLLightweightParser.CDATA_END.length) {
status = XMLLightweightParser.OUTSIDE;
cdataOffset = 0;
}
} else if (cdataOffset == XMLLightweightParser.CDATA_END.length-1 && ch == XMLLightweightParser.CDATA_END[cdataOffset - 1]) {
// if we are looking for the last CDATA_END char, and we instead found an extra ']'
// char, leave cdataOffset as is and proceed to the next char. This could be a case
// where the XML character data ends with multiple square braces. For Example ]]]>
} else {
cdataOffset = 0;
}
} else if (status == XMLLightweightParser.INSIDE) {
if (ch == XMLLightweightParser.CDATA_START[cdataOffset]) {
cdataOffset++;
if (cdataOffset == XMLLightweightParser.CDATA_START.length) {
status = XMLLightweightParser.INSIDE_CDATA;
cdataOffset = 0;
continue;
}
} else {
cdataOffset = 0;
status = XMLLightweightParser.INSIDE;
}
if (ch == '"') {
status = XMLLightweightParser.INSIDE_PARAM_VALUE;
} else if (ch == '>') {
status = XMLLightweightParser.OUTSIDE;
if (insideRootTag && ("stream:stream>".equals(head.toString()) ||
("?xml>".equals(head.toString())) || ("flash:stream>".equals(head.toString())))) {
// Found closing stream:stream
int end = buffer.length() - readChar + (i + 1);
// Skip LF, CR and other "weird" characters that could appear
while (startLastMsg < end && '<' != buffer.charAt(startLastMsg)) {
startLastMsg++;
}
String msg = buffer.substring(startLastMsg, end);
foundMsg(msg);
startLastMsg = end;
}
insideRootTag = false;
} else if (ch == '/') {
status = XMLLightweightParser.VERIFY_CLOSE_TAG;
}
} else if (status == XMLLightweightParser.HEAD) {
if (ch == ' ' || ch == '>') {
// Append > to head to allow searching
head.append('>');
if(ch == '>')
status = XMLLightweightParser.OUTSIDE;
else
status = XMLLightweightParser.INSIDE;
insideRootTag = true;
insideChildrenTag = false;
continue;
}
else if (ch == '/' && head.length() > 0) {
status = XMLLightweightParser.VERIFY_CLOSE_TAG;
depth--;
}
head.append(ch);
} else if (status == XMLLightweightParser.INIT) {
if (ch == '<') {
status = XMLLightweightParser.HEAD;
depth = 1;
}
else {
startLastMsg++;
}
} else if (status == XMLLightweightParser.OUTSIDE) {
if (ch == '<') {
status = XMLLightweightParser.PRETAIL;
cdataOffset = 1;
insideChildrenTag = true;
}
}
}
if (head.length() > 0 &&
("/stream:stream>".equals(head.toString()) || ("/flash:stream>".equals(head.toString())))) {
// Found closing stream:stream
foundMsg("");
}
}
/**
* This method verifies if the provided argument contains at least one numeric character reference (
* CharRef ::= '' [0-9]+ ';' | '' [0-9a-fA-F]+ ';
) for which the decimal or hexidecimal
* character value refers to an invalid XML 1.0 character.
*
* @param string
* The input string
* @return true if the input string contains an invalid numeric character reference, false
* otherwise.
* @see http://www.w3.org/TR/2008/REC-xml-20081126/#dt-charref
*/
public static boolean hasIllegalCharacterReferences(String string) {
// If there's no character reference, don't bother to do more specific checking.
final Matcher matcher = XML_HAS_CHARREF.matcher(string);
while (matcher.find()) {
final String decValue = matcher.group(2);
if (decValue != null) {
final int value = Integer.parseInt(decValue);
if (!isLegalXmlCharacter(value)) {
return true;
} else {
continue;
}
}
final String hexValue = matcher.group(3);
if (hexValue != null) {
final int value = Integer.parseInt(hexValue, 16);
if (!isLegalXmlCharacter(value)) {
return true;
} else {
continue;
}
}
// This is bad. The XML_HAS_CHARREF expression should have a hit for either the decimal
// or the heximal notation.
throw new IllegalStateException(
"An error occurred while searching for illegal character references in the value [" + string + "].");
}
return false;
}
/**
* Verifies if the codepoint value represents a valid character as defined in paragraph 2.2 of
* "Extensible Markup Language (XML) 1.0 (Fifth Edition)"
*
* @param value
* the codepoint
* @return true if the codepoint is a valid charater per XML 1.0 definition, false otherwise.
* @see http://www.w3.org/TR/2008/REC-xml-20081126/#NT-Char
*/
public static boolean isLegalXmlCharacter(int value) {
return value == 0x9 || value == 0xA || value == 0xD || (value >= 0x20 && value <= 0xD7FF)
|| (value >= 0xE000 && value <= 0xFFFD) || (value >= 0x10000 && value <= 0x10FFFF);
}
private static class PropertyListener implements PropertyEventListener {
@Override
public void propertySet(String property, Map params) {
if (MAX_PROPERTY_NAME.equals(property)) {
String value = (String) params.get("value");
if (value != null) {
maxBufferSize = Integer.parseInt(value);
}
}
}
@Override
public void propertyDeleted(String property, Map params) {
if (MAX_PROPERTY_NAME.equals(property)) {
// Use default value when none was specified
maxBufferSize = 1048576;
}
}
@Override
public void xmlPropertySet(String property, Map params) {
// Do nothing
}
@Override
public void xmlPropertyDeleted(String property, Map params) {
// Do nothing
}
}
}