// org.apache.axiom.om.util.XMLStreamWriterRemoveIllegalChars Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.axiom.om.util;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * An {@link XMLStreamWriterFilterBase} whose {@link #xmlData(String)} hook
 * removes characters that are not legal in XML 1.0 text.
 *
 * <p>Legal characters are defined by the {@code Char} production of the XML
 * specification:
 * http://www.w3.org/TR/2008/REC-xml-20081126/#NT-Char
 *
 * <pre>
 * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 * </pre>
 *
 * <p>Everything outside of those ranges is dropped: control characters below
 * 0x20 other than TAB, LINEFEED and CARRIAGE RETURN, unpaired surrogates, and
 * the two non-characters 0xFFFE and 0xFFFF.
 */
public class XMLStreamWriterRemoveIllegalChars extends
        XMLStreamWriterFilterBase {

    private static final Log log = LogFactory.getLog(XMLStreamWriterRemoveIllegalChars.class);

    // These two code points are never allowed
    private static final int FFFE = 0xFFFE;
    private static final int FFFF = 0xFFFF;

    // Code units in the surrogate range are not allowed on their own
    // (a valid high/low pair is seen as a single supplemental code point instead)
    private static final int SURROGATE_START = 0xD800;
    private static final int SURROGATE_END = 0xDFFF;

    /**
     * Creates the filter and, when debug logging is enabled, logs its creation.
     */
    public XMLStreamWriterRemoveIllegalChars() {
        super();
        if (log.isDebugEnabled()) {
            log.debug("Creating XMLStreamWriterRemoveIllegalChars object " + this);
        }
    }

    /**
     * Returns {@code value} with every illegal XML character removed.
     *
     * <p>When nothing has to be removed the very same String instance is
     * returned and no memory is allocated. A working buffer is only created
     * once the first illegal character is encountered.
     *
     * @param value the text to clean; may be {@code null}
     * @return {@code value} itself if it is {@code null} or contains no illegal
     *         characters, otherwise a new String without the illegal characters
     * @see org.apache.axiom.om.util.XMLStreamWriterFilterBase#xmlData(java.lang.String)
     */
    protected String xmlData(String value) {
        if (value == null) {
            // Nothing to filter; hand the null back instead of failing here
            return null;
        }

        final int len = value.length();
        char[] buffer = null; // created on demand, stays null in the normal case
        int out = 0;          // next write position in buffer (only meaningful once buffer exists)
        int i = 0;

        while (i < len) {
            // A valid surrogate pair is returned as one supplemental code point;
            // an unpaired surrogate is returned as-is and rejected by isIllegal.
            final int cp = value.codePointAt(i);

            if (isIllegal(cp)) {
                if (buffer == null) {
                    if (log.isDebugEnabled()) {
                        log.debug("One or more illegal characterss found. Codepoint=" + cp);
                    }
                    // The buffer starts as a copy of the input, so everything
                    // before index i is already in its final position.
                    buffer = value.toCharArray();
                    out = i;
                }
                // The illegal character is skipped by simply not writing it
            } else if (buffer != null) {
                // Compact the legal code point (1 or 2 chars) to the write position.
                // out never exceeds i, and reads come from the immutable input.
                out += Character.toChars(cp, buffer, out);
            }

            // Advance by 1 char, or by 2 for a supplemental character
            i += Character.charCount(cp);
        }

        // Normal case: nothing removed, so return the original String
        return (buffer == null) ? value : new String(buffer, 0, out);
    }

    /**
     * Tells whether a code point is outside of the XML 1.0 {@code Char} production.
     *
     * @param cp a code point as returned by {@link String#codePointAt(int)}
     * @return {@code true} if the code point must be removed
     */
    private static boolean isIllegal(int cp) {
        if (cp < 0x20) {
            // Control characters: only TAB, LINEFEED and CARRIAGE RETURN are allowed
            return cp != 0x09 && cp != 0x0A && cp != 0x0D;
        }
        return (cp >= SURROGATE_START && cp <= SURROGATE_END) // unpaired surrogate
                || cp == FFFE
                || cp == FFFF;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy