// com.rometools.rome.io.impl.XmlFixerReader Maven / Gradle / Ivy
/*
* Copyright 2005 Sun Microsystems, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.rometools.rome.io.impl;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A {@link Reader} wrapper that repairs a few common problems in XML feeds before the
 * characters reach an XML parser.
 *
 * <p>On the first read it skips leading whitespace and leading XML comments. While streaming
 * it:
 * <ul>
 * <li>replaces HTML 4 named entities (for example {@code &nbsp;}) with the equivalent numeric
 * character reference ({@code &#160;});</li>
 * <li>rewrites an ampersand that does not start an entity as {@code &amp;}, except inside a
 * CDATA section where a bare ampersand is left alone.</li>
 * </ul>
 *
 * <p>Mark/reset is not supported. Instances keep mutable parsing state and no synchronization
 * is performed.
 */
public class XmlFixerReader extends Reader {

    // States of the read() state machine.
    /** Passing characters straight through from the wrapped reader. */
    private static final int STATE_READ = 0;
    /** An '&' has been seen; collecting a possible entity into the buffer. */
    private static final int STATE_ENTITY = 1;
    /** A complete "&...;" token is buffered and has to be looked up. */
    private static final int STATE_REPLACE = 2;
    /** Handing the buffered characters to the caller one at a time. */
    private static final int STATE_FLUSH = 3;
    /** A '<' has been seen; buffering to find out whether a CDATA section starts. */
    private static final int STATE_CDATA_START = 4;
    /** Inside CDATA a ']' has been seen; buffering to find out whether the section ends. */
    private static final int STATE_CDATA_END = 5;

    // States of the trimStream() state machine (prefix seen so far).
    private static final int TRIM_START = 0; // nothing pending
    private static final int TRIM_LT = 1; // "<"
    private static final int TRIM_BANG = 2; // "<!"
    private static final int TRIM_BANG_DASH = 3; // "<!-"
    private static final int TRIM_COMMENT = 4; // inside "<!-- ..."
    private static final int TRIM_COMMENT_DASH = 5; // inside comment, last char was '-'
    private static final int TRIM_COMMENT_DASH_DASH = 6; // inside comment, last chars were "--"

    private static final String CDATA_START = "<![CDATA[";
    private static final String CDATA_END = "]]>";

    /** Literal entity ("&name;") to numeric character reference ("&#NNN;"). */
    private static final Map<String, String> CODED_ENTITIES = new HashMap<String, String>();

    /**
     * Matches a named entity reference. Digits are allowed after the first letter so that
     * names such as sup2, frac14 or there4 are recognised; numeric references ("&#160;") are
     * deliberately not matched because they need no translation.
     */
    private static final Pattern ENTITIES_PATTERN = Pattern.compile("&[A-Za-z][A-Za-z0-9]*;");

    /** The wrapped reader all characters are pulled from. */
    protected Reader in;

    /** Pending characters that have been read ahead and must be returned before reading on. */
    private final StringBuilder buffer = new StringBuilder();
    /** Index of the next character of {@link #buffer} to return while flushing. */
    private int bufferPos;
    /** Current state of the {@link #read()} state machine. */
    private int state = STATE_READ;
    /** Whether the leading whitespace/comments have already been skipped. */
    private boolean trimmed;
    /** Whether the read position is currently inside a CDATA section. */
    private boolean cdata;

    /**
     * Creates a fixer on top of the given reader.
     *
     * @param in the reader to take characters from; it is also used as the lock object of
     *        this reader and is closed by {@link #close()}
     */
    public XmlFixerReader(final Reader in) {
        super(in);
        this.in = in;
    }

    /**
     * Skips leading whitespace and leading XML comments of the wrapped stream.
     *
     * <p>Characters that were read ahead but turned out not to belong to a comment are left in
     * the buffer and the reader is switched to the flushing state so that they are returned
     * first.
     *
     * @return {@code false} if the stream ended with nothing but whitespace and comments,
     *         {@code true} otherwise
     * @throws IOException if the wrapped reader fails
     */
    private boolean trimStream() throws IOException {
        int trimState = TRIM_START;
        while (true) {
            final int c = in.read();
            if (c == -1) {
                if (trimState == TRIM_START) {
                    return false;
                }
                // Stream ended in the middle of a prefix/comment: hand out what was buffered.
                state = STATE_FLUSH;
                return true;
            }
            switch (trimState) {
                case TRIM_START:
                    if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                        break;
                    }
                    buffer.setLength(0);
                    bufferPos = 0;
                    buffer.append((char) c);
                    if (c != '<') {
                        state = STATE_FLUSH;
                        return true;
                    }
                    trimState = TRIM_LT;
                    break;
                case TRIM_LT:
                    buffer.append((char) c);
                    if (c != '!') {
                        state = STATE_FLUSH;
                        return true;
                    }
                    trimState = TRIM_BANG;
                    break;
                case TRIM_BANG:
                    buffer.append((char) c);
                    if (c != '-') {
                        state = STATE_FLUSH;
                        return true;
                    }
                    trimState = TRIM_BANG_DASH;
                    break;
                case TRIM_BANG_DASH:
                    buffer.append((char) c);
                    if (c != '-') {
                        state = STATE_FLUSH;
                        return true;
                    }
                    trimState = TRIM_COMMENT;
                    break;
                case TRIM_COMMENT:
                    buffer.append((char) c);
                    if (c == '-') {
                        trimState = TRIM_COMMENT_DASH;
                    }
                    break;
                case TRIM_COMMENT_DASH:
                    buffer.append((char) c);
                    trimState = c == '-' ? TRIM_COMMENT_DASH_DASH : TRIM_COMMENT;
                    break;
                case TRIM_COMMENT_DASH_DASH:
                    if (c == '>') {
                        // Comment complete: drop it and look for more leading noise.
                        buffer.setLength(0);
                        bufferPos = 0;
                        trimState = TRIM_START;
                    } else {
                        buffer.append((char) c);
                        // Another '-' still leaves "--" as the last two characters, so a
                        // comment closed by "--->" is recognised as well.
                        if (c != '-') {
                            trimState = TRIM_COMMENT;
                        }
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Returns the next character of the repaired stream.
     *
     * @return the character, or {@code -1} at the end of the stream
     * @throws IOException if the wrapped reader fails
     */
    @Override
    public int read() throws IOException {
        if (!trimmed) { // trims XML stream
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        while (true) { // converts literal entities to coded entities
            switch (state) {
                case STATE_READ: {
                    final int c = in.read();
                    if (c == '&') {
                        startBuffering(c, STATE_ENTITY);
                    } else if (c == '<') {
                        startBuffering(c, STATE_CDATA_START);
                    } else if (c == ']' && cdata) {
                        startBuffering(c, STATE_CDATA_END);
                    } else {
                        return c; // ordinary character or end of stream
                    }
                    break;
                }
                case STATE_ENTITY: {
                    final int c = in.read();
                    if (c == ';') {
                        buffer.append(';');
                        state = STATE_REPLACE;
                    } else if (isEntityChar(c)) {
                        buffer.append((char) c);
                    } else {
                        // no ';' to match the '&': outside CDATA turn the '&' into the
                        // legal XML entity '&amp;'
                        if (!cdata) {
                            buffer.insert(1, "amp;");
                        }
                        if (c > -1) {
                            buffer.append((char) c);
                        }
                        state = STATE_FLUSH;
                    }
                    break;
                }
                case STATE_REPLACE: {
                    final String codedEntity = CODED_ENTITIES.get(buffer.toString());
                    if (codedEntity != null) {
                        buffer.setLength(0);
                        buffer.append(codedEntity);
                    } // else we leave what was in the stream
                    state = STATE_FLUSH;
                    break;
                }
                case STATE_FLUSH:
                    if (bufferPos < buffer.length()) {
                        return buffer.charAt(bufferPos++);
                    }
                    state = STATE_READ;
                    break;
                case STATE_CDATA_START: {
                    final int c = in.read();
                    state = STATE_FLUSH;
                    switch (c) {
                        case -1:
                            // end of stream
                            break;
                        case ' ':
                        case '>':
                        case '/':
                            // tag end or something like this
                            buffer.append((char) c);
                            break;
                        case '[':
                            buffer.append((char) c);
                            if (CDATA_START.equals(buffer.toString())) {
                                cdata = true;
                            } else {
                                state = STATE_CDATA_START;
                            }
                            break;
                        default:
                            buffer.append((char) c);
                            state = STATE_CDATA_START;
                            break;
                    }
                    break;
                }
                case STATE_CDATA_END: {
                    final int c = in.read();
                    state = STATE_FLUSH;
                    switch (c) {
                        case -1:
                            // end of stream
                            break;
                        case ']':
                            buffer.append((char) c);
                            state = STATE_CDATA_END;
                            break;
                        case '>':
                            buffer.append((char) c);
                            // endsWith rather than equals: "]]]>" also closes the section,
                            // the extra ']' being part of its content.
                            if (buffer.toString().endsWith(CDATA_END)) {
                                cdata = false;
                            }
                            break;
                        default:
                            buffer.append((char) c);
                            break;
                    }
                    break;
                }
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /** Restarts the buffer with {@code c} as its only character and switches to {@code next}. */
    private void startBuffering(final int c, final int next) {
        buffer.setLength(0);
        bufferPos = 0;
        buffer.append((char) c);
        state = next;
    }

    /** Tells whether {@code c} may appear between the '&' and the ';' of an entity. */
    private static boolean isEntityChar(final int c) {
        return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == '#' || c >= '0' && c <= '9';
    }

    /**
     * Reads repaired characters into a portion of an array, stopping early only at the end of
     * the stream.
     *
     * @param buffer destination array
     * @param offset index of the first element to write
     * @param len maximum number of characters to read
     * @return the number of characters stored, {@code 0} if {@code len} is zero, or {@code -1}
     *         if the stream is already at its end
     * @throws IndexOutOfBoundsException if {@code offset}/{@code len} do not describe a range
     *         inside {@code buffer}
     * @throws IOException if the wrapped reader fails
     */
    @Override
    public int read(final char[] buffer, final int offset, final int len) throws IOException {
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException("offset=" + offset + ", len=" + len + ", buffer.length=" + buffer.length);
        }
        if (len == 0) {
            // Nothing requested: do not consume a character or touch the array.
            return 0;
        }
        int c = this.read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        buffer[offset + charsRead++] = (char) c;
        while (charsRead < len && (c = this.read()) > -1) {
            buffer[offset + charsRead++] = (char) c;
        }
        return charsRead;
    }

    /**
     * Skips up to {@code n} characters of the repaired stream.
     *
     * @param n number of characters to skip
     * @return the number of characters actually skipped, which is smaller than {@code n} when
     *         the end of the stream is reached first
     * @throws IllegalArgumentException if {@code n} is negative
     * @throws IOException if the wrapped reader fails
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        // Only successful reads are counted; the read that hits end of stream is not.
        while (skipped < n && this.read() > -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether a read can proceed: either the state machine is in the middle of
     * processing buffered input, or the wrapped reader reports itself ready.
     */
    @Override
    public boolean ready() throws IOException {
        return state != STATE_READ || in.ready();
    }

    /** Mark/reset is not supported; always returns {@code false}. */
    @Override
    public boolean markSupported() {
        return false;
    }

    /** Always throws, since marking is not supported. */
    @Override
    public void mark(final int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /** Always throws, since marking is not supported. */
    @Override
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /** Closes the wrapped reader. */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /**
     * Registers the HTML entity {@code name} with its Unicode code point. The "&name;" key and
     * the "&#NNN;" value are assembled here so that no entity literal has to appear in the
     * table below.
     */
    private static void addEntity(final String name, final int codePoint) {
        CODED_ENTITIES.put('&' + name + ';', "&#" + codePoint + ';');
    }

    static {
        // note: refer to Character entity references in HTML 4
        // at http://www.w3.org/TR/REC-html40/sgml/entities.html

        // Character entity set.
        // HTMLlat1 "-//W3C//ENTITIES Latin 1//EN//HTML"
        addEntity("nbsp", 160);
        addEntity("iexcl", 161);
        addEntity("cent", 162);
        addEntity("pound", 163);
        addEntity("curren", 164);
        addEntity("yen", 165);
        addEntity("brvbar", 166);
        addEntity("sect", 167);
        addEntity("uml", 168);
        addEntity("copy", 169);
        addEntity("ordf", 170);
        addEntity("laquo", 171);
        addEntity("not", 172);
        addEntity("shy", 173);
        addEntity("reg", 174);
        addEntity("macr", 175);
        addEntity("deg", 176);
        addEntity("plusmn", 177);
        addEntity("sup2", 178);
        addEntity("sup3", 179);
        addEntity("acute", 180);
        addEntity("micro", 181);
        addEntity("para", 182);
        addEntity("middot", 183);
        addEntity("cedil", 184);
        addEntity("sup1", 185);
        addEntity("ordm", 186);
        addEntity("raquo", 187);
        addEntity("frac14", 188);
        addEntity("frac12", 189);
        addEntity("frac34", 190);
        addEntity("iquest", 191);
        addEntity("Agrave", 192);
        addEntity("Aacute", 193);
        addEntity("Acirc", 194);
        addEntity("Atilde", 195);
        addEntity("Auml", 196);
        addEntity("Aring", 197);
        addEntity("AElig", 198);
        addEntity("Ccedil", 199);
        addEntity("Egrave", 200);
        addEntity("Eacute", 201);
        addEntity("Ecirc", 202);
        addEntity("Euml", 203);
        addEntity("Igrave", 204);
        addEntity("Iacute", 205);
        addEntity("Icirc", 206);
        addEntity("Iuml", 207);
        addEntity("ETH", 208);
        addEntity("Ntilde", 209);
        addEntity("Ograve", 210);
        addEntity("Oacute", 211);
        addEntity("Ocirc", 212);
        addEntity("Otilde", 213);
        addEntity("Ouml", 214);
        addEntity("times", 215);
        addEntity("Oslash", 216);
        addEntity("Ugrave", 217);
        addEntity("Uacute", 218);
        addEntity("Ucirc", 219);
        addEntity("Uuml", 220);
        addEntity("Yacute", 221);
        addEntity("THORN", 222);
        addEntity("szlig", 223);
        addEntity("agrave", 224);
        addEntity("aacute", 225);
        addEntity("acirc", 226);
        addEntity("atilde", 227);
        addEntity("auml", 228);
        addEntity("aring", 229);
        addEntity("aelig", 230);
        addEntity("ccedil", 231);
        addEntity("egrave", 232);
        addEntity("eacute", 233);
        addEntity("ecirc", 234);
        addEntity("euml", 235);
        addEntity("igrave", 236);
        addEntity("iacute", 237);
        addEntity("icirc", 238);
        addEntity("iuml", 239);
        addEntity("eth", 240);
        addEntity("ntilde", 241);
        addEntity("ograve", 242);
        addEntity("oacute", 243);
        addEntity("ocirc", 244);
        addEntity("otilde", 245);
        addEntity("ouml", 246);
        addEntity("divide", 247);
        addEntity("oslash", 248);
        addEntity("ugrave", 249);
        addEntity("uacute", 250);
        addEntity("ucirc", 251);
        addEntity("uuml", 252);
        addEntity("yacute", 253);
        addEntity("thorn", 254);
        addEntity("yuml", 255);

        // Mathematical, Greek and Symbolic characters for HTML.
        // HTMLsymbol "-//W3C//ENTITIES Symbols//EN//HTML"
        addEntity("fnof", 402);
        addEntity("Alpha", 913);
        addEntity("Beta", 914);
        addEntity("Gamma", 915);
        addEntity("Delta", 916);
        addEntity("Epsilon", 917);
        addEntity("Zeta", 918);
        addEntity("Eta", 919);
        addEntity("Theta", 920);
        addEntity("Iota", 921);
        addEntity("Kappa", 922);
        addEntity("Lambda", 923);
        addEntity("Mu", 924);
        addEntity("Nu", 925);
        addEntity("Xi", 926);
        addEntity("Omicron", 927);
        addEntity("Pi", 928);
        addEntity("Rho", 929);
        addEntity("Sigma", 931);
        addEntity("Tau", 932);
        addEntity("Upsilon", 933);
        addEntity("Phi", 934);
        addEntity("Chi", 935);
        addEntity("Psi", 936);
        addEntity("Omega", 937);
        addEntity("alpha", 945);
        addEntity("beta", 946);
        addEntity("gamma", 947);
        addEntity("delta", 948);
        addEntity("epsilon", 949);
        addEntity("zeta", 950);
        addEntity("eta", 951);
        addEntity("theta", 952);
        addEntity("iota", 953);
        addEntity("kappa", 954);
        addEntity("lambda", 955);
        addEntity("mu", 956);
        addEntity("nu", 957);
        addEntity("xi", 958);
        addEntity("omicron", 959);
        addEntity("pi", 960);
        addEntity("rho", 961);
        addEntity("sigmaf", 962);
        addEntity("sigma", 963);
        addEntity("tau", 964);
        addEntity("upsilon", 965);
        addEntity("phi", 966);
        addEntity("chi", 967);
        addEntity("psi", 968);
        addEntity("omega", 969);
        addEntity("thetasym", 977);
        addEntity("upsih", 978);
        addEntity("piv", 982);
        addEntity("bull", 8226);
        addEntity("hellip", 8230);
        addEntity("prime", 8242);
        addEntity("Prime", 8243);
        addEntity("oline", 8254);
        addEntity("frasl", 8260);
        addEntity("weierp", 8472);
        addEntity("image", 8465);
        addEntity("real", 8476);
        addEntity("trade", 8482);
        addEntity("alefsym", 8501);
        addEntity("larr", 8592);
        addEntity("uarr", 8593);
        addEntity("rarr", 8594);
        addEntity("darr", 8595);
        addEntity("harr", 8596);
        addEntity("crarr", 8629);
        addEntity("lArr", 8656);
        addEntity("uArr", 8657);
        addEntity("rArr", 8658);
        addEntity("dArr", 8659);
        addEntity("hArr", 8660);
        addEntity("forall", 8704);
        addEntity("part", 8706);
        addEntity("exist", 8707);
        addEntity("empty", 8709);
        addEntity("nabla", 8711);
        addEntity("isin", 8712);
        addEntity("notin", 8713);
        addEntity("ni", 8715);
        addEntity("prod", 8719);
        addEntity("sum", 8721);
        addEntity("minus", 8722);
        addEntity("lowast", 8727);
        addEntity("radic", 8730);
        addEntity("prop", 8733);
        addEntity("infin", 8734);
        addEntity("ang", 8736);
        addEntity("and", 8743);
        addEntity("or", 8744);
        addEntity("cap", 8745);
        addEntity("cup", 8746);
        addEntity("int", 8747);
        addEntity("there4", 8756);
        addEntity("sim", 8764);
        addEntity("cong", 8773);
        addEntity("asymp", 8776);
        addEntity("ne", 8800);
        addEntity("equiv", 8801);
        addEntity("le", 8804);
        addEntity("ge", 8805);
        addEntity("sub", 8834);
        addEntity("sup", 8835);
        addEntity("nsub", 8836);
        addEntity("sube", 8838);
        addEntity("supe", 8839);
        addEntity("oplus", 8853);
        addEntity("otimes", 8855);
        addEntity("perp", 8869);
        addEntity("sdot", 8901);
        addEntity("lceil", 8968);
        addEntity("rceil", 8969);
        addEntity("lfloor", 8970);
        addEntity("rfloor", 8971);
        addEntity("lang", 9001);
        addEntity("rang", 9002);
        addEntity("loz", 9674);
        addEntity("spades", 9824);
        addEntity("clubs", 9827);
        addEntity("hearts", 9829);
        addEntity("diams", 9830);

        // Special characters for HTML.
        // HTMLspecial "-//W3C//ENTITIES Special//EN//HTML"
        addEntity("quot", 34);
        addEntity("amp", 38);
        addEntity("lt", 60);
        addEntity("gt", 62);
        addEntity("OElig", 338);
        addEntity("oelig", 339);
        addEntity("Scaron", 352);
        addEntity("scaron", 353);
        addEntity("Yuml", 376);
        addEntity("circ", 710);
        addEntity("tilde", 732);
        addEntity("ensp", 8194);
        addEntity("emsp", 8195);
        addEntity("thinsp", 8201);
        addEntity("zwnj", 8204);
        addEntity("zwj", 8205);
        addEntity("lrm", 8206);
        addEntity("rlm", 8207);
        addEntity("ndash", 8211);
        addEntity("mdash", 8212);
        addEntity("lsquo", 8216);
        addEntity("rsquo", 8217);
        addEntity("sbquo", 8218);
        addEntity("ldquo", 8220);
        addEntity("rdquo", 8221);
        addEntity("bdquo", 8222);
        addEntity("dagger", 8224);
        addEntity("Dagger", 8225);
        addEntity("permil", 8240);
        addEntity("lsaquo", 8249);
        addEntity("rsaquo", 8250);
        addEntity("euro", 8364);
    }

    //
    // It shouldn't be here but well, just reusing the CODED_ENTITIES Map :)
    //

    /**
     * Replaces every known HTML named entity in {@code s} with its numeric character
     * reference. Unknown entities and numeric references are copied unchanged.
     *
     * @param s the text to process; must not be {@code null}
     * @return {@code s} itself when it contains no '&', otherwise the converted text
     */
    public String processHtmlEntities(final String s) {
        if (s.indexOf('&') == -1) {
            return s;
        }
        final StringBuilder sb = new StringBuilder(s.length());
        final Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        // Single pass over the input: copy the text between matches, translate each match.
        while (m.find()) {
            sb.append(s, pos, m.start());
            final String literalEntity = m.group();
            final String codedEntity = CODED_ENTITIES.get(literalEntity);
            sb.append(codedEntity != null ? codedEntity : literalEntity);
            pos = m.end();
        }
        sb.append(s, pos, s.length());
        return sb.toString();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy