
com.rometools.rome.io.impl.XmlFixerReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of rome Show documentation
Show all versions of rome Show documentation
All Roads Lead to ROME. ROME is a set of Atom/RSS Java utilities that make it
easy to work in Java with most syndication formats. Today it accepts all flavors of RSS
(0.90, 0.91, 0.92, 0.93, 0.94, 1.0 and 2.0), Atom 0.3 and Atom 1.0 feeds. Rome includes
a set of parsers and generators for the various flavors of feeds, as well as converters
to convert from one format to another. The parsers can give you back Java objects that
are either specific for the format you want to work with, or a generic normalized
SyndFeed object that lets you work with the data without bothering about the
underlying format.
/*
* Copyright 2005 Sun Microsystems, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.rometools.rome.io.impl;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A {@link Reader} decorator that repairs common defects of feed documents while they are read.
 * <p>
 * On the first read it discards leading whitespace and leading XML comments. Afterwards it
 * rewrites named HTML entities (for example {@code &nbsp;}) to numeric character references
 * (for example {@code &#160;}) and, outside of CDATA sections, turns an {@code &} that does not
 * start an entity into {@code &amp;}.
 * <p>
 * Instances keep mutable parsing state and perform no synchronization of their own.
 *
 * @author Alejandro Abdelnur
 */
public class XmlFixerReader extends Reader {

    // States of the state machine driven by read().
    /** Reading plain characters from the underlying stream. */
    private static final int STATE_READ = 0;
    /** An '&amp;' was read; collecting the characters of a possible entity. */
    private static final int STATE_ENTITY = 1;
    /** A complete "&amp;...;" sequence is buffered and must be looked up. */
    private static final int STATE_REPLACE = 2;
    /** Handing the buffered characters out to the caller one at a time. */
    private static final int STATE_FLUSH = 3;
    /** A '&lt;' was read; buffering the tag start to detect a CDATA section start. */
    private static final int STATE_TAG = 4;
    /** A ']' was read inside a CDATA section; checking for the CDATA section end. */
    private static final int STATE_CDATA_END = 5;

    private static final String CDATA_START = "<![CDATA[";
    private static final String CDATA_END = "]]>";

    /** Named entity (for example "&amp;nbsp;") to numeric reference (for example "&amp;#160;"). */
    private static final Map<String, String> CODED_ENTITIES = buildCodedEntities();

    //
    // It shouldn't be here but well, just reusing the CODED_ENTITIES Map :)
    //
    // A named entity is a letter followed by letters or digits (sup2, frac12, there4, ...).
    private static final Pattern ENTITIES_PATTERN = Pattern.compile("&[A-Za-z][A-Za-z0-9]*;");

    /** The wrapped reader all characters are pulled from. */
    protected Reader in;

    /** Look-ahead characters that were consumed from {@link #in} but not yet returned. */
    private final StringBuilder buffer = new StringBuilder();
    /** Index of the next character of {@link #buffer} to hand out. */
    private int bufferPos;
    /** Current state of the read() state machine (one of the STATE_* constants). */
    private int state = STATE_READ;
    /** Whether the leading whitespace/comments have already been skipped. */
    private boolean trimmed;
    /** Whether the read position is currently inside a CDATA section. */
    private boolean cdata;

    /**
     * Creates a fixing reader on top of the given reader.
     *
     * @param in the reader to read the raw document from; it is also used as the lock object
     */
    public XmlFixerReader(final Reader in) {
        super(in);
        this.in = in;
    }

    /** Empties the look-ahead buffer and stores {@code c} as its first character. */
    private void startBuffer(final int c) {
        buffer.setLength(0);
        bufferPos = 0;
        buffer.append((char) c);
    }

    /** Tells whether {@code c} may appear between the '&amp;' and the ';' of an entity. */
    private static boolean isEntityChar(final int c) {
        return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == '#' || c >= '0' && c <= '9';
    }

    /**
     * Skips leading whitespace and leading XML comments of the underlying stream.
     * <p>
     * Characters that were read while looking for a comment but turned out to be content are
     * left in the look-ahead buffer and the reader is switched to the flush state so that
     * {@link #read()} returns them first.
     *
     * @return {@code false} if the stream ended before any content was found, {@code true}
     *         otherwise
     * @throws IOException if the underlying reader fails
     */
    private boolean trimStream() throws IOException {
        // 0: skipping whitespace      1: got "<"      2: got "<!"     3: got "<!-"
        // 4: inside a comment         5: got "-" in a comment         6: got "--" in a comment
        int trimState = 0;
        while (true) {
            final int c = in.read();
            if (c == -1) {
                if (trimState == 0) {
                    return false;
                }
                // stream ended in the middle of a tag/comment start: hand out what was read
                state = STATE_FLUSH;
                return true;
            }
            switch (trimState) {
                case 0:
                    if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                        break;
                    }
                    startBuffer(c);
                    if (c != '<') {
                        state = STATE_FLUSH;
                        return true;
                    }
                    trimState = 1;
                    break;
                case 1:
                    buffer.append((char) c);
                    if (c != '!') {
                        state = STATE_FLUSH;
                        return true;
                    }
                    trimState = 2;
                    break;
                case 2:
                case 3:
                    buffer.append((char) c);
                    if (c != '-') {
                        state = STATE_FLUSH;
                        return true;
                    }
                    trimState++;
                    break;
                case 4:
                    buffer.append((char) c);
                    if (c == '-') {
                        trimState = 5;
                    }
                    break;
                case 5:
                    buffer.append((char) c);
                    trimState = c == '-' ? 6 : 4;
                    break;
                case 6:
                    if (c == '>') {
                        // comment complete: drop it and go on trimming
                        buffer.setLength(0);
                        trimState = 0;
                    } else {
                        buffer.append((char) c);
                        if (c != '-') {
                            trimState = 4;
                        }
                        // on another '-' the last two characters are still "--": stay here
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads a single character of the repaired document.
     *
     * @return the character read, or {@code -1} if the end of the stream has been reached
     * @throws IOException if the underlying reader fails
     */
    @Override
    public int read() throws IOException {
        if (!trimmed) { // trims XML stream
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        int c;
        boolean loop;
        do { // converts literal entities to coded entities
            switch (state) {
                case STATE_READ: // reading chars from stream
                    c = in.read();
                    if (c == '&') {
                        state = STATE_ENTITY;
                        startBuffer(c);
                        loop = true;
                    } else if (c == '<') {
                        state = STATE_TAG;
                        startBuffer(c);
                        loop = true;
                    } else if (c == ']' && cdata) {
                        state = STATE_CDATA_END;
                        startBuffer(c);
                        loop = true;
                    } else {
                        // ordinary character or end of stream (-1): return it as is
                        loop = false;
                    }
                    break;
                case STATE_ENTITY: // reading entity from stream
                    c = in.read();
                    loop = true;
                    if (c == ';') {
                        buffer.append((char) c);
                        state = STATE_REPLACE;
                    } else if (isEntityChar(c)) {
                        buffer.append((char) c);
                    } else {
                        // no ';' to match the '&' lets just make the '&'
                        // a legal xml character entity '&amp;'
                        if (!cdata) {
                            buffer.insert(1, "amp;");
                        }
                        if (c > -1) {
                            buffer.append((char) c);
                        }
                        state = STATE_FLUSH;
                    }
                    break;
                case STATE_REPLACE: // replacing entity
                    c = 0;
                    final String codedEntity = CODED_ENTITIES.get(buffer.toString());
                    if (codedEntity != null) {
                        buffer.setLength(0);
                        buffer.append(codedEntity);
                    } // else we leave what was in the stream
                    state = STATE_FLUSH;
                    loop = true;
                    break;
                case STATE_FLUSH: // consuming buffer
                    if (bufferPos < buffer.length()) {
                        c = buffer.charAt(bufferPos++);
                        loop = false;
                    } else {
                        c = 0;
                        state = STATE_READ;
                        loop = true;
                    }
                    break;
                case STATE_TAG: // checking for CDATA
                    c = in.read();
                    loop = true;
                    state = STATE_FLUSH;
                    switch (c) {
                        case -1:
                            // end of stream
                            break;
                        case ' ':
                        case '>':
                        case '/':
                            // tag end or something like this
                            buffer.append((char) c);
                            break;
                        case '[':
                            buffer.append((char) c);
                            if (CDATA_START.equals(buffer.toString())) {
                                cdata = true;
                            } else {
                                // e.g. the first '[' of "<![CDATA[": keep collecting
                                state = STATE_TAG;
                            }
                            break;
                        default:
                            // still inside the tag start: keep collecting
                            buffer.append((char) c);
                            state = STATE_TAG;
                            break;
                    }
                    break;
                case STATE_CDATA_END: // checking for end of CDATA
                    c = in.read();
                    loop = true;
                    state = STATE_FLUSH;
                    switch (c) {
                        case -1:
                            // end of stream
                            break;
                        case ']':
                            // another bracket, the section end may still follow
                            buffer.append((char) c);
                            state = STATE_CDATA_END;
                            break;
                        case '>':
                            buffer.append((char) c);
                            // endsWith so that "]]]>" (content ending in ']') also closes
                            if (buffer.toString().endsWith(CDATA_END)) {
                                cdata = false;
                            }
                            break;
                        default:
                            buffer.append((char) c);
                            break;
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        } while (loop);
        return c;
    }

    /**
     * Reads repaired characters into a portion of an array.
     *
     * @param buffer destination array
     * @param offset index of {@code buffer} at which to store the first character
     * @param len maximum number of characters to read
     * @return the number of characters stored, {@code 0} if {@code len} is zero, or {@code -1}
     *         if the end of the stream was reached before any character could be read
     * @throws IOException if the underlying reader fails
     */
    @Override
    public int read(final char[] buffer, final int offset, final int len) throws IOException {
        if (len == 0) {
            // nothing requested: do not consume a character or touch the array
            return 0;
        }
        int c = this.read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        buffer[offset + charsRead++] = (char) c;
        while (charsRead < len && (c = this.read()) > -1) {
            buffer[offset + charsRead++] = (char) c;
        }
        return charsRead;
    }

    /**
     * Skips up to {@code n} characters of the repaired document.
     *
     * @param n the number of characters to skip
     * @return the number of characters actually skipped, which is smaller than {@code n} if the
     *         end of the stream was reached first
     * @throws IllegalArgumentException if {@code n} is negative
     * @throws IOException if the underlying reader fails
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        // count only reads that delivered a character, not the one that hit the end
        while (skipped < n && this.read() > -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether a read can be served from buffered state or the underlying reader is ready.
     *
     * @return {@code true} if the state machine is not in its plain reading state or the
     *         underlying reader reports ready
     * @throws IOException if the underlying reader fails
     */
    @Override
    public boolean ready() throws IOException {
        return state != STATE_READ || in.ready();
    }

    /** @return always {@code false}; mark/reset is not supported */
    @Override
    public boolean markSupported() {
        return false;
    }

    /** Always fails because mark/reset is not supported. */
    @Override
    public void mark(final int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /** Always fails because mark/reset is not supported. */
    @Override
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /** Closes the underlying reader. */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /**
     * Registers consecutive entities: {@code names[i]} is mapped to the numeric character
     * reference for code point {@code firstCode + i}.
     */
    private static void addRun(final Map<String, String> target, final int firstCode, final String... names) {
        for (int i = 0; i < names.length; i++) {
            target.put("&" + names[i] + ";", "&#" + (firstCode + i) + ";");
        }
    }

    /** Builds the table of HTML 4 named entities and their numeric character references. */
    private static Map<String, String> buildCodedEntities() {
        final Map<String, String> m = new HashMap<String, String>();

        // note: refer to Character entity references in HTML 4
        // at http://www.w3.org/TR/REC-html40/sgml/entities.html

        // Character entity set.
        // HTMLlat1 "-//W3C//ENTITIES Latin 1//EN//HTML" (code points 160..255, contiguous)
        addRun(m, 160,
                "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
                "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
                "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
                "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
                "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
                "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
                "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
                "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
                "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
                "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
                "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
                "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml");

        // Mathematical, Greek and Symbolic characters for HTML.
        // HTMLsymbol "-//W3C//ENTITIES Symbols//EN//HTML"
        addRun(m, 402, "fnof");
        addRun(m, 913,
                "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", "Iota",
                "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho");
        addRun(m, 931, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega");
        addRun(m, 945,
                "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota",
                "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigmaf", "sigma",
                "tau", "upsilon", "phi", "chi", "psi", "omega");
        addRun(m, 977, "thetasym", "upsih");
        addRun(m, 982, "piv");
        addRun(m, 8226, "bull");
        addRun(m, 8230, "hellip");
        addRun(m, 8242, "prime", "Prime");
        addRun(m, 8254, "oline");
        addRun(m, 8260, "frasl");
        addRun(m, 8472, "weierp");
        addRun(m, 8465, "image");
        addRun(m, 8476, "real");
        addRun(m, 8482, "trade");
        addRun(m, 8501, "alefsym");
        addRun(m, 8592, "larr", "uarr", "rarr", "darr", "harr");
        addRun(m, 8629, "crarr");
        addRun(m, 8656, "lArr", "uArr", "rArr", "dArr", "hArr");
        addRun(m, 8704, "forall");
        addRun(m, 8706, "part", "exist");
        addRun(m, 8709, "empty");
        addRun(m, 8711, "nabla", "isin", "notin");
        addRun(m, 8715, "ni");
        addRun(m, 8719, "prod");
        addRun(m, 8721, "sum", "minus");
        addRun(m, 8727, "lowast");
        addRun(m, 8730, "radic");
        addRun(m, 8733, "prop", "infin");
        addRun(m, 8736, "ang");
        addRun(m, 8743, "and", "or", "cap", "cup", "int");
        addRun(m, 8756, "there4");
        addRun(m, 8764, "sim");
        addRun(m, 8773, "cong");
        addRun(m, 8776, "asymp");
        addRun(m, 8800, "ne", "equiv");
        addRun(m, 8804, "le", "ge");
        addRun(m, 8834, "sub", "sup", "nsub");
        addRun(m, 8838, "sube", "supe");
        addRun(m, 8853, "oplus");
        addRun(m, 8855, "otimes");
        addRun(m, 8869, "perp");
        addRun(m, 8901, "sdot");
        addRun(m, 8968, "lceil", "rceil", "lfloor", "rfloor");
        addRun(m, 9001, "lang", "rang");
        addRun(m, 9674, "loz");
        addRun(m, 9824, "spades");
        addRun(m, 9827, "clubs");
        addRun(m, 9829, "hearts", "diams");

        // Special characters for HTML.
        // HTMLspecial "-//W3C//ENTITIES Special//EN//HTML"
        addRun(m, 34, "quot");
        addRun(m, 38, "amp");
        addRun(m, 60, "lt");
        addRun(m, 62, "gt");
        addRun(m, 338, "OElig", "oelig");
        addRun(m, 352, "Scaron", "scaron");
        addRun(m, 376, "Yuml");
        addRun(m, 710, "circ");
        addRun(m, 732, "tilde");
        addRun(m, 8194, "ensp", "emsp");
        addRun(m, 8201, "thinsp");
        addRun(m, 8204, "zwnj", "zwj", "lrm", "rlm");
        addRun(m, 8211, "ndash", "mdash");
        addRun(m, 8216, "lsquo", "rsquo", "sbquo");
        addRun(m, 8220, "ldquo", "rdquo", "bdquo");
        addRun(m, 8224, "dagger", "Dagger");
        addRun(m, 8240, "permil");
        addRun(m, 8249, "lsaquo", "rsaquo");
        addRun(m, 8364, "euro");
        return m;
    }

    /**
     * Replaces every known named HTML entity in {@code s} with its numeric character reference.
     * Entities that are not in the table, and all other text, are copied unchanged.
     *
     * @param s the text to process
     * @return the processed text; {@code s} itself if it contains no {@code '&'}
     */
    public String processHtmlEntities(final String s) {
        if (s.indexOf('&') == -1) {
            return s;
        }
        final StringBuilder sb = new StringBuilder(s.length());
        final Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        while (m.find()) {
            // text between the previous entity and this one
            sb.append(s, pos, m.start());
            final String literalEntity = m.group();
            final String codedEntity = CODED_ENTITIES.get(literalEntity);
            sb.append(codedEntity != null ? codedEntity : literalEntity);
            pos = m.end();
        }
        // text after the last entity
        sb.append(s, pos, s.length());
        return sb.toString();
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy