All downloads are free. Search and download functionality uses the official Maven repository.

com.rometools.rome.io.impl.XmlFixerReader Maven / Gradle / Ivy

/*
 * Copyright 2005 Sun Microsystems, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package com.rometools.rome.io.impl;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A {@link Reader} decorator that repairs two common defects of real-world XML feeds while the
 * characters stream through it:
 * <ul>
 * <li>leading whitespace and leading comments (which would precede the XML declaration) are
 * dropped;</li>
 * <li>HTML 4 named character entities (for example the entity named {@code nbsp}) are rewritten
 * as numeric character references, and a bare {@code '&'} that does not start an entity is
 * escaped, except inside a CDATA section.</li>
 * </ul>
 *
 * <p>Marking is not supported. Instances keep mutable parsing state and are not safe for
 * concurrent use.</p>
 */
public class XmlFixerReader extends Reader {

    // ---- states of the read() state machine --------------------------------------------

    /** Passing characters straight through from the underlying reader. */
    private static final int READING = 0;
    /** An {@code '&'} was seen; collecting a candidate entity into the buffer. */
    private static final int READING_ENTITY = 1;
    /** A complete {@code &...;} is buffered and is about to be looked up. */
    private static final int REPLACING_ENTITY = 2;
    /** Handing out the buffered characters one by one. */
    private static final int CONSUMING_BUFFER = 3;
    /** A {@code '<'} was seen; buffering to find out whether a CDATA section starts. */
    private static final int CHECKING_CDATA_START = 4;
    /** A {@code ']'} was seen inside CDATA; buffering to find out whether the section ends. */
    private static final int CHECKING_CDATA_END = 5;

    // ---- states of the trimStream() state machine --------------------------------------

    /** Skipping leading whitespace, looking for the first significant character. */
    private static final int TRIM_SKIPPING = 0;
    /** Seen {@code "<"}. */
    private static final int TRIM_LT = 1;
    /** Seen {@code "<!"}. */
    private static final int TRIM_LT_BANG = 2;
    /** Seen {@code "<!-"}. */
    private static final int TRIM_LT_BANG_DASH = 3;
    /** Inside a comment body. */
    private static final int TRIM_COMMENT = 4;
    /** Inside a comment body, the previous character was a single {@code '-'}. */
    private static final int TRIM_COMMENT_DASH = 5;
    /** Inside a comment body, the previous characters were {@code "--"} (or more dashes). */
    private static final int TRIM_COMMENT_DASH_DASH = 6;

    private static final String CDATA_START = "<![CDATA[";
    private static final String CDATA_END = "]]>";

    /** The wrapped reader supplying the raw characters. */
    protected Reader in;

    /** Whether the leading whitespace/comments have already been stripped. */
    private boolean trimmed;
    /** Whether the current position is inside a CDATA section. */
    private boolean cdata = false;
    /** Characters that were read ahead and still have to be delivered. */
    private final StringBuilder buffer;
    /** Index of the next character of {@link #buffer} to deliver. */
    private int bufferPos;
    /** Current state of the {@link #read()} state machine. */
    private int state = READING;

    /**
     * Creates a fixing reader on top of {@code in}.
     *
     * @param in the reader to wrap; it is also used as the lock object of this reader
     * @throws NullPointerException if {@code in} is {@code null}
     */
    public XmlFixerReader(final Reader in) {
        super(in);
        this.in = in;
        buffer = new StringBuilder();
    }

    /**
     * Consumes leading whitespace and leading comments from the underlying reader.
     *
     * <p>When it returns {@code true}, the characters that were read ahead but are not part of
     * a complete comment are left in the buffer and the reader is switched to
     * {@link #CONSUMING_BUFFER} so that {@link #read()} delivers them first.</p>
     *
     * @return {@code false} if the stream ended while only whitespace and complete comments had
     *         been seen, {@code true} otherwise
     * @throws IOException if the underlying reader fails
     */
    private boolean trimStream() throws IOException {
        int trimState = TRIM_SKIPPING;
        while (true) {
            final int c = in.read();
            if (c == -1) {
                if (trimState == TRIM_SKIPPING) {
                    return false;
                }
                // Stream ended in the middle of "<", "<!", or an unterminated comment:
                // deliver whatever was buffered unchanged.
                state = CONSUMING_BUFFER;
                return true;
            }
            switch (trimState) {
                case TRIM_SKIPPING:
                    if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                        break;
                    }
                    buffer.setLength(0);
                    bufferPos = 0;
                    buffer.append((char) c);
                    if (c != '<') {
                        state = CONSUMING_BUFFER;
                        return true;
                    }
                    trimState = TRIM_LT;
                    break;
                case TRIM_LT:
                    buffer.append((char) c);
                    if (c != '!') {
                        state = CONSUMING_BUFFER;
                        return true;
                    }
                    trimState = TRIM_LT_BANG;
                    break;
                case TRIM_LT_BANG:
                    buffer.append((char) c);
                    if (c != '-') {
                        state = CONSUMING_BUFFER;
                        return true;
                    }
                    trimState = TRIM_LT_BANG_DASH;
                    break;
                case TRIM_LT_BANG_DASH:
                    buffer.append((char) c);
                    if (c != '-') {
                        state = CONSUMING_BUFFER;
                        return true;
                    }
                    trimState = TRIM_COMMENT;
                    break;
                case TRIM_COMMENT:
                    buffer.append((char) c);
                    if (c == '-') {
                        trimState = TRIM_COMMENT_DASH;
                    }
                    break;
                case TRIM_COMMENT_DASH:
                    buffer.append((char) c);
                    trimState = c == '-' ? TRIM_COMMENT_DASH_DASH : TRIM_COMMENT;
                    break;
                case TRIM_COMMENT_DASH_DASH:
                    if (c == '>') {
                        // Comment complete: discard it and keep trimming.
                        buffer.setLength(0);
                        trimState = TRIM_SKIPPING;
                    } else {
                        buffer.append((char) c);
                        // A further '-' keeps the "--" suffix alive so that a comment closed
                        // with extra dashes ("--->") is still recognised as closed.
                        if (c != '-') {
                            trimState = TRIM_COMMENT;
                        }
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /** Restarts the look-ahead buffer with {@code c} and switches to {@code nextState}. */
    private void startBuffer(final int c, final int nextState) {
        buffer.setLength(0);
        bufferPos = 0;
        buffer.append((char) c);
        state = nextState;
    }

    /** Tells whether {@code c} may appear between the {@code '&'} and {@code ';'} of an entity. */
    private static boolean isEntityChar(final int c) {
        return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9' || c == '#';
    }

    /**
     * Reads the next character of the fixed stream.
     *
     * @return the character, or {@code -1} at the end of the stream
     * @throws IOException if the underlying reader fails
     */
    @Override
    public int read() throws IOException {
        if (!trimmed) { // trims XML stream
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        int c;
        boolean loop;
        do { // converts literal entities to coded entities
            switch (state) {
                case READING:
                    c = in.read();
                    if (c == '&') {
                        startBuffer(c, READING_ENTITY);
                        loop = true;
                    } else if (c == '<') {
                        startBuffer(c, CHECKING_CDATA_START);
                        loop = true;
                    } else if (c == ']' && cdata) {
                        startBuffer(c, CHECKING_CDATA_END);
                        loop = true;
                    } else {
                        // ordinary character or end of stream (-1): hand it out as is
                        loop = false;
                    }
                    break;
                case READING_ENTITY:
                    c = in.read();
                    if (c == ';') {
                        buffer.append((char) c);
                        state = REPLACING_ENTITY;
                    } else if (isEntityChar(c)) {
                        buffer.append((char) c);
                    } else {
                        // No ';' to match the '&': outside CDATA turn the '&' into a legal
                        // XML entity by inserting "amp;" right after it.
                        if (!cdata) {
                            buffer.insert(1, "amp;");
                        }
                        if (c > -1) {
                            buffer.append((char) c);
                        }
                        state = CONSUMING_BUFFER;
                    }
                    loop = true;
                    break;
                case REPLACING_ENTITY: {
                    c = 0;
                    final String codedEntity = CODED_ENTITIES.get(buffer.toString());
                    if (codedEntity != null) {
                        buffer.setLength(0);
                        buffer.append(codedEntity);
                    } // else we leave what was in the stream
                    state = CONSUMING_BUFFER;
                    loop = true;
                    break;
                }
                case CONSUMING_BUFFER:
                    if (bufferPos < buffer.length()) {
                        c = buffer.charAt(bufferPos++);
                        loop = false;
                    } else {
                        c = 0;
                        state = READING;
                        loop = true;
                    }
                    break;
                case CHECKING_CDATA_START:
                    c = in.read();
                    loop = true;
                    // Unless a branch below asks for more look-ahead, flush the buffer next.
                    state = CONSUMING_BUFFER;
                    switch (c) {
                        case -1:
                            // end of stream
                            break;
                        case ' ':
                        case '>':
                        case '/':
                            // tag end or something like this: cannot be a CDATA start
                            buffer.append((char) c);
                            break;
                        case '[':
                            buffer.append((char) c);
                            if (CDATA_START.equals(buffer.toString())) {
                                cdata = true;
                            } else {
                                state = CHECKING_CDATA_START;
                            }
                            break;
                        default:
                            buffer.append((char) c);
                            state = CHECKING_CDATA_START;
                            break;
                    }
                    break;
                case CHECKING_CDATA_END:
                    c = in.read();
                    loop = true;
                    state = CONSUMING_BUFFER;
                    switch (c) {
                        case -1:
                            // end of stream
                            break;
                        case ']':
                            buffer.append((char) c);
                            state = CHECKING_CDATA_END;
                            break;
                        case '>':
                            buffer.append((char) c);
                            // endsWith rather than equals: "]]]>" also closes the section.
                            if (buffer.toString().endsWith(CDATA_END)) {
                                cdata = false;
                            }
                            break;
                        default:
                            buffer.append((char) c);
                            break;
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        } while (loop);
        return c;
    }

    /**
     * Reads up to {@code len} fixed characters into {@code buffer}, blocking until that many
     * are available or the stream ends.
     *
     * @param buffer destination array
     * @param offset index of the first character to store
     * @param len maximum number of characters to read
     * @return the number of characters stored, {@code 0} if {@code len} is zero, or {@code -1}
     *         if the stream had already ended
     * @throws IndexOutOfBoundsException if {@code offset} or {@code len} is negative, or
     *         {@code len} exceeds the space available from {@code offset}
     * @throws IOException if the underlying reader fails
     */
    @Override
    public int read(final char[] buffer, final int offset, final int len) throws IOException {
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException("offset=" + offset + ", len=" + len + ", buffer.length=" + buffer.length);
        }
        if (len == 0) {
            return 0;
        }
        int charsRead = 0;
        int c;
        while (charsRead < len && (c = this.read()) > -1) {
            buffer[offset + charsRead++] = (char) c;
        }
        return charsRead == 0 ? -1 : charsRead;
    }

    /**
     * Skips up to {@code n} characters of the fixed stream.
     *
     * @param n the number of characters to skip
     * @return the number of characters actually skipped, which is smaller than {@code n} when
     *         the stream ends first
     * @throws IllegalArgumentException if {@code n} is negative
     * @throws IOException if the underlying reader fails
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        while (skipped < n && this.read() > -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether a call to {@link #read()} can proceed: either the state machine is not in
     * its pass-through state, or the underlying reader reports that it is ready.
     */
    @Override
    public boolean ready() throws IOException {
        return state != READING || in.ready();
    }

    /** Marking is not supported. */
    @Override
    public boolean markSupported() {
        return false;
    }

    /** Always fails, because marking is not supported. */
    @Override
    public void mark(final int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /** Always fails, because marking is not supported. */
    @Override
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /** Closes the underlying reader. */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /**
     * Maps an HTML named entity (including the leading {@code '&'} and trailing {@code ';'}) to
     * the equivalent decimal numeric character reference.
     */
    private static final Map<String, String> CODED_ENTITIES = new HashMap<String, String>(512);

    /**
     * Registers one named entity. The entries are declared as name plus code point, rather
     * than as two entity string literals, so that the table cannot be damaged by tools that
     * decode HTML entities found in source text.
     */
    private static void define(final String name, final int codePoint) {
        CODED_ENTITIES.put("&" + name + ";", "&#" + codePoint + ";");
    }

    static {
        // note: refer to Character entity references in HTML 4
        // at http://www.w3.org/TR/REC-html40/sgml/entities.html

        // Character entity set.
        // HTMLlat1 "-//W3C//ENTITIES Latin 1//EN//HTML"

        define("nbsp", 160);
        define("iexcl", 161);
        define("cent", 162);
        define("pound", 163);
        define("curren", 164);
        define("yen", 165);
        define("brvbar", 166);
        define("sect", 167);
        define("uml", 168);
        define("copy", 169);
        define("ordf", 170);
        define("laquo", 171);
        define("not", 172);
        define("shy", 173);
        define("reg", 174);
        define("macr", 175);
        define("deg", 176);
        define("plusmn", 177);
        define("sup2", 178);
        define("sup3", 179);
        define("acute", 180);
        define("micro", 181);
        define("para", 182);
        define("middot", 183);
        define("cedil", 184);
        define("sup1", 185);
        define("ordm", 186);
        define("raquo", 187);
        define("frac14", 188);
        define("frac12", 189);
        define("frac34", 190);
        define("iquest", 191);
        define("Agrave", 192);
        define("Aacute", 193);
        define("Acirc", 194);
        define("Atilde", 195);
        define("Auml", 196);
        define("Aring", 197);
        define("AElig", 198);
        define("Ccedil", 199);
        define("Egrave", 200);
        define("Eacute", 201);
        define("Ecirc", 202);
        define("Euml", 203);
        define("Igrave", 204);
        define("Iacute", 205);
        define("Icirc", 206);
        define("Iuml", 207);
        define("ETH", 208);
        define("Ntilde", 209);
        define("Ograve", 210);
        define("Oacute", 211);
        define("Ocirc", 212);
        define("Otilde", 213);
        define("Ouml", 214);
        define("times", 215);
        define("Oslash", 216);
        define("Ugrave", 217);
        define("Uacute", 218);
        define("Ucirc", 219);
        define("Uuml", 220);
        define("Yacute", 221);
        define("THORN", 222);
        define("szlig", 223);
        define("agrave", 224);
        define("aacute", 225);
        define("acirc", 226);
        define("atilde", 227);
        define("auml", 228);
        define("aring", 229);
        define("aelig", 230);
        define("ccedil", 231);
        define("egrave", 232);
        define("eacute", 233);
        define("ecirc", 234);
        define("euml", 235);
        define("igrave", 236);
        define("iacute", 237);
        define("icirc", 238);
        define("iuml", 239);
        define("eth", 240);
        define("ntilde", 241);
        define("ograve", 242);
        define("oacute", 243);
        define("ocirc", 244);
        define("otilde", 245);
        define("ouml", 246);
        define("divide", 247);
        define("oslash", 248);
        define("ugrave", 249);
        define("uacute", 250);
        define("ucirc", 251);
        define("uuml", 252);
        define("yacute", 253);
        define("thorn", 254);
        define("yuml", 255);

        // Mathematical, Greek and Symbolic characters for HTML.
        // HTMLsymbol "-//W3C//ENTITIES Symbols//EN//HTML"

        define("fnof", 402);
        define("Alpha", 913);
        define("Beta", 914);
        define("Gamma", 915);
        define("Delta", 916);
        define("Epsilon", 917);
        define("Zeta", 918);
        define("Eta", 919);
        define("Theta", 920);
        define("Iota", 921);
        define("Kappa", 922);
        define("Lambda", 923);
        define("Mu", 924);
        define("Nu", 925);
        define("Xi", 926);
        define("Omicron", 927);
        define("Pi", 928);
        define("Rho", 929);
        define("Sigma", 931);
        define("Tau", 932);
        define("Upsilon", 933);
        define("Phi", 934);
        define("Chi", 935);
        define("Psi", 936);
        define("Omega", 937);
        define("alpha", 945);
        define("beta", 946);
        define("gamma", 947);
        define("delta", 948);
        define("epsilon", 949);
        define("zeta", 950);
        define("eta", 951);
        define("theta", 952);
        define("iota", 953);
        define("kappa", 954);
        define("lambda", 955);
        define("mu", 956);
        define("nu", 957);
        define("xi", 958);
        define("omicron", 959);
        define("pi", 960);
        define("rho", 961);
        define("sigmaf", 962);
        define("sigma", 963);
        define("tau", 964);
        define("upsilon", 965);
        define("phi", 966);
        define("chi", 967);
        define("psi", 968);
        define("omega", 969);
        define("thetasym", 977);
        define("upsih", 978);
        define("piv", 982);
        define("bull", 8226);
        define("hellip", 8230);
        define("prime", 8242);
        define("Prime", 8243);
        define("oline", 8254);
        define("frasl", 8260);
        define("weierp", 8472);
        define("image", 8465);
        define("real", 8476);
        define("trade", 8482);
        define("alefsym", 8501);
        define("larr", 8592);
        define("uarr", 8593);
        define("rarr", 8594);
        define("darr", 8595);
        define("harr", 8596);
        define("crarr", 8629);
        define("lArr", 8656);
        define("uArr", 8657);
        define("rArr", 8658);
        define("dArr", 8659);
        define("hArr", 8660);
        define("forall", 8704);
        define("part", 8706);
        define("exist", 8707);
        define("empty", 8709);
        define("nabla", 8711);
        define("isin", 8712);
        define("notin", 8713);
        define("ni", 8715);
        define("prod", 8719);
        define("sum", 8721);
        define("minus", 8722);
        define("lowast", 8727);
        define("radic", 8730);
        define("prop", 8733);
        define("infin", 8734);
        define("ang", 8736);
        define("and", 8743);
        define("or", 8744);
        define("cap", 8745);
        define("cup", 8746);
        define("int", 8747);
        define("there4", 8756);
        define("sim", 8764);
        define("cong", 8773);
        define("asymp", 8776);
        define("ne", 8800);
        define("equiv", 8801);
        define("le", 8804);
        define("ge", 8805);
        define("sub", 8834);
        define("sup", 8835);
        define("nsub", 8836);
        define("sube", 8838);
        define("supe", 8839);
        define("oplus", 8853);
        define("otimes", 8855);
        define("perp", 8869);
        define("sdot", 8901);
        define("lceil", 8968);
        define("rceil", 8969);
        define("lfloor", 8970);
        define("rfloor", 8971);
        define("lang", 9001);
        define("rang", 9002);
        define("loz", 9674);
        define("spades", 9824);
        define("clubs", 9827);
        define("hearts", 9829);
        define("diams", 9830);

        // Special characters for HTML.
        // HTMLspecial "-//W3C//ENTITIES Special//EN//HTML"

        define("quot", 34);
        define("amp", 38);
        define("lt", 60);
        define("gt", 62);
        define("OElig", 338);
        define("oelig", 339);
        define("Scaron", 352);
        define("scaron", 353);
        define("Yuml", 376);
        define("circ", 710);
        define("tilde", 732);
        define("ensp", 8194);
        define("emsp", 8195);
        define("thinsp", 8201);
        define("zwnj", 8204);
        define("zwj", 8205);
        define("lrm", 8206);
        define("rlm", 8207);
        define("ndash", 8211);
        define("mdash", 8212);
        define("lsquo", 8216);
        define("rsquo", 8217);
        define("sbquo", 8218);
        define("ldquo", 8220);
        define("rdquo", 8221);
        define("bdquo", 8222);
        define("dagger", 8224);
        define("Dagger", 8225);
        define("permil", 8240);
        define("lsaquo", 8249);
        define("rsaquo", 8250);
        define("euro", 8364);
    }

    //
    // It shouldn't be here but well, just reusing the CODED_ENTITIES Map :)
    //

    /**
     * Matches one entity-shaped token. Digits are included so that names such as
     * {@code frac12}, {@code sup2} and {@code there4} are found.
     */
    private static final Pattern ENTITIES_PATTERN = Pattern.compile("&[A-Za-z0-9#]+;");

    /**
     * Replaces every known HTML named entity in {@code s} with its numeric character
     * reference. Unknown entities and everything else are copied unchanged.
     *
     * @param s the text to process
     * @return the processed text; {@code s} itself when it contains no {@code '&'}
     */
    public String processHtmlEntities(final String s) {
        if (s.indexOf('&') == -1) {
            return s;
        }
        final StringBuilder sb = new StringBuilder(s.length());
        final Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        while (m.find()) {
            sb.append(s, pos, m.start());
            final String literalEntity = m.group();
            final String codedEntity = CODED_ENTITIES.get(literalEntity);
            sb.append(codedEntity != null ? codedEntity : literalEntity);
            pos = m.end();
        }
        sb.append(s, pos, s.length());
        return sb.toString();
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy