All Downloads are FREE. Search and download functionality uses the official Maven repository.

nu.validator.io.DataUri Maven / Gradle / Ivy

Go to download

An HTML-checking library (used by https://html5.validator.nu and the HTML5 facet of the W3C Validator)

There is a newer version: 20.7.2
Show newest version
/*
 * Copyright (c) 2007-2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a 
 * copy of this software and associated documentation files (the "Software"), 
 * to deal in the Software without restriction, including without limitation 
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
 * and/or sell copies of the Software, and to permit persons to whom the 
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in 
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.io;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.MalformedURLException;

import io.mola.galimatias.URL;
import io.mola.galimatias.GalimatiasParseException;

/**
 * Parses the media-type header of a {@code data:} URI and exposes the
 * declared content type together with a stream over the remaining payload.
 *
 * <p>The header (everything up to the first comma of the scheme data) is
 * consumed by a small state machine. When the header ends with
 * {@code ;base64,} the payload stream is wrapped in a
 * {@code Base64InputStream}; otherwise the underlying stream is handed out
 * as is.
 */
public class DataUri {

    /** Header suffix that marks the payload as base64-encoded. */
    private static final String BASE64_MARKER = ";base64,";

    /**
     * Tells whether a string begins with {@code data:}, comparing the four
     * letters ASCII case-insensitively.
     *
     * @param uri the string to test; may be {@code null}
     * @return {@code true} if {@code uri} is non-null, at least five
     *         characters long and starts with {@code data:} in any letter
     *         case; {@code false} otherwise
     */
    public static boolean startsWithData(String uri) {
        return uri != null && uri.length() >= 5
                && (uri.charAt(0) == 'd' || uri.charAt(0) == 'D')
                && (uri.charAt(1) == 'a' || uri.charAt(1) == 'A')
                && (uri.charAt(2) == 't' || uri.charAt(2) == 'T')
                && (uri.charAt(3) == 'a' || uri.charAt(3) == 'A')
                && (uri.charAt(4) == ':');
    }

    /** States of the header parser; each names what was consumed last. */
    private enum State {
        /** Nothing has been read yet. */
        AT_START,
        /** Inside the type token that precedes the slash. */
        IN_SUPERTYPE,
        /** The slash was read; the subtype must start now. */
        AT_SUBTYPE_START,
        /** Inside the subtype token. */
        IN_SUBTYPE,
        /** A semicolon was read; a parameter name is expected. */
        SEMICOLON_SEEN,
        /** Whitespace after a complete value; only a semicolon may follow. */
        WS_BEFORE_SEMICOLON,
        /** Inside a parameter name. */
        IN_PARAM_NAME,
        /** The equals sign after a parameter name was read. */
        EQUALS_SEEN,
        /** Inside a double-quoted parameter value. */
        IN_QUOTED_STRING,
        /** Inside an unquoted (token) parameter value. */
        IN_UNQUOTED_STRING,
        /** A backslash was read inside a quoted value. */
        IN_QUOTED_PAIR,
        /** The closing double quote of a quoted value was read. */
        CLOSE_QUOTE_SEEN
    }

    /** Content type taken from the header, or the default chosen for it. */
    private String contentType;

    /** Stream positioned just after the comma that ends the header. */
    private InputStream inputStream;

    /**
     * Validates the URL, parses its media-type header and stores the
     * resulting content type and payload stream in this object.
     *
     * <p>If parsing fails for any reason, the stream opened over the scheme
     * data is closed before the exception propagates.
     *
     * <p>NOTE: this method is invoked from the constructors, so an override
     * in a subclass runs before that subclass's own fields are initialized.
     *
     * @param url the parsed URL; its scheme must be {@code data}
     * @throws IllegalArgumentException if the scheme is not {@code data}
     * @throws MalformedURLException if the URL has a fragment, if the scheme
     *         data ends before the header's comma, or if the header contains
     *         a value of 0x80 or above
     * @throws IOException if the header is syntactically invalid (reported
     *         as a {@code DataUriException}) or reading the stream fails
     */
    protected void init(URL url) throws IOException, MalformedURLException {
        if (!url.scheme().equals("data")) {
            throw new IllegalArgumentException("The input did not start with data:.");
        }

        if (url.fragment() != null) {
            throw new MalformedURLException(
                    "Fragment is not allowed for data: URIs according to RFC 2397.");
        }

        InputStream is = new PercentDecodingReaderInputStream(new StringReader(url.schemeData()));
        try {
            parseHeader(is);
        } catch (IOException | RuntimeException e) {
            // Nobody else holds a reference to the stream when parsing
            // fails, so release it here on every error path.
            try {
                is.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Runs the header state machine over the stream, reading one value at a
     * time until the comma that terminates the header, then records the
     * content type and the payload stream.
     *
     * @param is the stream over the scheme data; on success it is left
     *        positioned right after the header's comma
     * @throws IOException if the header is malformed or reading fails
     */
    private void parseHeader(InputStream is) throws IOException {
        // Accumulates the header exactly as read so the content type can be
        // cut out of it once the terminating comma is found.
        StringBuilder sb = new StringBuilder();
        State state = State.AT_START;
        // i counts the values read so far; it is passed to error reports.
        for (int i = 0;; i++) {
            int b = is.read();
            if (b == -1) {
                throw new MalformedURLException("Premature end of URI.");
            }
            if (b >= 0x80) {
                throw new MalformedURLException(
                        "Non-ASCII character in MIME type part of the data URI.");
            }
            char c = (char) b;
            sb.append(c);
            switch (state) {
                case AT_START:
                    if (isTokenChar(c)) {
                        state = State.IN_SUPERTYPE;
                        continue;
                    } else if (c == ';') {
                        // Type omitted but parameters follow: substitute the
                        // default type in front of the parameters.
                        sb.setLength(0);
                        sb.append("text/plain;");
                        state = State.SEMICOLON_SEEN;
                        continue;
                    } else if (c == ',') {
                        // Empty header: use the full default content type.
                        contentType = "text/plain;charset=US-ASCII";
                        inputStream = is;
                        return;
                    } else {
                        throw newDatatypeException(i,
                                "Expected a token character or a semicolon but saw ",
                                        c, " instead.");
                    }
                case IN_SUPERTYPE:
                    if (isTokenChar(c)) {
                        continue;
                    } else if (c == '/') {
                        state = State.AT_SUBTYPE_START;
                        continue;
                    } else {
                        throw newDatatypeException(i,
                                "Expected a token character or \u201C/\u201D but saw ",
                                        c, " instead.");
                    }
                case AT_SUBTYPE_START:
                    if (isTokenChar(c)) {
                        state = State.IN_SUBTYPE;
                        continue;
                    } else {
                        throw newDatatypeException(i,
                                "Expected a token character but saw ",
                                        c, " instead.");
                    }
                case IN_SUBTYPE:
                    if (isTokenChar(c)) {
                        continue;
                    } else if (c == ';') {
                        state = State.SEMICOLON_SEEN;
                        continue;
                    } else if (isWhitespace(c)) {
                        state = State.WS_BEFORE_SEMICOLON;
                        continue;
                    } else if (c == ',') {
                        // Drop the trailing comma from the content type.
                        contentType = sb.substring(0, sb.length() - 1);
                        inputStream = is;
                        return;
                    } else {
                        throw newDatatypeException(i,
                                "Expected a token character, whitespace, a semicolon or a comma but saw ",
                                        c, " instead.");
                    }
                case WS_BEFORE_SEMICOLON:
                    if (isWhitespace(c)) {
                        continue;
                    } else if (c == ';') {
                        state = State.SEMICOLON_SEEN;
                        continue;
                    } else {
                        throw newDatatypeException(i,
                                "Expected whitespace or a semicolon but saw ",
                                        c, " instead.");
                    }
                case SEMICOLON_SEEN:
                    if (isWhitespace(c)) {
                        continue;
                    } else if (isTokenChar(c)) {
                        state = State.IN_PARAM_NAME;
                        continue;
                    } else {
                        throw newDatatypeException(i,
                                "Expected whitespace or a token character but saw ",
                                        c, " instead.");
                    }
                case IN_PARAM_NAME:
                    if (isTokenChar(c)) {
                        continue;
                    } else if (c == '=') {
                        state = State.EQUALS_SEEN;
                        continue;
                    } else if (c == ',') {
                        // A comma may end a parameter name only when the
                        // header ends with ";base64,".
                        int baseFirst = sb.length() - BASE64_MARKER.length();
                        if (baseFirst >= 0
                                && BASE64_MARKER.equals(sb.substring(baseFirst, sb.length()))) {
                            contentType = sb.substring(0, baseFirst);
                            inputStream = new Base64InputStream(is);
                            return;
                        }
                        // Report the error here rather than falling through
                        // into the next case, which would describe the
                        // wrong state.
                        throw newDatatypeException(i,
                                "Expected an equals sign or a token character (a comma is only allowed here right after \u201C;base64\u201D) but saw ",
                                        c, " instead.");
                    } else {
                        throw newDatatypeException(i,
                                "Expected an equals sign, a comma or a token character but saw ",
                                        c, " instead.");
                    }
                case EQUALS_SEEN:
                    if (c == '\"') {
                        state = State.IN_QUOTED_STRING;
                        continue;
                    } else if (isTokenChar(c)) {
                        state = State.IN_UNQUOTED_STRING;
                        continue;
                    } else {
                        throw newDatatypeException(i,
                                "Expected a double quote or a token character but saw ",
                                        c, " instead.");
                    }
                case IN_QUOTED_STRING:
                    if (c == '\\') {
                        state = State.IN_QUOTED_PAIR;
                        continue;
                    } else if (c == '\"') {
                        state = State.CLOSE_QUOTE_SEEN;
                        continue;
                    } else if (isQDTextChar(c)) {
                        continue;
                    } else {
                        throw newDatatypeException(i,
                                "Expected a non-control ASCII character but saw ",
                                        c, " instead.");
                    }
                case IN_QUOTED_PAIR:
                    if (c <= 127) {
                        state = State.IN_QUOTED_STRING;
                        continue;
                    } else {
                        throw newDatatypeException(i,
                                "Expected an ASCII character but saw ",
                                        c, " instead.");
                    }
                case CLOSE_QUOTE_SEEN:
                    if (c == ';') {
                        state = State.SEMICOLON_SEEN;
                        continue;
                    } else if (isWhitespace(c)) {
                        state = State.WS_BEFORE_SEMICOLON;
                        continue;
                    } else if (c == ',') {
                        contentType = sb.substring(0, sb.length() - 1);
                        inputStream = is;
                        return;
                    } else {
                        throw newDatatypeException(i,
                                "Expected a semicolon, whitespace or a comma but saw ",
                                        c, " instead.");
                    }
                case IN_UNQUOTED_STRING:
                    if (isTokenChar(c)) {
                        continue;
                    } else if (c == ';') {
                        state = State.SEMICOLON_SEEN;
                        continue;
                    } else if (isWhitespace(c)) {
                        state = State.WS_BEFORE_SEMICOLON;
                        continue;
                    } else if (c == ',') {
                        contentType = sb.substring(0, sb.length() - 1);
                        inputStream = is;
                        return;
                    } else {
                        throw newDatatypeException(i,
                                "Expected a token character, whitespace, a semicolon, or a comma but saw ",
                                        c, " instead.");
                    }
            }
        }
    }

    /**
     * Parses the given string as a URL and initializes this object from it.
     *
     * @param url the {@code data:} URI in string form
     * @throws MalformedURLException if the string cannot be parsed as a URL
     *         (the parser's exception is attached as the cause) or the
     *         header is rejected by {@link #init(URL)}
     * @throws IOException if {@link #init(URL)} fails
     */
    public DataUri(String url) throws IOException {
        try {
            init(URL.parse(url));
        } catch (GalimatiasParseException e) {
            // MalformedURLException has no cause-taking constructor, so the
            // original failure is attached through initCause.
            MalformedURLException malformed = new MalformedURLException(e.getMessage());
            malformed.initCause(e);
            throw malformed;
        }
    }

    /**
     * Initializes this object from an already parsed URL.
     *
     * @param url the parsed {@code data:} URL
     * @throws MalformedURLException if the URL is rejected by
     *         {@link #init(URL)}
     * @throws IOException if {@link #init(URL)} fails
     */
    public DataUri(URL url) throws IOException, MalformedURLException {
        init(url);
    }

    /**
     * Builds the exception used for header syntax errors.
     *
     * @param i the count of values read before the offending one
     * @param head message text placed before the offending character
     * @param c the offending character
     * @param tail message text placed after the offending character
     * @return a new {@code DataUriException} carrying these details
     */
    private static IOException newDatatypeException(int i, String head, char c, String tail) {
        return new DataUriException(i, head, c, tail);
    }

    /**
     * Tells whether a character is allowed unescaped inside a quoted
     * parameter value: U+0020 through U+007E, or LF, CR or TAB.
     *
     * @param c the character to test
     * @return {@code true} if the character is allowed, {@code false}
     *         otherwise
     */
    private static boolean isQDTextChar(char c) {
        return (c >= ' ' && c <= 126) || (c == '\n') || (c == '\r')
                || (c == '\t');
    }

    /**
     * Tells whether a character may appear in a token: U+0021 through
     * U+007E, excluding the separator characters listed in the expression
     * below.
     *
     * @param c the character to test
     * @return {@code true} if the character is a token character,
     *         {@code false} otherwise
     */
    private static boolean isTokenChar(char c) {
        return (c >= 33 && c <= 126)
                && !(c == '(' || c == ')' || c == '<' || c == '>' || c == '@'
                        || c == ',' || c == ';' || c == ':' || c == '\\'
                        || c == '\"' || c == '/' || c == '[' || c == ']'
                        || c == '?' || c == '=' || c == '{' || c == '}');
    }

    /**
     * Checks if a UTF-16 code unit represents a whitespace character (U+0020,
     * U+0009, U+000D or U+000A).
     *
     * @param c the code unit
     * @return true if whitespace, false otherwise
     */
    private static boolean isWhitespace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }

    /**
     * Returns the content type taken from the URI header. When the header
     * is empty this is {@code text/plain;charset=US-ASCII}; when the header
     * starts with a semicolon, {@code text/plain} is put in front of the
     * parameters.
     *
     * @return the contentType
     */
    public String getContentType() {
        return contentType;
    }

    /**
     * Returns the stream over the payload that follows the header's comma,
     * wrapped in a {@code Base64InputStream} when the header ended with
     * {@code ;base64,}.
     *
     * @return the inputStream
     */
    public InputStream getInputStream() {
        return inputStream;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy