All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jwat.common.UriProfile Maven / Gradle / Ivy

/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.common;

import java.net.URISyntaxException;

/**
 * Implementation of an URI profile. A profile can be used to customize which
 * characters and features are acceptable when a certain profile is used.
 *
 * An array of integers is used to define which categories each of the
 * first 256 (8-bit) characters belongs to.
 *
 * @author nicl
 */
public class UriProfile {

    // Character category flags. Each constant is a distinct bit; an entry in
    // charTypeMap is tested against one or more of them with a bitwise AND
    // (see validate_decode and the scheme validation code).

    /** Category bit: character is an alpha. */
    public static final int B_ALPHAS = 1 << 0;
    /** Category bit: character is a digit. */
    public static final int B_DIGITS = 1 << 1;
    /** Category bit: character is allowed as the first character of a scheme. */
    public static final int B_SCHEME_FIRST = 1 << 2;
    /** Category bit: character is allowed after the first character of a scheme. */
    public static final int B_SCHEME_FOLLOW = 1 << 3;
    /** Category bit: character is UNRESERVED in RFC3986. */
    public static final int B_UNRESERVED = 1 << 4;
    /** Category bit: character is GEN-DELIMS in RFC3986. */
    public static final int B_GEN_DELIMS = 1 << 5;
    /** Category bit: character is SUB-DELIMS in RFC3986. */
    public static final int B_SUB_DELIMS = 1 << 6;
    /** Category bit: character is RESERVED in RFC3986. */
    public static final int B_RESERVED = 1 << 7;
    /** Category bit: character is PCHAR in RFC3986. */
    public static final int B_PCHAR = 1 << 8;
    /** Category bit: character is USERINFO in RFC3986. */
    public static final int B_USERINFO = 1 << 9;
    /** Category bit: character is REGNAME in RFC3986. */
    public static final int B_REGNAME = 1 << 10;
    /** Category bit: character is SEGMENT in RFC3986. */
    public static final int B_SEGMENT = 1 << 11;
    /** Category bit: character is SEGMENT-NZ in RFC3986. */
    public static final int B_SEGMENT_NZ = 1 << 12;
    /** Category bit: character is SEGMENT-NZ-NC in RFC3986. */
    public static final int B_SEGMENT_NZ_NC = 1 << 13;
    /** Category bit: character is PATH in RFC3986. */
    public static final int B_PATH = 1 << 14;
    /** Category bit: character is QUERY in RFC3986. */
    public static final int B_QUERY = 1 << 15;
    /** Category bit: character is FRAGMENT in RFC3986. */
    public static final int B_FRAGMENT = 1 << 16;

    /**
     * Category bitfield for each 8-bit character, indexed by the character's
     * numeric value (0-255). Characters with a value of 256 or above have no
     * entry and must be range-checked before indexing.
     */
    protected final int[] charTypeMap = new int[256];

    /** Does profile allow relative URIs. */
    public boolean bAllowRelativeUris;

    /**
     * Does profile allow 16-bit percent encoding, i.e. a percent sign followed
     * by 'u' or 'U' and four hex digits, as handled by validate_decode.
     */
    public boolean bAllow16bitPercentEncoding;

    /**
     * Does profile allow invalid percent encoding. When set, validate_decode
     * keeps the offending characters in its output instead of throwing an
     * URISyntaxException.
     */
    public boolean bAllowinvalidPercentEncoding;

    /**
     * Construct an UriProfile initialized with RFC3986
     * rules.
     */
    public UriProfile() {
        for (int i=0; iUriProfile initialized from another profile.
     * @param uriProfile URI profile to base a new profile on
     */
    public UriProfile(UriProfile uriProfile) {
        for (int i=0; i 255.
            if (pos == 0 && ((charTypeMap[c] & UriProfile.B_SCHEME_FIRST) == 0)) {
                throw new URISyntaxException(str, "Invalid URI scheme component");
            } else if ((charTypeMap[c] & UriProfile.B_SCHEME_FOLLOW) == 0) {
                throw new URISyntaxException(str, "Invalid URI scheme component");
            }
            ++pos;
        }
    }

    /**
     * Validates an URI component against the supplied character category
     * bitfield and decodes the percent encoded sequences it contains.
     * <p>
     * A character below 256 whose <code>charTypeMap</code> entry shares at
     * least one bit with <code>bw_and</code> is copied to the result as is.
     * A '%' that is not accepted that way starts a percent encoding:
     * <ul>
     * <li>'%' followed by two hex digits decodes to a single character,</li>
     * <li>'%' followed by 'u' or 'U' and four hex digits decodes to a single
     * character, but only if <code>bAllow16bitPercentEncoding</code> is set.</li>
     * </ul>
     * A malformed, incomplete or disallowed percent encoding raises an
     * exception unless <code>bAllowinvalidPercentEncoding</code> is set. In
     * that case the characters consumed while attempting to decode (the '%'
     * up to and including the offending character, if any) are copied to the
     * result verbatim and nothing decoded is emitted for that sequence.
     * Any other character, including every character of 256 and above, is
     * rejected.
     *
     * @param bw_and bits identifying one or more character categories
     * @param componentName URI component name, only used in error messages
     * @param str URI component string, must not be null
     * @return decoded and validated string
     * @throws URISyntaxException if an error occurs parsing/validating component
     */
    public String validate_decode(int bw_and, String componentName, String str) throws URISyntaxException {
        StringBuilder sb = new StringBuilder();
        int limit = str.length();
        int pos = 0;
        char c;
        while (pos < limit) {
            c = str.charAt(pos++);
            // Only 8-bit characters have an entry in the category map.
            if (c < 256 && (charTypeMap[c] & bw_and) != 0) {
                sb.append(c);
                continue;
            }
            if (c != '%') {
                throw new URISyntaxException(str, "Invalid URI " + componentName + " component - invalid character '" + (Character.isISOControl(c)?String.format("0x%02x", (int)c):c) + "'");
            }
            // Index of the '%' so the raw sequence can be preserved if it
            // turns out to be invalid.
            int ppos = pos - 1;
            boolean bValid;
            if (pos >= limit) {
                // '%' is the last character of the component.
                if (!bAllowinvalidPercentEncoding) {
                    throw new URISyntaxException(str, "Invalid URI " + componentName + " component - incomplete percent encoding");
                }
                bValid = false;
            } else {
                // Number of hex digits still expected.
                int decode = 0;
                c = str.charAt(pos);
                if (c == 'u' || c == 'U') {
                    if (bAllow16bitPercentEncoding) {
                        // Skip the 'u'/'U' marker, four hex digits follow.
                        ++pos;
                        decode = 4;
                        bValid = true;
                    } else if (!bAllowinvalidPercentEncoding) {
                        throw new URISyntaxException(str, "Invalid URI " + componentName + " component - 16-bit percent encoding not allowed");
                    } else {
                        // The marker is not consumed, it is handled as an
                        // ordinary character by the next iteration.
                        bValid = false;
                    }
                } else {
                    decode = 2;
                    bValid = true;
                }
                char decodedC = 0;
                while (bValid && decode > 0) {
                    if (pos < limit) {
                        c = str.charAt(pos++);
                        int tmpC = (c < 256) ? asciiHexTab[c] : -1;
                        if (tmpC != -1) {
                            decodedC = (char) ((decodedC << 4) | tmpC);
                            --decode;
                        } else {
                            bValid = false;
                        }
                    } else {
                        bValid = false;
                    }
                }
                if (bValid) {
                    sb.append(decodedC);
                } else if (!bAllowinvalidPercentEncoding) {
                    throw new URISyntaxException(str, "Invalid URI " + componentName + " component - invalid percent encoding");
                }
                // An invalid sequence must not contribute its partially
                // decoded value, otherwise the result would contain both a
                // bogus character and the raw text copied below.
            }
            if (!bValid) {
                // Lenient mode, keep everything consumed so far verbatim.
                sb.append(str, ppos, pos);
            }
        }
        return sb.toString();
    }

    /**
     * Hex char to integer conversion table, indexed by the character's
     * numeric value (0-255). validate_decode treats an entry of -1 as
     * "not a hex digit".
     */
    public static int[] asciiHexTab = new int[256];

    /**
     * Integer to hex char conversion table, indexed by a value from 0 to 15.
     * The letters are upper case.
     */
    public static char[] hexTab = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };

    /*
     * Initialize ASCII hex table.
     */
    static {
        String hex = "0123456789abcdef";
        for (int i=0; i\^`{|}¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþ
        RFC3986_ABS_16BIT_LAX = new UriProfile();
        RFC3986_ABS_16BIT_LAX.bAllowRelativeUris = false;
        RFC3986_ABS_16BIT_LAX.bAllow16bitPercentEncoding = true;
        RFC3986_ABS_16BIT_LAX.bAllowinvalidPercentEncoding = true;
        RFC3986_ABS_16BIT_LAX.charTypeAddAndOr(sb.toString(), 0, B_PATH | B_QUERY | B_FRAGMENT);
        RFC3986_ABS_16BIT_LAX.charTypeAddAndOr("#", 0, B_FRAGMENT);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy