// org.jwat.common.UriProfile Maven / Gradle / Ivy
/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.common;

import java.net.URISyntaxException;

/**
 * Implementation of a URI profile. A profile customizes which characters
 * and features are acceptable when a URI is handled with that profile.
 *
 * An array of 256 integers defines, as a bitfield of the {@code B_*}
 * constants, which categories each 8-bit character value belongs to.
 *
 * @author nicl
 */
public class UriProfile {

    /** Bit to categorize a char as an alpha. */
    public static final int B_ALPHAS = 1 << 0;

    /** Bit to categorize a char as a digit. */
    public static final int B_DIGITS = 1 << 1;

    /** Bit to categorize a char as first in scheme. */
    public static final int B_SCHEME_FIRST = 1 << 2;

    /** Bit to categorize a char as following in scheme. */
    public static final int B_SCHEME_FOLLOW = 1 << 3;

    /** Bit to categorize a char as UNRESERVED in RFC3986. */
    public static final int B_UNRESERVED = 1 << 4;

    /** Bit to categorize a char as GEN-DELIMS in RFC3986. */
    public static final int B_GEN_DELIMS = 1 << 5;

    /** Bit to categorize a char as SUB-DELIMS in RFC3986. */
    public static final int B_SUB_DELIMS = 1 << 6;

    /** Bit to categorize a char as RESERVED in RFC3986. */
    public static final int B_RESERVED = 1 << 7;

    /** Bit to categorize a char as PCHAR in RFC3986. */
    public static final int B_PCHAR = 1 << 8;

    /** Bit to categorize a char as USERINFO in RFC3986. */
    public static final int B_USERINFO = 1 << 9;

    /** Bit to categorize a char as REGNAME in RFC3986. */
    public static final int B_REGNAME = 1 << 10;

    /** Bit to categorize a char as SEGMENT in RFC3986. */
    public static final int B_SEGMENT = 1 << 11;

    /** Bit to categorize a char as SEGMENT-NZ in RFC3986. */
    public static final int B_SEGMENT_NZ = 1 << 12;

    /** Bit to categorize a char as SEGMENT-NZ-NC in RFC3986. */
    public static final int B_SEGMENT_NZ_NC = 1 << 13;

    /** Bit to categorize a char as PATH in RFC3986. */
    public static final int B_PATH = 1 << 14;

    /** Bit to categorize a char as QUERY in RFC3986. */
    public static final int B_QUERY = 1 << 15;

    /** Bit to categorize a char as FRAGMENT in RFC3986. */
    public static final int B_FRAGMENT = 1 << 16;

    /** Category bitfield for each 8-bit character value, indexed by character code. */
    protected final int[] charTypeMap = new int[256];

    /** Whether the profile allows relative URIs. */
    public boolean bAllowRelativeUris;

    /** Whether the profile allows 16-bit percent encoding. */
    public boolean bAllow16bitPercentEncoding;

    /** Whether the profile tolerates invalid percent encoding instead of rejecting it. */
    public boolean bAllowinvalidPercentEncoding;

    // NOTE(review): the body of the constructor below is cut off in this copy
    // of the file; the text following "i" on its last line is not valid Java.
    // Restore the constructor from the upstream source rather than guessing.
    /**
     * Construct a {@code UriProfile} initialized from another profile.
     * @param uriProfile URI profile to base a new profile on
     */
    public UriProfile(UriProfile uriProfile) { for (int i=0; iUriProfile
initialized with RFC3986 * rules. */ public UriProfile() { for (int i=0; iUriProfile 255. if (pos == 0 && ((charTypeMap[c] & UriProfile.B_SCHEME_FIRST) == 0)) { throw new URISyntaxException(str, "Invalid URI scheme component"); } else if ((charTypeMap[c] & UriProfile.B_SCHEME_FOLLOW) == 0) { throw new URISyntaxException(str, "Invalid URI scheme component"); } ++pos; } } /** * Validates an URI component according to the supplied character category * bitfield. * @param bw_and bits identifying one or more character categories * @param componentName URI component name * @param str URI component string * @return decoded and validated string * @throws URISyntaxException if an error occurs parsing/validating component */ public String validate_decode(int bw_and, String componentName, String str) throws URISyntaxException { StringBuilder sb = new StringBuilder(); int pos = 0; int ppos; int limit = str.length(); char c; int decode = 0; int tmpC; char decodedC; boolean bValid; while (pos < limit) { c = str.charAt(pos++); if (c < 256) { if ((charTypeMap[c] & bw_and) == 0) { if (c == '%') { ppos = pos - 1; if (pos < limit) { c = str.charAt(pos); if (c == 'u' || c == 'U') { if (!bAllow16bitPercentEncoding) { if (!bAllowinvalidPercentEncoding) { throw new URISyntaxException(str, "Invalid URI " + componentName + " component - 16-bit percent encoding not allowed"); } else { bValid = false; } } else { ++pos; decode = 4; bValid = true; } } else { decode = 2; bValid = true; } decodedC = 0; while (bValid && decode > 0) { if (pos < limit) { c = str.charAt(pos++); decodedC <<= 4; if (c < 256) { tmpC = asciiHexTab[c]; if (tmpC != -1) { decodedC |= tmpC; --decode; } else { bValid = false; } } else { bValid = false; } } else { bValid = false; } } if (!bValid && !bAllowinvalidPercentEncoding) { throw new URISyntaxException(str, "Invalid URI " + componentName + " component - invalid percent encoding"); } sb.append((char) decodedC); } else { if (!bAllowinvalidPercentEncoding) { throw new 
URISyntaxException(str, "Invalid URI " + componentName + " component - incomplete percent encoding"); } else { bValid = false; } } if (!bValid) { while (ppos < pos) { sb.append(str.charAt(ppos++)); } } } else { throw new URISyntaxException(str, "Invalid URI " + componentName + " component - invalid character '" + (Character.isISOControl(c)?String.format("0x%02x", (int)c):c) + "'"); } } else { sb.append(c); } } else { throw new URISyntaxException(str, "Invalid URI " + componentName + " component - invalid character '" + (Character.isISOControl(c)?String.format("0x%02x", (int)c):c) + "'"); } } return sb.toString(); } /** Hex char to integer conversion table. */ public static int[] asciiHexTab = new int[256]; /** Integer to hex char conversion table. */ public static char[] hexTab = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; /* * Initialize ASCII hex table. */ static { String hex = "0123456789abcdef"; for (int i=0; i \^`{|}¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþ RFC3986_ABS_16BIT_LAX = new UriProfile(); RFC3986_ABS_16BIT_LAX.bAllowRelativeUris = false; RFC3986_ABS_16BIT_LAX.bAllow16bitPercentEncoding = true; RFC3986_ABS_16BIT_LAX.bAllowinvalidPercentEncoding = true; RFC3986_ABS_16BIT_LAX.charTypeAddAndOr(sb.toString(), 0, B_PATH | B_QUERY | B_FRAGMENT); RFC3986_ABS_16BIT_LAX.charTypeAddAndOr("#", 0, B_FRAGMENT); } }
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy