// org.apache.jena.iri.impl.PatternCompiler Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// TODO e-mail uri list about . at end of domain name
// TODO e-mail uri list about IPv4 vs host:
// If host matches the rule for IPv4address, then it should be considered an IPv4 address literal and not a reg-name.
package org.apache.jena.iri.impl;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import org.apache.jena.iri.ViolationCodes ;
public class PatternCompiler implements ViolationCodes {
// static VarPattern notMatching[] = {
// new VarPattern("[[a]&&[b]]")
// };
/*
static VarPattern iri = new VarPattern(
"^(@{scheme}:)?(//@{authority})?(@{path})?(\\?@{query})?(#@{fragment})?");
static VarPattern scheme[] = {
new VarPattern(
"@{alphaPreferLowerCase}(@{alphaPreferLowerCase}|@{digit}|[-+.])*"),
new VarPattern("(@{alphaPreferLowerCase}|@{digit}|[\\-\\+\\.])+",
SCHEME_MUST_START_WITH_LETTER),
new VarPattern("a*", EMPTY_SCHEME),
new VarPattern("[^:/\\?#]+", ILLEGAL_CHARACTER), };
static VarPattern alphaPreferLowerCase[] = { new VarPattern("[a-z]"),
new VarPattern("[A-Za-z]", LOWERCASE_PREFERRED), };
static VarPattern digit[] = { new VarPattern("[0-9]") };
static VarPattern authority[] = {
new VarPattern("(@{userinfo}@)?@{host}(:@{port})"),
new VarPattern("[^/\\?#]*"), };
static VarPattern userinfo[] = {
new VarPattern("(@{unreserved}|@{pctEncoded}|@{subDelims}|:)*"),
new VarPattern("[^@]*", ILLEGAL_CHARACTER), };
static VarPattern unreserved[] = {
new VarPattern("@{unreservedNotDot}|\\."),
};
static VarPattern unreservedNotDot[] = {
// new VarPattern("[\\-a-zA-Z0-9_\\~]"),
new VarPattern("[\\-a-zA-Z0-9_\\~]|@{unwise}"),
new VarPattern("[\\-a-zA-Z0-9_\\~\\x7F-\\uFFFF\\x00-\\x08\\x0B\\x0C\\x0E\\x0F]|@{unwise}",
new int[] { NON_URI_CHARACTER }),
};
static VarPattern unreservedDNSLabel[] = {
new VarPattern("[\\a-z0-9_]"),
new VarPattern("[\\a-zA-Z0-9_]", LOWERCASE_PREFERRED),
new VarPattern(
"[\\a-zA-Z0-9_\\xA0-\\uFFFF]",
NON_URI_CHARACTER)
};
static VarPattern pctEncoded[] = {
new VarPattern("a"),
new VarPattern("%20",PERCENT_20),
new VarPattern("%@{upperHexDig}{2}",PERCENT),
new VarPattern("%", ILLEGAL_PERCENT_ENCODING),
};
static VarPattern subDelims[] = { new VarPattern("[!$&'()*+,;=]") };
static VarPattern port[] = { new VarPattern("@{nonZeroDigit}@{digit}*"),
new VarPattern("@{digit}+", PORT_SHOULD_NOT_START_IN_ZERO),
new VarPattern("0*", PORT_SHOULD_NOT_BE_EMPTY),
new VarPattern(".*", ILLEGAL_CHARACTER), };
static VarPattern nonZeroDigit[] = { new VarPattern("[1-9]") };
static VarPattern unwise[] = {
new VarPattern("a"),
new VarPattern("[\\>\\<\\\"{}\\|\\^`]", UNWISE_CHARACTER),
new VarPattern("[\\>\\<\\\"{}\\|\\^`\\x20]", WHITESPACE),
new VarPattern("[\\>\\<\\\"{}\\|\\^`\\x20\\t\\n\\r]",
NOT_XML_SCHEMA_WHITESPACE),
};
static VarPattern regname[] = {
new VarPattern("(@{label}\\.)*@{label}"),
new VarPattern("(@{unreserved}|@{pctEncodedHost}|@{subDelims})*",
NOT_DNS_NAME),
// new VarPattern("(@{unreserved}|@{pctEncodedHost}|@{subDelims}|@{unwise})*",
// NOT_DNS_NAME),
new VarPattern("[^:/@]*", ILLEGAL_CHARACTER), };
static VarPattern pctEncodedHost[] = {
// Should check pct encoding is UTF-8
// Also punycode preferred
new VarPattern("a"),
new VarPattern("@{pctEncoded}", USE_PUNYCODE_NOT_PERCENTS) };
static VarPattern label[] = {
// new VarPattern("@{acePrefix}@{labelChar}*",LABEL_HAS_ACE_PREFIX),
// new VarPattern("!(!@{labelAny}|@{labelDoubleDash})"),
new VarPattern("@{labelSingleDashInside}"),
new VarPattern("-?@{labelSingleDashInside}-?",
DNS_LABEL_DASH_START_OR_END),
new VarPattern("@{acePrefix}@{labelSingleDashInside}", ACE_PREFIX),
new VarPattern("@{acePrefix}@{labelSingleDashInside}-", new int[] {
ACE_PREFIX, DNS_LABEL_DASH_START_OR_END }),
new VarPattern("@{acePrefix}(@{labelChar}|-)*-", new int[] {
ACE_PREFIX, DOUBLE_DASH_IN_REG_NAME,
DNS_LABEL_DASH_START_OR_END }),
new VarPattern("(@{labelChar}|-)*-", new int[] {
DOUBLE_DASH_IN_REG_NAME, DNS_LABEL_DASH_START_OR_END }),
new VarPattern("-(@{labelChar}|-)*", new int[] {
DOUBLE_DASH_IN_REG_NAME, DNS_LABEL_DASH_START_OR_END }),
new VarPattern("@{labelAny}", DOUBLE_DASH_IN_REG_NAME),
};
static VarPattern labelSingleDashInside[] = { new VarPattern(
"(@{labelChar}+-)*@{labelChar}+"), };
// static VarPattern labelDoubleDash[] = {
// new VarPattern("@{labelChar}*--@{labelChar}*"),
// };
static VarPattern labelAny[] = { new VarPattern("(-|@{labelChar})+"), };
static VarPattern acePrefix[] = { new VarPattern("[a-z0-9]{2}--"),
new VarPattern("[a-zA-Z0-9]{2}--", LOWERCASE_PREFERRED), };
static VarPattern labelChar[] = {
// new VarPattern("--",DOUBLE_DASH_IN_REG_NAME),
new VarPattern("@{unreservedDNSLabel}|@{pctEncodedHost}") };
// static VarPattern
static VarPattern path[] = {
new VarPattern("@{pathAbempty}"),
// new VarPattern("@{pathAbsolute}"),
// new VarPattern("@{pathNoscheme}"),
new VarPattern("@{pathRootless}"),
// new VarPattern("@{pathEmpty}"),
new VarPattern("[^?#]*"),
};
static VarPattern pathAbempty[] = { new VarPattern("(\\/@{segment})*"), };
static VarPattern pathRootless[] = { new VarPattern(
"@{segmentNz}(\\/@{segment})*"), };
static VarPattern segment[] = {
new VarPattern("(a?|@{nonDotSegment})"),
new VarPattern("a?|\\.|\\.\\.|@{nonDotSegment}", NON_INITIAL_DOT_SEGMENT),
new VarPattern("[^/?#]*", ILLEGAL_CHARACTER),
};
static VarPattern nonDotSegment[] = {
// new VarPattern("@{pchar}*(@{pcharNotDot}|(\\.\\.\\.))@{pchar}*"),
new VarPattern(".{0,2}(@{pcharNotDot}|(\\.\\.\\.))@{pchar}*"),
};
static VarPattern segmentNz[] = {
new VarPattern("(\\.|(\\.\\.\\/)*(\\.\\.)|@{pchar}+)"),
new VarPattern("[^/?#]+", ILLEGAL_CHARACTER), };
static VarPattern pchar[] = { new VarPattern(
"@{unreserved}|@{pctEncoded}|@{subDelims}|[:@]"), };
static VarPattern pcharNotDot[] = {
new VarPattern(
"@{unreservedNotDot}|@{pctEncoded}|@{subDelims}|[:@]"), };
static VarPattern query[] = {
new VarPattern("(@{pchar}|[/\\?])*"),
new VarPattern("[^#]*", ILLEGAL_CHARACTER), };
static VarPattern fragment[] = {
new VarPattern("(@{pchar}|[/\\?])*"),
new VarPattern("[^]*", ILLEGAL_CHARACTER), };
*/
// ---- Host patterns: IP literals and IPv4 addresses ----
// Each field below is a table of alternative patterns for one named
// production. "@{name}" inside a pattern string refers to another table by
// its field name (presumably resolved through lookup(String) - confirm in
// VarPattern). A second constructor argument attaches one or more
// ViolationCodes constants to that alternative.
// NOTE(review): presumably the attached violation is raised when that
// alternative is the one that matches - confirm in VarPattern.
// NOTE(review): "[^]" is not valid java.util.regex syntax, so these strings
// are presumably written for a different regex dialect in which it means
// "any character" - confirm against the consumer of VarPattern.

// Bracketed IP literal: "[" ipVFuture "]" or "[" ipV6Address "]".
// The last alternative is a "[" followed by "[^]*" and carries
// IP_V6_OR_FUTURE_ADDRESS_SYNTAX.
static VarPattern ipLiteral[] = {
new VarPattern("\\[@{ipVFuture}\\]"),
new VarPattern("\\[@{ipV6Address}\\]"),
new VarPattern("\\[[^]*",IP_V6_OR_FUTURE_ADDRESS_SYNTAX)
};
// "v", one or more lowerHexDig, a dot, then any number of characters from
// the listed class (letters, digits and the punctuation shown).
static VarPattern ipVFuture[] = {
new VarPattern("v@{lowerHexDig}+\\.[-a-zA-Z0-9._~!$&'()*+,;=:]*")
};
// A single pattern built from nine alternatives: one without "::" (six
// "h16:" groups followed by ls32) and eight that place "::" after zero up to
// seven leading h16 groups.
// NOTE(review): this appears to follow the IPv6address production of
// RFC 3986 section 3.2.2 - confirm.
static VarPattern ipV6Address[] = {
new VarPattern("((@{h16}:){6}@{ls32}" + "|::(@{h16}:){5}@{ls32}"
+ "|@{h16}?::(@{h16}:){4}@{ls32}"
+ "|((@{h16}:){0,1}@{h16})?::(@{h16}:){3}@{ls32}"
+ "|((@{h16}:){0,2}@{h16})?::(@{h16}:){2}@{ls32}"
+ "|((@{h16}:){0,3}@{h16})?::(@{h16}:){1}@{ls32}"
+ "|((@{h16}:){0,4}@{h16})?::@{ls32}"
+ "|((@{h16}:){0,5}@{h16})?::@{h16}"
+ "|((@{h16}:){0,6}@{h16})?::)")
};
// One to four lowerHexDig characters.
static VarPattern h16[] = { new VarPattern("@{lowerHexDig}{1,4}"), };
// Either two h16 groups separated by ":" or an ipV4Address.
static VarPattern ls32[] = { new VarPattern(
"(@{h16}:@{h16}|@{ipV4Address})"),
};
// Four dot-separated components. The first alternative requires every
// component to be a decOctet; the second accepts any four digit runs and
// carries IP_V4_OCTET_RANGE.
static VarPattern ipV4Address[] = {
new VarPattern("(@{decOctet}\\.){3}@{decOctet}"),
new VarPattern("([0-9]+\\.){3}[0-9]+", IP_V4_OCTET_RANGE),
// RFC 1123 sec 2.1 modified the rules for host names to make just digits legal as a DNS name.
//new VarPattern("[0-9\\.]+\\.[0-9\\.]+", IP_V4_HAS_FOUR_COMPONENTS),
};
// A decimal number from 0 to 255 written without leading zeros.
static VarPattern decOctet[] = { new VarPattern(
"([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])") };
// Registered name: one or more label entries separated by dots, with an
// optional trailing dot. The fallback alternative "[^]*" carries
// NOT_DNS_NAME.
// NOTE(review): "[^]" is not valid java.util.regex syntax; presumably it
// means "any character" in the dialect these strings target - confirm.
static VarPattern regname[] = {
new VarPattern("(@{label}\\.)*@{label}\\.?"),
new VarPattern("[^]*", NOT_DNS_NAME),
};
// new VarPattern("(@{unreserved}|@{pctEncodedHost}|@{subDelims}|@{unwise})*",
// NOT_DNS_NAME),
// new VarPattern("[^:/@]*", ILLEGAL_CHARACTER), };
// Host alternatives, listed in this order: ipLiteral, ipV4Address, regname.
// NOTE(review): whether the order of alternatives affects matching depends
// on how VarPattern tables are consumed - confirm before reordering.
static VarPattern host[] = {
new VarPattern("@{ipLiteral}"),
new VarPattern("@{ipV4Address}"),
new VarPattern("@{regname}"),
// new VarPattern("[^:]*", ILLEGAL_CHARACTER),
};
// A hex digit. The first alternative allows only 0-9 and a-f; the second
// also allows A-F and carries IPv6ADDRESS_SHOULD_BE_LOWERCASE.
static VarPattern lowerHexDig[] = { new VarPattern("[0-9a-f]"),
new VarPattern("[0-9A-Fa-f]", IPv6ADDRESS_SHOULD_BE_LOWERCASE), };
/*
static VarPattern upperHexDig[] = {
new VarPattern("[0-9A-F]"),
new VarPattern("[0-9A-Fa-f]", PERCENT_ENCODING_SHOULD_BE_UPPERCASE), };
*/
/*
static VarPattern pctEncodedHost[] = {
// Should check pct encoding is UTF-8
// Also punycode preferred
new VarPattern("a"),
new VarPattern("@{pctEncoded}", USE_PUNYCODE_NOT_PERCENTS) };
*/
// ---- Pattern tables for one dot-separated label of a registered name ----
// "@{name}" refers to another table by its field name; a second constructor
// argument attaches ViolationCodes constant(s) to that alternative.
// NOTE(review): presumably the attached violation is raised when that
// alternative is the one that matches - confirm in VarPattern.

// A label is a labelPrefix, optionally followed by a labelInside and a
// labelPostfix.
static VarPattern label[] = {
// new VarPattern("@{acePrefix}@{labelChar}*",LABEL_HAS_ACE_PREFIX),
// new VarPattern("!(!@{labelAny}|@{labelDoubleDash})"),
new VarPattern("@{labelPrefix}(@{labelInside}@{labelPostfix})?"),
};
// Middle part of a label. The first alternative is an optional
// labelSingleDashInside; the second allows any mix of labelChar and "-" and
// carries DOUBLE_DASH_IN_REG_NAME.
static VarPattern labelInside[] = {
new VarPattern("@{labelSingleDashInside}?"),
new VarPattern("(@{labelChar}|-)*",
DOUBLE_DASH_IN_REG_NAME),
};
// Start of a label. The four alternatives progressively also allow a
// leading "-" (DNS_LABEL_DASH_START_OR_END), an acePrefix (ACE_PREFIX), or
// either of the two (both codes).
static VarPattern labelPrefix[] = {
new VarPattern("@{labelChar}"),
new VarPattern("-|@{labelChar}",DNS_LABEL_DASH_START_OR_END),
new VarPattern("@{labelChar}|@{acePrefix}",ACE_PREFIX),
new VarPattern("@{labelChar}|@{acePrefix}|-",new int[] {
ACE_PREFIX,
DNS_LABEL_DASH_START_OR_END }),
};
// End of a label: a labelChar, or (with DNS_LABEL_DASH_START_OR_END) a "-".
static VarPattern labelPostfix[] = {
new VarPattern("@{labelChar}"),
new VarPattern("-|@{labelChar}",DNS_LABEL_DASH_START_OR_END),
};
// Runs of one or more labelChar joined by single "-" characters; the
// pattern neither starts nor ends with "-".
static VarPattern labelSingleDashInside[] = { new VarPattern(
"(@{labelChar}+-)*@{labelChar}+"), };
// Two letterDigit characters followed by "--".
static VarPattern acePrefix[] = { new VarPattern("@{letterDigit}{2}--"), };
// A lower-case letter or digit; the second alternative also allows upper
// case and carries LOWERCASE_PREFERRED.
static VarPattern letterDigit[] = { new VarPattern("[a-z0-9]"),
new VarPattern("[a-zA-Z0-9]", LOWERCASE_PREFERRED), };
// A label character is exactly an unreservedDNSLabel.
static VarPattern labelChar[] = {
new VarPattern("@{unreservedDNSLabel}") };
// A letterDigit or "_". The second alternative also allows the character
// range written as x80 to uFFFF in the pattern and carries
// NON_URI_CHARACTER.
static VarPattern unreservedDNSLabel[] = {
new VarPattern("@{letterDigit}|_"),
new VarPattern(
"@{letterDigit}|[_\\x80-\\uFFFF]",
NON_URI_CHARACTER)
};
/**
 * Fetches the pattern table held in the static field of this class whose
 * name is {@code name}.
 *
 * @param name exact name of a field declared in PatternCompiler
 * @return the current value of that field, cast to {@code VarPattern[]}
 * @throws RuntimeException wrapping the NoSuchFieldException or
 *         IllegalAccessException raised by the reflective lookup; unchecked
 *         exceptions (including a failed cast) propagate unchanged
 */
public static VarPattern[] lookup(String name) {
    try {
        Object table = PatternCompiler.class.getDeclaredField(name).get(null);
        return (VarPattern[]) table;
    } catch (NoSuchFieldException | IllegalAccessException checked) {
        // Only the checked reflection failures need wrapping; anything
        // unchecked leaves this method exactly as it was thrown.
        throw new RuntimeException(checked);
    }
}
// Code-to-name table for ViolationCodes; stays null until errorCodeName
// is called for the first time.
static String eCodeNames[];

/**
 * Returns the name of the ViolationCodes constant whose value is {@code j}.
 *
 * The table is built on first use with 200 slots, so an index outside
 * 0..199 raises ArrayIndexOutOfBoundsException, and a slot that no constant
 * maps to yields {@code null}.
 *
 * @param j a violation code
 * @return the constant's field name, or {@code null} if none has that value
 */
public static String errorCodeName(int j) {
    String[] table = eCodeNames;
    if (table == null) {
        table = constantsFromClass(ViolationCodes.class, 200);
        eCodeNames = table;
    }
    return table[j];
}
static String[] constantsFromClass(Class> cl, int cnt) {
String[] names;
names = new String[cnt];
Field f[] = cl.getDeclaredFields();
for ( Field aF : f )
{
try
{
names[aF.getInt( null )] = aF.getName();
}
catch ( IllegalArgumentException | IllegalAccessException e )
{
e.printStackTrace();
}
}
return names;
}
/**
 * Returns the int value of the ViolationCodes constant named {@code s}.
 *
 * @param s exact field name of a constant declared in ViolationCodes
 * @return the value of that constant
 * @throws NoSuchFieldException if ViolationCodes declares no field with
 *         that name
 * @throws RuntimeException wrapping a SecurityException,
 *         IllegalAccessException or IllegalArgumentException raised while
 *         reading the field (for example when the field is not an int)
 */
public static int errorCode(String s) throws NoSuchFieldException {
    try {
        return ViolationCodes.class.getDeclaredField(s).getInt(null);
    } catch (SecurityException | IllegalAccessException | IllegalArgumentException e) {
        // The cause travels with the rethrown exception, so it is not also
        // printed here; that would report the same failure twice.
        throw new RuntimeException("Cannot read violation code constant '" + s + "'", e);
    }
}
/*
*
* Unicode LTR stuff:
*
* 200E LEFT-TO-RIGHT MARK, 200F RIGHT-TO-LEFT MARK, 202A LEFT-TO-RIGHT
* EMBEDDING, 202B RIGHT-TO-LEFT EMBEDDING, 202C POP DIRECTIONAL FORMATTING,
* 202D LEFT-TO-RIGHT OVERRIDE, 202E RIGHT-TO-LEFT OVERRIDE
*
* XSD whitespace facet values:
* preserve: No normalization is done, the value is not changed (this is
* the behavior required by [XML 1.0 (Second Edition)] for element content).
* replace: All occurrences of #x9 (tab), #xA (line feed) and #xD (carriage
* return) are replaced with #x20 (space).
* collapse: After the processing implied by replace, contiguous sequences
* of #x20's are collapsed to a single #x20, and leading and trailing
* #x20's are removed.
*
*
*
*
*
*
*
*
*
*
* XML 1.0
*
* [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
* [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate
* blocks, FFFE, and FFFF.
*
*
* Note:
*
* Document authors are encouraged to avoid "compatibility characters", as
* defined in section 6.8 of [Unicode] (see also D21 in section 3.6 of
* [Unicode3]). The characters defined in the following ranges are also
* discouraged. They are either control characters or permanently undefined
* Unicode characters:
*
* [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], [#1FFFE-#x1FFFF],
* [#2FFFE-#x2FFFF], [#3FFFE-#x3FFFF], [#4FFFE-#x4FFFF], [#5FFFE-#x5FFFF],
* [#6FFFE-#x6FFFF], [#7FFFE-#x7FFFF], [#8FFFE-#x8FFFF], [#9FFFE-#x9FFFF],
* [#AFFFE-#xAFFFF], [#BFFFE-#xBFFFF], [#CFFFE-#xCFFFF], [#DFFFE-#xDFFFF],
* [#EFFFE-#xEFFFF], [#FFFFE-#xFFFFF], [#10FFFE-#x10FFFF].
*
*
* XML 1.1 [Definition: A parsed entity contains text, a sequence of
* characters, which may represent markup or character data.] [Definition: A
* character is an atomic unit of text as specified by ISO/IEC 10646
* [ISO/IEC 10646]. Legal characters are tab, carriage return, line feed,
* and the legal characters of Unicode and ISO/IEC 10646. The versions of
* these standards cited in A.1 Normative References were current at the
* time this document was prepared. New characters may be added to these
* standards by amendments or new editions. Consequently, XML processors
* MUST accept any character in the range specified for Char.] Character
* Range [2] Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /*
* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. * /
* [2a] RestrictedChar ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] |
* [#x86-#x9F]
*
* The mechanism for encoding character code points into bit patterns MAY
* vary from entity to entity. All XML processors MUST accept the UTF-8 and
* UTF-16 encodings of Unicode [Unicode]; the mechanisms for signaling which
* of the two is in use, or for bringing other encodings into play, are
* discussed later, in 4.3.3 Character Encoding in Entities.
*
* Note:
*
* Document authors are encouraged to avoid "compatibility characters", as
* defined in Unicode [Unicode]. The characters defined in the following
* ranges are also discouraged. They are either control characters or
* permanently undefined Unicode characters:
*
* [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], [#1FFFE-#x1FFFF],
* [#2FFFE-#x2FFFF], [#3FFFE-#x3FFFF], [#4FFFE-#x4FFFF], [#5FFFE-#x5FFFF],
* [#6FFFE-#x6FFFF], [#7FFFE-#x7FFFF], [#8FFFE-#x8FFFF], [#9FFFE-#x9FFFF],
* [#AFFFE-#xAFFFF], [#BFFFE-#xBFFFF], [#CFFFE-#xCFFFF], [#DFFFE-#xDFFFF],
* [#EFFFE-#xEFFFF], [#FFFFE-#xFFFFF], [#10FFFE-#x10FFFF].
*/
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy