// org.semanticweb.owlapi.io.XMLUtils - Maven / Gradle / Ivy
// The newest version!
/*
* This file is part of the OWL API.
*
* The contents of this file are subject to the LGPL License, Version 3.0.
*
* Copyright (C) 2011, The University of Manchester
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*
*
* Alternatively, the contents of this file may be used under the terms of the Apache License, Version 2.0
* in which case, the provisions of the Apache License Version 2.0 are applicable instead of those above.
*
* Copyright 2011, The University of Manchester
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*/
package org.semanticweb.owlapi.io;
/**
 * Author: Matthew Horridge<br>
 * The University of Manchester<br>
 * Bio-Health Informatics Group<br>
 * Date: 22/09/2011
 * <p>
 * This class contains various static methods for checking QNames, NCNames etc.,
 * for splitting a character sequence at its NCName suffix, and for escaping
 * the five characters that have predefined XML entities.
 * The implementation is based on the W3C Namespaces in XML specification.
 *
 * @since 3.3.0
 */
@SuppressWarnings("javadoc")
public class XMLUtils {

    /** Predefined XML entity reference for the less-than sign ({@code <}). */
    public static final String LT = "&lt;";

    /** Predefined XML entity reference for the greater-than sign ({@code >}). */
    public static final String GT = "&gt;";

    /** Predefined XML entity reference for the double quote ({@code "}). */
    public static final String QUOT = "&quot;";

    /** Predefined XML entity reference for the ampersand ({@code &}). */
    public static final String AMP = "&amp;";

    /** Predefined XML entity reference for the apostrophe ({@code '}). */
    public static final String APOS = "&apos;";

    // For some point in the future
    public static final String OWL_PROCESSING_INSTRUCTION_NAME = "owl";

    /**
     * Determines if a character is an XML name start character
     * (the {@code NameStartChar} production of the XML specification).
     *
     * @param codePoint The Unicode code point of the character to be tested. For characters
     *                  in the Basic Multilingual Plane this is simply the value of the
     *                  {@code char}.
     * @return {@code true} if {@code codePoint} is an XML name start character,
     *         otherwise {@code false}.
     */
    public static boolean isXMLNameStartCharacter(int codePoint) {
        return codePoint == ':'
                || codePoint >= 'A' && codePoint <= 'Z'
                || codePoint == '_'
                || codePoint >= 'a' && codePoint <= 'z'
                || codePoint >= 0xC0 && codePoint <= 0xD6
                || codePoint >= 0xD8 && codePoint <= 0xF6
                || codePoint >= 0xF8 && codePoint <= 0x2FF
                || codePoint >= 0x370 && codePoint <= 0x37D
                || codePoint >= 0x37F && codePoint <= 0x1FFF
                || codePoint >= 0x200C && codePoint <= 0x200D
                || codePoint >= 0x2070 && codePoint <= 0x218F
                || codePoint >= 0x2C00 && codePoint <= 0x2FEF
                || codePoint >= 0x3001 && codePoint <= 0xD7FF
                || codePoint >= 0xF900 && codePoint <= 0xFDCF
                || codePoint >= 0xFDF0 && codePoint <= 0xFFFD
                || codePoint >= 0x10000 && codePoint <= 0xEFFFF;
    }

    /**
     * Determines if a character is an XML name character
     * (the {@code NameChar} production of the XML specification).
     *
     * @param codePoint The Unicode code point of the character to be tested. For characters
     *                  in the Basic Multilingual Plane this is simply the value of the
     *                  {@code char}.
     * @return {@code true} if {@code codePoint} is an XML name character,
     *         otherwise {@code false}.
     */
    public static boolean isXMLNameChar(int codePoint) {
        return isXMLNameStartCharacter(codePoint)
                || codePoint == '-'
                || codePoint == '.'
                || codePoint >= '0' && codePoint <= '9'
                || codePoint == 0xB7
                || codePoint >= 0x0300 && codePoint <= 0x036F
                || codePoint >= 0x203F && codePoint <= 0x2040;
    }

    /**
     * Determines if a character is an NCName (Non-Colonised Name) start character, i.e. an
     * XML name start character other than the colon.
     *
     * @param codePoint The Unicode code point of the character to be tested.
     * @return {@code true} if {@code codePoint} is an NCName start character,
     *         otherwise {@code false}.
     */
    public static boolean isNCNameStartChar(int codePoint) {
        return codePoint != ':' && isXMLNameStartCharacter(codePoint);
    }

    /**
     * Determines if a character is an NCName (Non-Colonised Name) character, i.e. an
     * XML name character other than the colon.
     *
     * @param codePoint The Unicode code point of the character to be tested.
     * @return {@code true} if {@code codePoint} is an NCName character,
     *         otherwise {@code false}.
     */
    public static boolean isNCNameChar(int codePoint) {
        return codePoint != ':' && isXMLNameChar(codePoint);
    }

    /**
     * Determines if a character sequence is an NCName (Non-Colonised Name). An NCName is a
     * string which starts with an NCName start character and is followed by zero or more
     * NCName characters.
     *
     * @param s The character sequence to be tested. May be {@code null}.
     * @return {@code true} if {@code s} is an NCName, otherwise {@code false}
     *         (including when {@code s} is {@code null} or empty).
     */
    public static boolean isNCName(CharSequence s) {
        if (isNullOrEmpty(s)) {
            return false;
        }
        int firstCodePoint = Character.codePointAt(s, 0);
        if (!isNCNameStartChar(firstCodePoint)) {
            return false;
        }
        // Advance by code point so that surrogate pairs are tested as one character.
        for (int i = Character.charCount(firstCodePoint); i < s.length();) {
            int codePoint = Character.codePointAt(s, i);
            if (!isNCNameChar(codePoint)) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Determines if a character sequence is a QName. A QName is either an NCName (LocalName),
     * or an NCName followed by a colon followed by another NCName (where the first NCName is
     * referred to as the 'Prefix Name' and the second NCName is referred to as the
     * 'Local Name' - i.e. PrefixName:LocalName).
     *
     * @param s The character sequence to be tested. May be {@code null}.
     * @return {@code true} if {@code s} is a QName, otherwise {@code false}
     *         (including when {@code s} is {@code null} or empty, starts or ends with a
     *         colon, or contains more than one colon).
     */
    public static boolean isQName(CharSequence s) {
        if (isNullOrEmpty(s)) {
            return false;
        }
        boolean foundColon = false;
        // true while the scan is inside a non-empty NCName (prefix or local part)
        boolean inNCName = false;
        for (int i = 0; i < s.length();) {
            int codePoint = Character.codePointAt(s, i);
            if (codePoint == ':') {
                // Only one colon is allowed and it must follow a non-empty prefix.
                if (foundColon || !inNCName) {
                    return false;
                }
                foundColon = true;
                inNCName = false;
            } else if (!inNCName) {
                if (!isNCNameStartChar(codePoint)) {
                    return false;
                }
                inNCName = true;
            } else if (!isNCNameChar(codePoint)) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        // The scan must finish inside an NCName; otherwise the input ended with a colon
        // and the local part is empty (e.g. "prefix:"), which is not a QName.
        return inNCName;
    }

    /**
     * Determines if a character sequence has a suffix that is an NCName.
     *
     * @param s The character sequence. Must not be {@code null}.
     * @return {@code true} if the character sequence {@code s} has a suffix that is an
     *         NCName, otherwise {@code false}.
     * @throws NullPointerException if {@code s} is {@code null}.
     */
    public static boolean hasNCNameSuffix(CharSequence s) {
        return getNCNameSuffixIndex(s) != -1;
    }

    /**
     * Gets the index of the longest NCName that is the suffix of a character sequence.
     * Sequences that start with {@code _:} are treated as blank node labels and are never
     * split.
     *
     * @param s The character sequence. Must not be {@code null}.
     * @return The start index of the longest suffix of the specified character sequence
     *         {@code s} that is an NCName, or -1 if the character sequence {@code s} does
     *         not have a suffix that is an NCName.
     * @throws NullPointerException if {@code s} is {@code null}.
     */
    public static int getNCNameSuffixIndex(CharSequence s) {
        // identify bnode labels and do not try to split them
        if (s.length() > 1 && s.charAt(0) == '_' && s.charAt(1) == ':') {
            return -1;
        }
        int index = -1;
        // Walk backwards one code point at a time. codePointBefore yields the bare surrogate
        // value for an unpaired surrogate, which is not an NCName character, so malformed
        // UTF-16 terminates the scan instead of being absorbed into the suffix.
        for (int end = s.length(); end > 0;) {
            int codePoint = Character.codePointBefore(s, end);
            if (!isNCNameChar(codePoint)) {
                break;
            }
            end -= Character.charCount(codePoint);
            // Every NCName character extends the run, but only a start character may
            // begin the suffix, so remember the left-most start character seen.
            if (isNCNameStartChar(codePoint)) {
                index = end;
            }
        }
        return index;
    }

    /**
     * Get the longest NCName that is a suffix of a character sequence.
     *
     * @param s The character sequence. Must not be {@code null}.
     * @return The String which is the longest suffix of the character sequence {@code s}
     *         that is an NCName, or {@code null} if the character sequence {@code s} does
     *         not have a suffix that is an NCName (this includes blank node labels, i.e.
     *         sequences starting with {@code _:}).
     * @throws NullPointerException if {@code s} is {@code null}.
     */
    public static String getNCNameSuffix(CharSequence s) {
        // getNCNameSuffixIndex already returns -1 for blank node labels.
        int localPartStartIndex = getNCNameSuffixIndex(s);
        if (localPartStartIndex > -1) {
            return s.toString().substring(localPartStartIndex);
        }
        return null;
    }

    /**
     * Utility to get the part of a character sequence that is not the NCName fragment.
     *
     * @param s The character sequence to split. Must not be {@code null}.
     * @return The prefix split at the last non-NCName character, or the whole input if no
     *         NCName suffix is found (this includes blank node labels, i.e. sequences
     *         starting with {@code _:}).
     * @throws NullPointerException if {@code s} is {@code null}.
     */
    public static String getNCNamePrefix(CharSequence s) {
        // getNCNameSuffixIndex already returns -1 for blank node labels.
        int localPartStartIndex = getNCNameSuffixIndex(s);
        if (localPartStartIndex > -1) {
            return s.toString().substring(0, localPartStartIndex);
        }
        return s.toString();
    }

    /**
     * Escapes a character sequence by replacing each of the five characters that have a
     * predefined XML entity with the corresponding entity reference:
     * {@code <}, {@code >}, {@code "}, {@code &} and {@code '}. All other characters are
     * copied unchanged.
     *
     * @param s The character sequence. Must not be {@code null}.
     * @return The escaped version of the character sequence.
     * @throws NullPointerException if {@code s} is {@code null}.
     */
    public static String escapeXML(CharSequence s) {
        StringBuilder sb = new StringBuilder(s.length() * 2);
        for (int i = 0; i < s.length();) {
            int codePoint = Character.codePointAt(s, i);
            switch (codePoint) {
                case '<':
                    sb.append(LT);
                    break;
                case '>':
                    sb.append(GT);
                    break;
                case '"':
                    sb.append(QUOT);
                    break;
                case '&':
                    sb.append(AMP);
                    break;
                case '\'':
                    sb.append(APOS);
                    break;
                default:
                    sb.appendCodePoint(codePoint);
                    break;
            }
            i += Character.charCount(codePoint);
        }
        return sb.toString();
    }

    /**
     * Determines if a character sequence is {@code null} or empty.
     *
     * @param s The character sequence.
     * @return {@code true} if the character sequence is {@code null}, {@code true} if the
     *         character sequence is empty, otherwise {@code false}.
     */
    private static boolean isNullOrEmpty(CharSequence s) {
        return s == null || s.length() == 0;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy