// net.sf.saxon.regex.charclass.Categories Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of Saxon-HE Show documentation
// Show all versions of Saxon-HE Show documentation
// The XSLT and XQuery Processor
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2023 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.regex.charclass;
import net.sf.saxon.Configuration;
import net.sf.saxon.Version;
import net.sf.saxon.event.Builder;
import net.sf.saxon.lib.ParseOptions;
import net.sf.saxon.lib.Validation;
import net.sf.saxon.om.AxisInfo;
import net.sf.saxon.om.NamespaceUri;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.pattern.NameTest;
import net.sf.saxon.pattern.NodeKindTest;
import net.sf.saxon.serialize.charcode.XMLCharacterData;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.iter.AxisIterator;
import net.sf.saxon.tree.tiny.TinyElementImpl;
import net.sf.saxon.type.Type;
import net.sf.saxon.z.*;
import javax.xml.transform.stream.StreamSource;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
/**
 * Data for Regular expression character categories. The data is in an XML file derived from the Unicode
 * database (In Saxon 9.6, this is based on Unicode 6.2.0). Since Saxon 9.4,
 * we no longer make use of Java's support for character categories since there are too many differences
 * from Unicode.
 *
 * <p>The category table is loaded lazily from the {@code categories.xml} resource on the first call to
 * {@link #getCategory(String)}; the {@code ESCAPE_*} constants of this class trigger that load from
 * their own initializers.</p>
 */
public class Categories {

    /**
     * A Category is a CharacterClass represented in a regular expression as \p{Xx}.
     * The label Xx is retained, and can be used to determine whether or not two
     * categories are disjoint.
     */
    public static class Category implements CharacterClass {

        private final String label;
        private final IntPredicateProxy predicate;

        /**
         * Create a category.
         *
         * @param label     the category name, for example L or Lu
         * @param predicate the membership test to which {@link #test(int)} delegates
         */
        public Category(String label, IntPredicateProxy predicate) {
            this.label = label;
            this.predicate = predicate;
        }

        /**
         * Test whether a codepoint belongs to this category.
         *
         * @param value the codepoint to test
         * @return the result of applying the underlying predicate to the codepoint
         */
        @Override
        public boolean test(int value) {
            return predicate.test(value);
        }

        /**
         * Ask whether this category is known to have no members in common with another
         * character class. A result of false means "not known to be disjoint".
         *
         * @param other the other character class
         * @return true if the two classes are established to be disjoint by the rules below
         */
        @Override
        public boolean isDisjoint(CharacterClass other) {
            if (other instanceof Category) {
                // Two categories are disjoint if their first letters differ, or if both
                // labels are longer than one character and are not the same label.
                // A one-letter label is never reported disjoint from a label sharing its letter.
                char majorCat0 = label.charAt(0);
                String otherLabel = ((Category) other).label;
                char majorCat1 = otherLabel.charAt(0);
                return majorCat0 != majorCat1 ||
                        (label.length() > 1 && otherLabel.length() > 1 && !label.equals(otherLabel));
            } else if (other instanceof InverseCharacterClass) {
                // Let the inverse class answer the question
                return other.isDisjoint(this);
            } else if (other instanceof SingletonCharacterClass) {
                return !test(((SingletonCharacterClass) other).getCodepoint());
            } else if (other instanceof IntSetCharacterClass) {
                IntSet intSet = other.getIntSet();
                if (intSet.size() > 100) {
                    // too expensive to test, and increasingly likely to be non-disjoint anyway
                    return false;
                }
                IntIterator ii = intSet.iterator();
                while (ii.hasNext()) {
                    if (test(ii.next())) {
                        return false;
                    }
                }
                return true;
            } else {
                return false;
            }
        }

        /**
         * Get the set of codepoints in this category, if it is available.
         *
         * @return the set held by the underlying predicate when that predicate is an
         * {@link IntSetPredicate}; otherwise null
         */
        @Override
        public IntSet getIntSet() {
            return extent(predicate);
        }

        /**
         * Extract the integer set behind a predicate.
         *
         * @param predicate the predicate to examine
         * @return the predicate's set if it is an {@link IntSetPredicate}, otherwise null
         */
        private static IntSet extent(IntPredicateProxy predicate) {
            if (predicate instanceof IntSetPredicate) {
                return ((IntSetPredicate) predicate).getIntSet();
            }
            return null;
        }
    }

    // Cache of categories keyed by label; null until build() has been called.
    // This declaration must stay textually ahead of the ESCAPE_* constants: their
    // initializers call getCategory(), and static initializers run in source order.
    private static Map<String, Category> CATEGORIES = null;

    /**
     * Load the category table from the {@code categories.xml} resource and derive the
     * one-letter categories C, L, M, N, P, S and Z from the categories read from the file.
     * If loading fails, the cache is reset so that a partially filled table is not left
     * behind for later lookups.
     *
     * @throws RuntimeException if the resource cannot be located or cannot be parsed
     */
    static void build() {
        Map<String, Category> categories = new HashMap<>(30);
        // As before, the (still empty) map is installed before loading starts, so whatever
        // runs during the load observes the same state of the cache as it always did.
        CATEGORIES = categories;
        boolean loaded = false;
        try {
            populate(categories);
            loaded = true;
        } finally {
            if (!loaded) {
                CATEGORIES = null;
            }
        }
    }

    /**
     * Read categories.xml into the supplied map, then add the one-letter categories.
     *
     * @param categories the map to be filled, keyed by category label
     */
    private static void populate(Map<String, Category> categories) {
        InputStream in = Version.platform.locateResource("categories.xml", new ArrayList<>());
        if (in == null) {
            throw new RuntimeException("Unable to read categories.xml file");
        }
        Configuration config = new Configuration();
        ParseOptions options = new ParseOptions()
                .withSchemaValidationMode(Validation.SKIP)
                .withDTDValidationMode(Validation.SKIP)
                .withTreeModel(Builder.TINY_TREE)
                .withPleaseCloseAfterUse(true);
        NodeInfo doc;
        try {
            doc = config.buildDocumentTree(new StreamSource(in, "categories.xml"), options).getRootNode();
        } catch (XPathException e) {
            throw new RuntimeException("Failed to build categories.xml", e);
        }

        int fp_name = config.getNamePool().allocateFingerprint(NamespaceUri.NULL, "name");
        int fp_f = config.getNamePool().allocateFingerprint(NamespaceUri.NULL, "f");
        int fp_t = config.getNamePool().allocateFingerprint(NamespaceUri.NULL, "t");

        // Each "cat" element carries a "name" attribute and has child elements whose
        // "f" and "t" attributes are the hexadecimal bounds of a codepoint range.
        AxisIterator iter = doc.iterateAxis(AxisInfo.DESCENDANT,
                new NameTest(Type.ELEMENT, NamespaceUri.NULL, "cat", config.getNamePool()));
        for (NodeInfo item; (item = iter.next()) != null; ) {
            String cat = ((TinyElementImpl) item).getAttributeValue(fp_name);
            IntRangeSet irs = new IntRangeSet();
            for (NodeInfo r : item.children(NodeKindTest.ELEMENT)) {
                String from = ((TinyElementImpl) r).getAttributeValue(fp_f);
                String to = ((TinyElementImpl) r).getAttributeValue(fp_t);
                irs.addRange(Integer.parseInt(from, 16), Integer.parseInt(to, 16));
            }
            categories.put(cat, new Category(cat, new IntSetPredicate(irs)));
        }

        // A one-letter category is the union of every category registered so far whose
        // label starts with that letter. The new entry is only added once the scan of the
        // map has finished, so the map is never modified while it is being iterated.
        String c = "CLMNPSZ";
        for (int i = 0; i < c.length(); i++) {
            char ch = c.charAt(i);
            IntPredicateProxy ip = null;
            for (Map.Entry<String, Category> entry : categories.entrySet()) {
                if (entry.getKey().charAt(0) == ch) {
                    ip = ip == null ? entry.getValue() : IntUnionPredicate.makeUnion(ip, entry.getValue());
                }
            }
            String label = String.valueOf(ch);
            categories.put(label, new Category(label, ip));
        }
    }

    /** The class for \s: the codepoints 9, 10, 13 and 32. */
    public static final CharacterClass ESCAPE_s =
            new IntSetCharacterClass(IntArraySet.make(new int[]{9, 10, 13, 32}, 4));

    /** The class for \S: the inverse of {@link #ESCAPE_s}. */
    public static final CharacterClass ESCAPE_S = new InverseCharacterClass(ESCAPE_s);

    /** The class for \i: codepoints accepted by {@code XMLCharacterData.isNCNameStart11}, plus the colon. */
    public static final PredicateCharacterClass ESCAPE_i =
            new PredicateCharacterClass(value -> XMLCharacterData.isNCNameStart11(value) || value == ':');

    /** The class for \I: the inverse of {@link #ESCAPE_i}. */
    public static final CharacterClass ESCAPE_I = new InverseCharacterClass(ESCAPE_i);

    /** The class for \c: codepoints accepted by {@code XMLCharacterData.isNCName11}, plus the colon. */
    public static final PredicateCharacterClass ESCAPE_c =
            new PredicateCharacterClass(value -> XMLCharacterData.isNCName11(value) || value == ':');

    /** The class for \C: the inverse of {@link #ESCAPE_c}. */
    public static final CharacterClass ESCAPE_C = new InverseCharacterClass(ESCAPE_c);

    /** The class for \d: the category Nd. */
    public static final Category ESCAPE_d = getCategory("Nd");

    /** The class for \D: the inverse of {@link #ESCAPE_d}. */
    public static final CharacterClass ESCAPE_D = new InverseCharacterClass(ESCAPE_d);

    // The one-letter categories that are excluded from \w
    static Category CATEGORY_P = getCategory("P");
    static Category CATEGORY_Z = getCategory("Z");
    static Category CATEGORY_C = getCategory("C");

    /** The class for \w: every codepoint that is in none of the categories P, Z and C. */
    public static final PredicateCharacterClass ESCAPE_w =
            new PredicateCharacterClass(value ->
                    !(CATEGORY_P.test(value) || CATEGORY_Z.test(value) || CATEGORY_C.test(value)));

    /** The class for \W: the inverse of {@link #ESCAPE_w}. */
    public static final CharacterClass ESCAPE_W = new InverseCharacterClass(ESCAPE_w);

    /**
     * Get a predicate to test characters for membership of one of the Unicode
     * character categories. The category table is built on the first call.
     *
     * @param cat a one-character or two-character category name, for example L or Lu
     * @return a predicate that tests whether a given character belongs to the category,
     * or null if no category is registered under that name
     */
    public static synchronized Category getCategory(String cat) {
        if (CATEGORIES == null) {
            build();
        }
        return CATEGORIES.get(cat);
    }
}
// The following stylesheet was used to generate the categories.xml file from the Unicode 6.2.0 database:
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//