net.sf.saxon.regex.Categories Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of Saxon-HE Show documentation
Show all versions of Saxon-HE Show documentation
The XSLT and XQuery Processor
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.regex;
import net.sf.saxon.Configuration;
import net.sf.saxon.lib.ParseOptions;
import net.sf.saxon.lib.Validation;
import net.sf.saxon.om.AxisInfo;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.pattern.NameTest;
import net.sf.saxon.pattern.NodeKindTest;
import net.sf.saxon.serialize.charcode.XMLCharacterData;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.iter.AxisIterator;
import net.sf.saxon.tree.util.Navigator;
import net.sf.saxon.type.Type;
import net.sf.saxon.z.*;
import javax.xml.transform.stream.StreamSource;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
/**
 * Data for Regular expression character categories. The data is in an XML file derived from the Unicode
 * database (In Saxon 9.6, this is based on Unicode 6.2.0). Since Saxon 9.4,
 * we no longer make use of Java's support for character categories since there are too many differences
 * from Unicode.
 *
 * <p>The category table is loaded from the <code>categories.xml</code> resource the first time
 * {@link #getCategory(String)} is called; because several static fields of this class are initialized
 * through that method, this happens while the class itself is being initialized.</p>
 */
public class Categories {

    /**
     * Map from category name (two-character names read from categories.xml, plus the derived
     * one-character names) to the predicate testing membership of that category.
     * <p>This field must stay textually ahead of the static fields below that call
     * {@link #getCategory(String)}, and must not have an explicit initializer: static initializers run
     * in textual order, so an initializer placed after those fields would discard the loaded table.</p>
     */
    private static Map<String, IntPredicate> CATEGORIES;

    /**
     * Load categories.xml and populate the category table.
     * <p>Every <code>cat</code> element contributes one entry keyed by its <code>name</code> attribute,
     * whose ranges are taken from the hexadecimal <code>f</code> and <code>t</code> attributes of its
     * child elements. Afterwards, for each letter of "CLMNPSZ", an entry keyed by that single letter is
     * added holding the union of all entries whose name starts with the letter.</p>
     *
     * @throws RuntimeException if the resource cannot be located or cannot be parsed
     */
    static void build() {
        InputStream in = Configuration.locateResource(
                "categories.xml", new ArrayList<String>(), new ArrayList<ClassLoader>());
        if (in == null) {
            throw new RuntimeException("Unable to read categories.xml file");
        }

        // Populate a local map and publish it only once complete, so a failure part-way through
        // never leaves a non-null but incomplete table behind.
        Map<String, IntPredicate> categories = new HashMap<String, IntPredicate>(30);

        Configuration config = new Configuration();
        ParseOptions options = new ParseOptions();
        options.setSchemaValidationMode(Validation.SKIP);
        NodeInfo doc;
        try {
            doc = config.buildDocumentTree(new StreamSource(in, "categories.xml"), options).getRootNode();
        } catch (XPathException e) {
            throw new RuntimeException("Failed to build categories.xml", e);
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // The document has already been read (or the parse failure is already propagating);
                // a failure to close the stream carries no additional information.
            }
        }

        AxisIterator iter = doc.iterateAxis(
                AxisInfo.DESCENDANT, new NameTest(Type.ELEMENT, "", "cat", config.getNamePool()));
        while (true) {
            NodeInfo item = iter.next();
            if (item == null) {
                break;
            }
            String cat = Navigator.getAttributeValue(item, "", "name");
            IntRangeSet irs = new IntRangeSet();
            AxisIterator iter2 = item.iterateAxis(AxisInfo.CHILD, NodeKindTest.ELEMENT);
            while (true) {
                NodeInfo r = iter2.next();
                if (r == null) {
                    break;
                }
                // Range bounds are written in hexadecimal.
                String from = Navigator.getAttributeValue(r, "", "f");
                String to = Navigator.getAttributeValue(r, "", "t");
                irs.addRange(Integer.parseInt(from, 16), Integer.parseInt(to, 16));
            }
            categories.put(cat, new IntSetPredicate(irs));
        }

        // Derive the one-letter categories as the union of every category sharing that first letter.
        String c = "CLMNPSZ";
        for (int i = 0; i < c.length(); i++) {
            char ch = c.charAt(i);
            IntPredicate ip = null;
            for (Map.Entry<String, IntPredicate> entry : categories.entrySet()) {
                if (entry.getKey().charAt(0) == ch) {
                    ip = (ip == null ? entry.getValue() : new IntUnionPredicate(ip, entry.getValue()));
                }
            }
            // The put happens after the entry-set iteration has finished, so the map is never
            // modified while it is being iterated.
            categories.put(ch + "", ip);
        }

        CATEGORIES = categories;
    }

    /**
     * Predicate for the <code>\s</code> escape: code points 9, 10, 13 and 32.
     */
    public final static IntPredicate ESCAPE_s =
            new IntSetPredicate(IntArraySet.make(new int[]{9, 10, 13, 32}, 4));

    /**
     * Predicate for the <code>\S</code> escape: the complement of {@link #ESCAPE_s}.
     */
    public final static IntPredicate ESCAPE_S = new IntComplementPredicate(ESCAPE_s);

    /**
     * Predicate for the <code>\i</code> escape: characters accepted by
     * {@link XMLCharacterData#isNCNameStart11(int)}, plus the colon.
     */
    public final static IntPredicate ESCAPE_i = new IntPredicate() {
        @Override
        public boolean matches(int value) {
            return XMLCharacterData.isNCNameStart11(value) || value == ':';
        }
    };

    /**
     * Predicate for the <code>\I</code> escape: the negation of {@link #ESCAPE_i}.
     */
    public final static IntPredicate ESCAPE_I = new IntPredicate() {
        @Override
        public boolean matches(int value) {
            return !(XMLCharacterData.isNCNameStart11(value) || value == ':');
        }
    };

    /**
     * Predicate for the <code>\c</code> escape: characters accepted by
     * {@link XMLCharacterData#isNCName11(int)}, plus the colon.
     */
    public final static IntPredicate ESCAPE_c = new IntPredicate() {
        @Override
        public boolean matches(int value) {
            return XMLCharacterData.isNCName11(value) || value == ':';
        }
    };

    /**
     * Predicate for the <code>\C</code> escape: the negation of {@link #ESCAPE_c}.
     */
    public final static IntPredicate ESCAPE_C = new IntPredicate() {
        @Override
        public boolean matches(int value) {
            return !(XMLCharacterData.isNCName11(value) || value == ':');
        }
    };

    /**
     * Predicate for the <code>\d</code> escape: the category "Nd". Initializing this field is what
     * first triggers {@link #build()}.
     */
    public final static IntPredicate ESCAPE_d = getCategory("Nd");

    /**
     * Predicate for the <code>\D</code> escape: the complement of {@link #ESCAPE_d}.
     */
    public final static IntPredicate ESCAPE_D = new IntComplementPredicate(ESCAPE_d);

    // The three one-letter categories that \w excludes.
    static IntPredicate CATEGORY_P = getCategory("P");
    static IntPredicate CATEGORY_Z = getCategory("Z");
    static IntPredicate CATEGORY_C = getCategory("C");

    /**
     * Predicate for the <code>\w</code> escape: every character that is in none of the categories
     * P, Z and C.
     */
    public final static IntPredicate ESCAPE_w = new IntPredicate() {
        @Override
        public boolean matches(int value) {
            return !(CATEGORY_P.matches(value) || CATEGORY_Z.matches(value) || CATEGORY_C.matches(value));
        }
    };

    /**
     * Predicate for the <code>\W</code> escape: the complement of {@link #ESCAPE_w}.
     */
    public final static IntPredicate ESCAPE_W = new IntComplementPredicate(ESCAPE_w);

    /**
     * Get a predicate to test characters for membership of one of the Unicode
     * character categories. The category table is loaded on the first call.
     *
     * @param cat a one-character or two-character category name, for example L or Lu
     * @return a predicate that tests whether a given character belongs to the category,
     *         or null if the table has no entry under that name
     * @throws RuntimeException if the category table has to be loaded and loading fails
     */
    public static IntPredicate getCategory(String cat) {
        if (CATEGORIES == null) {
            build();
        }
        return CATEGORIES.get(cat);
    }
}
// The following stylesheet was used to generate the categories.xml file from the Unicode 6.2.0 database:
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
© 2015 - 2025 Weber Informatics LLC | Privacy Policy