All downloads are free. Search and download functionality uses the official Maven repository.

net.sf.saxon.regex.charclass.Categories Maven / Gradle / Ivy

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2023 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.regex.charclass;

import net.sf.saxon.Configuration;
import net.sf.saxon.Version;
import net.sf.saxon.event.Builder;
import net.sf.saxon.lib.ParseOptions;
import net.sf.saxon.lib.Validation;
import net.sf.saxon.om.AxisInfo;
import net.sf.saxon.om.NamespaceUri;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.pattern.NameTest;
import net.sf.saxon.pattern.NodeKindTest;
import net.sf.saxon.serialize.charcode.XMLCharacterData;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.iter.AxisIterator;
import net.sf.saxon.tree.tiny.TinyElementImpl;
import net.sf.saxon.type.Type;
import net.sf.saxon.z.*;

import javax.xml.transform.stream.StreamSource;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Singleton class holding data for Regular expression character categories.
 * 

The data is read from an XML file derived from the Unicode * database (In Saxon 9.6, this is based on Unicode 6.2.0). Since Saxon 9.4, * we no longer make use of Java's support for character categories since there are too many differences * from Unicode.

*

Some commonly used categories are hard-coded and made available as static constants.

*/ public class Categories { /** * A Category is a CharacterClass represented in a regular expression as \p{Xx}. * The label Xx is retained, and can be used to determine whether or not two * categories are disjoint. */ public static class Category implements CharacterClass { private final String label; private final IntPredicateProxy predicate; public Category(String label, IntPredicateProxy predicate) { this.label = label; this.predicate = predicate; } @Override public boolean test(int value) { return predicate.test(value); } @Override public boolean isDisjoint(CharacterClass other) { if (other instanceof Category) { char majorCat0 = label.charAt(0); String otherLabel = ((Category)other).label; char majorCat1 = otherLabel.charAt(0); return majorCat0 != majorCat1 || (label.length() > 1 && otherLabel.length() > 1 && !label.equals(otherLabel)); } else if (other instanceof InverseCharacterClass) { return other.isDisjoint(this); } else if (other instanceof SingletonCharacterClass) { return !test(((SingletonCharacterClass)other).getCodepoint()); } else if (other instanceof IntSetCharacterClass) { IntSet intSet = other.getIntSet(); if (intSet.size() > 100) { // too expensive to test, and increasingly likely to be non-disjoint anyway return false; } IntIterator ii = intSet.iterator(); while (ii.hasNext()) { if (test(ii.next())) { return false; } } return true; } else { return false; } } @Override public IntSet getIntSet() { return extent(predicate); } private static IntSet extent(IntPredicateProxy predicate) { if (predicate instanceof IntSetPredicate) { return ((IntSetPredicate) predicate).getIntSet(); } return null; } } private final HashMap CATEGORIES = new HashMap<>(30); private Categories() { build(); } private static class Holder { // See https://en.wikipedia.org/wiki/Initialization-on-demand_holder_idiom // The idea here is that the initialization occurs the first time getInstance() is called, // and it is automatically synchronized by virtue of the Java class 
loading rules. public static final Categories INSTANCE = new Categories(); } private static Categories getInstance() { return Holder.INSTANCE; } private void build() { InputStream in = Version.platform.locateResource("categories.xml", new ArrayList<>()); if (in == null) { throw new RuntimeException("Unable to read categories.xml file"); } Configuration config = new Configuration(); ParseOptions options = new ParseOptions() .withSchemaValidationMode(Validation.SKIP) .withDTDValidationMode(Validation.SKIP) .withTreeModel(Builder.TINY_TREE) .withPleaseCloseAfterUse(true); NodeInfo doc; try { doc = config.buildDocumentTree(new StreamSource(in, "categories.xml"), options).getRootNode(); } catch (XPathException e) { throw new RuntimeException("Failed to build categories.xml", e); } int fp_name = config.getNamePool().allocateFingerprint(NamespaceUri.NULL, "name"); int fp_f = config.getNamePool().allocateFingerprint(NamespaceUri.NULL, "f"); int fp_t = config.getNamePool().allocateFingerprint(NamespaceUri.NULL, "t"); AxisIterator iter = doc.iterateAxis(AxisInfo.DESCENDANT, new NameTest(Type.ELEMENT, NamespaceUri.NULL, "cat", config.getNamePool())); for (NodeInfo item; (item = iter.next()) != null; ) { String cat = ((TinyElementImpl)item).getAttributeValue(fp_name); IntRangeSet irs = new IntRangeSet(); for (NodeInfo r : item.children(NodeKindTest.ELEMENT)) { String from = ((TinyElementImpl)r).getAttributeValue(fp_f); String to = ((TinyElementImpl) r).getAttributeValue(fp_t); irs.addRange(Integer.parseInt(from, 16), Integer.parseInt(to, 16)); } CATEGORIES.put(cat, new Category(cat, new IntSetPredicate(irs))); } String c = "CLMNPSZ"; for (int i = 0; i < c.length(); i++) { char ch = c.charAt(i); IntPredicateProxy ip = null; for (Map.Entry entry : CATEGORIES.entrySet()) { if (entry.getKey().charAt(0) == ch) { ip = ip == null ? 
entry.getValue() : IntUnionPredicate.makeUnion(ip, entry.getValue()); } } String label = ch + ""; CATEGORIES.put(label, new Category(label, ip)); } } public final static CharacterClass ESCAPE_s = new IntSetCharacterClass(IntArraySet.make(new int[]{9, 10, 13, 32}, 4)); public final static CharacterClass ESCAPE_S = new InverseCharacterClass(ESCAPE_s); public final static PredicateCharacterClass ESCAPE_i = new PredicateCharacterClass(value -> XMLCharacterData.isNCNameStart11(value) || value == ':'); public final static CharacterClass ESCAPE_I = new InverseCharacterClass(ESCAPE_i); public final static PredicateCharacterClass ESCAPE_c = new PredicateCharacterClass(value -> XMLCharacterData.isNCName11(value) || value == ':'); public final static CharacterClass ESCAPE_C = new InverseCharacterClass(ESCAPE_c); public final static Category ESCAPE_d = getCategory("Nd"); public final static CharacterClass ESCAPE_D = new InverseCharacterClass(ESCAPE_d); static Category CATEGORY_P = getCategory("P"); static Category CATEGORY_Z = getCategory("Z"); static Category CATEGORY_C = getCategory("C"); public final static PredicateCharacterClass ESCAPE_w = new PredicateCharacterClass(value -> !(CATEGORY_P.test(value) || CATEGORY_Z.test(value) || CATEGORY_C.test(value))); public final static CharacterClass ESCAPE_W = new InverseCharacterClass(ESCAPE_w); /** * Get a predicate to test characters for membership of one of the Unicode * character categories * * @param cat a one-character or two-character category name, for example L or Lu * @return a predicate that tests whether a given character belongs to the category */ public synchronized static Category getCategory(String cat) { return getInstance().CATEGORIES.get(cat); } } // The following stylesheet was used to generate the categories.xml file from the Unicode 6.2.0 database: // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // 
// // // // // // // // // // // // // // //




© 2015 - 2025 Weber Informatics LLC | Privacy Policy