All downloads are free. Search and download functionality uses the official Maven repository.

net.sf.saxon.regex.charclass.Categories Maven / Gradle / Ivy

There is a newer version: 12.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2023 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.regex.charclass;

import net.sf.saxon.Configuration;
import net.sf.saxon.Version;
import net.sf.saxon.event.Builder;
import net.sf.saxon.lib.ParseOptions;
import net.sf.saxon.lib.Validation;
import net.sf.saxon.om.AxisInfo;
import net.sf.saxon.om.NamespaceUri;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.pattern.NameTest;
import net.sf.saxon.pattern.NodeKindTest;
import net.sf.saxon.serialize.charcode.XMLCharacterData;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.iter.AxisIterator;
import net.sf.saxon.tree.tiny.TinyElementImpl;
import net.sf.saxon.type.Type;
import net.sf.saxon.z.*;

import javax.xml.transform.stream.StreamSource;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Data for Regular expression character categories. The data is in an XML file derived from the Unicode
 * database (In Saxon 9.6, this is based on Unicode 6.2.0). Since Saxon 9.4,
 * we no longer make use of Java's support for character categories since there are too many differences
 * from Unicode.
 */
public class Categories {

    /**
     * A Category is a CharacterClass represented in a regular expression as \p{Xx}.
     * The label Xx is retained, and can be used to determine whether or not two
     * categories are disjoint.
     */
    public static class Category implements CharacterClass {

        private final String label;
        private final IntPredicateProxy predicate;

        /**
         * Create a category.
         *
         * @param label     the category name, for example L or Lu
         * @param predicate the predicate that {@link #test(int)} delegates to
         */
        public Category(String label, IntPredicateProxy predicate) {
            this.label = label;
            this.predicate = predicate;
        }

        /**
         * Test whether a value satisfies the predicate supplied to the constructor.
         *
         * @param value the value to be tested
         * @return the result of applying the predicate to the value
         */
        @Override
        public boolean test(int value) {
            return predicate.test(value);
        }

        /**
         * Decide whether this category can be shown to have no members in common with
         * another character class. A result of false means "not shown to be disjoint";
         * it is also the answer whenever the test is skipped or the other class is of
         * a kind not handled here.
         *
         * @param other the other character class
         * @return true if the two classes are found to be disjoint
         */
        @Override
        public boolean isDisjoint(CharacterClass other) {
            if (other instanceof Category) {
                // Labels are compared, not extents: a different first letter means disjoint,
                // as do two different labels that are both longer than one character.
                char majorCat0 = label.charAt(0);
                String otherLabel = ((Category) other).label;
                char majorCat1 = otherLabel.charAt(0);
                return majorCat0 != majorCat1 ||
                        (label.length() > 1 && otherLabel.length() > 1 && !label.equals(otherLabel));

            } else if (other instanceof InverseCharacterClass) {
                // Let the inverse class make the decision
                return other.isDisjoint(this);
            } else if (other instanceof SingletonCharacterClass) {
                return !test(((SingletonCharacterClass) other).getCodepoint());
            } else if (other instanceof IntSetCharacterClass) {
                IntSet intSet = other.getIntSet();
                if (intSet.size() > 100) {
                    // too expensive to test, and increasingly likely to be non-disjoint anyway
                    return false;
                }
                // Disjoint only if no member of the other set satisfies this category
                IntIterator ii = intSet.iterator();
                while (ii.hasNext()) {
                    if (test(ii.next())) {
                        return false;
                    }
                }
                return true;
            } else {
                return false;
            }
        }

        /**
         * Get the set of integers underlying this category, when one is available.
         *
         * @return the set held by the predicate if the predicate is an
         * {@link IntSetPredicate}; otherwise null
         */
        @Override
        public IntSet getIntSet() {
            return extent(predicate);
        }

        /**
         * Extract the integer set from a predicate.
         *
         * @param predicate the predicate to examine
         * @return the predicate's set if it is an {@link IntSetPredicate}, otherwise null
         */
        private static IntSet extent(IntPredicateProxy predicate) {
            if (predicate instanceof IntSetPredicate) {
                return ((IntSetPredicate) predicate).getIntSet();
            }
            return null;
        }
    }


    /**
     * Map from category name to Category. It is null until {@link #build()} has completed
     * successfully; {@link #getCategory(String)} builds it on first use.
     */
    private static Map<String, Category> CATEGORIES;

    /**
     * Read the resource categories.xml and build the table of categories: one entry
     * for each <code>cat</code> element found in the document, plus one entry for each of the
     * single-letter names C, L, M, N, P, S and Z, formed as the union of the entries whose
     * name starts with that letter.
     *
     * <p>The table is assembled in a local map and stored in the static field only once it
     * is complete, so a failure part-way through leaves the field unchanged and a later
     * call can try again. This method does no locking of its own;
     * {@link #getCategory(String)} calls it from within a synchronized method.</p>
     *
     * @throws RuntimeException if the resource cannot be located or cannot be parsed
     */
    static void build() {

        InputStream in = Version.platform.locateResource("categories.xml", new ArrayList<>());
        if (in == null) {
            throw new RuntimeException("Unable to read categories.xml file");
        }

        Configuration config = new Configuration();
        ParseOptions options = new ParseOptions()
                .withSchemaValidationMode(Validation.SKIP)
                .withDTDValidationMode(Validation.SKIP)
                .withTreeModel(Builder.TINY_TREE)
                .withPleaseCloseAfterUse(true);
        NodeInfo doc;
        try {
            doc = config.buildDocumentTree(new StreamSource(in, "categories.xml"), options).getRootNode();
        } catch (XPathException e) {
            throw new RuntimeException("Failed to build categories.xml", e);
        }

        int fp_name = config.getNamePool().allocateFingerprint(NamespaceUri.NULL, "name");
        int fp_f = config.getNamePool().allocateFingerprint(NamespaceUri.NULL, "f");
        int fp_t = config.getNamePool().allocateFingerprint(NamespaceUri.NULL, "t");

        Map<String, Category> categories = new HashMap<>(30);

        // One category per "cat" element; each child element contributes one range, with
        // its "f" and "t" attributes read as hexadecimal numbers.
        AxisIterator iter = doc.iterateAxis(AxisInfo.DESCENDANT, new NameTest(Type.ELEMENT, NamespaceUri.NULL, "cat", config.getNamePool()));
        for (NodeInfo item; (item = iter.next()) != null; ) {
            String cat = ((TinyElementImpl) item).getAttributeValue(fp_name);
            IntRangeSet irs = new IntRangeSet();
            for (NodeInfo r : item.children(NodeKindTest.ELEMENT)) {
                String from = ((TinyElementImpl) r).getAttributeValue(fp_f);
                String to = ((TinyElementImpl) r).getAttributeValue(fp_t);
                irs.addRange(Integer.parseInt(from, 16), Integer.parseInt(to, 16));
            }
            categories.put(cat, new Category(cat, new IntSetPredicate(irs)));
        }

        // Add the single-letter categories, each the union of all entries whose name
        // begins with that letter.
        String c = "CLMNPSZ";
        for (int i = 0; i < c.length(); i++) {
            char ch = c.charAt(i);
            // ip stays null only if no entry in the map has a name starting with ch
            IntPredicateProxy ip = null;
            for (Map.Entry<String, Category> entry : categories.entrySet()) {
                if (entry.getKey().charAt(0) == ch) {
                    ip = ip == null ? entry.getValue() : IntUnionPredicate.makeUnion(ip, entry.getValue());
                }
            }
            // The map is only modified after the iteration over its entries has finished
            String label = String.valueOf(ch);
            categories.put(label, new Category(label, ip));
        }

        // Publish the table only now that it is complete
        CATEGORIES = categories;
    }

    /** The character class for the escape \s: the code points 9, 10, 13 and 32. */
    public static final CharacterClass ESCAPE_s =
            new IntSetCharacterClass(IntArraySet.make(new int[]{9, 10, 13, 32}, 4));

    /** The inverse of {@link #ESCAPE_s}. */
    public static final CharacterClass ESCAPE_S = new InverseCharacterClass(ESCAPE_s);

    /** Characters accepted by {@code XMLCharacterData.isNCNameStart11}, plus the colon. */
    public static final PredicateCharacterClass ESCAPE_i =
            new PredicateCharacterClass(value -> XMLCharacterData.isNCNameStart11(value) || value == ':');

    /** The inverse of {@link #ESCAPE_i}. */
    public static final CharacterClass ESCAPE_I = new InverseCharacterClass(ESCAPE_i);

    /** Characters accepted by {@code XMLCharacterData.isNCName11}, plus the colon. */
    public static final PredicateCharacterClass ESCAPE_c =
            new PredicateCharacterClass(value -> XMLCharacterData.isNCName11(value) || value == ':');

    /** The inverse of {@link #ESCAPE_c}. */
    public static final CharacterClass ESCAPE_C = new InverseCharacterClass(ESCAPE_c);

    /** The category Nd. */
    public static final Category ESCAPE_d = getCategory("Nd");

    /** The inverse of {@link #ESCAPE_d}. */
    public static final CharacterClass ESCAPE_D = new InverseCharacterClass(ESCAPE_d);

    static Category CATEGORY_P = getCategory("P");
    static Category CATEGORY_Z = getCategory("Z");
    static Category CATEGORY_C = getCategory("C");

    /** Every character that is in none of the categories P, Z and C. */
    public static final PredicateCharacterClass ESCAPE_w =
            new PredicateCharacterClass(value -> !(CATEGORY_P.test(value) || CATEGORY_Z.test(value) || CATEGORY_C.test(value)));

    /** The inverse of {@link #ESCAPE_w}. */
    public static final CharacterClass ESCAPE_W = new InverseCharacterClass(ESCAPE_w);

    /**
     * Get a predicate to test characters for membership of one of the Unicode
     * character categories. The table of categories is built on the first call.
     *
     * @param cat a one-character or two-character category name, for example L or Lu
     * @return a predicate that tests whether a given character belongs to the category,
     * or null if the table has no entry with this name
     */
    public static synchronized Category getCategory(String cat) {
        if (CATEGORIES == null) {
            build();
        }
        return CATEGORIES.get(cat);
    }


}

// The stylesheet below was used to generate the categories.xml file from the Unicode 6.2.0 database.
// NOTE: in this copy the stylesheet markup is missing; only empty comment lines remain.

//
//
//
//    
//
//    
//
//    
//
//    
//    
//
//    
//      
//        
//        
//            
//
//            
//              
//              
//
//              
//                
//                  
//                
//
//                
//                  
//                
//              
//
//              
//                
//              
//
//            
//        
//      
//    
//
//
//    
//      
//      
//      
//        
//          
//        
//        
//          
//          
//        
//      
//    
//
//    
//      
//      
//        
//          
//        
//        
//          
//          
//        
//      
//    
//
//




© 2015 - 2024 Weber Informatics LLC | Privacy Policy