All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.regex.Categories Maven / Gradle / Ivy

There is a newer version: 12.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.regex;

import net.sf.saxon.Configuration;
import net.sf.saxon.lib.ParseOptions;
import net.sf.saxon.lib.Validation;
import net.sf.saxon.om.AxisInfo;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.pattern.NameTest;
import net.sf.saxon.pattern.NodeKindTest;
import net.sf.saxon.serialize.charcode.XMLCharacterData;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.iter.AxisIterator;
import net.sf.saxon.tree.util.Navigator;
import net.sf.saxon.type.Type;
import net.sf.saxon.z.*;

import javax.xml.transform.stream.StreamSource;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Data for Regular expression character categories. The data is in an XML file derived from the Unicode
 * database (In Saxon 9.6, this is based on Unicode 6.2.0). Since Saxon 9.4,
 * we no longer make use of Java's support for character categories since there are too many differences
 * from Unicode.
 */
public class Categories {


    private static HashMap CATEGORIES = null;

    static void build() {

        CATEGORIES = new HashMap(30);

        InputStream in = Configuration.locateResource("categories.xml", new ArrayList(), new ArrayList());
        if (in == null) {
            throw new RuntimeException("Unable to read categories.xml file");
        }

        Configuration config = new Configuration();
        ParseOptions options = new ParseOptions();
        options.setSchemaValidationMode(Validation.SKIP);
        NodeInfo doc;
        try {
            doc = config.buildDocumentTree(new StreamSource(in, "categories.xml"), options).getRootNode();
        } catch (XPathException e) {
            throw new RuntimeException("Failed to build categories.xml", e);
        }

        AxisIterator iter = doc.iterateAxis(AxisInfo.DESCENDANT, new NameTest(Type.ELEMENT, "", "cat", config.getNamePool()));
        while (true) {
            NodeInfo item = iter.next();
            if (item == null) {
                break;
            }
            String cat = Navigator.getAttributeValue(item, "", "name");
            IntRangeSet irs = new IntRangeSet();
            AxisIterator iter2 = item.iterateAxis(AxisInfo.CHILD, NodeKindTest.ELEMENT);
            while (true) {
                NodeInfo r = iter2.next();
                if (r == null) {
                    break;
                }
                String from = Navigator.getAttributeValue(r, "", "f");
                String to = Navigator.getAttributeValue(r, "", "t");
                irs.addRange(Integer.parseInt(from, 16), Integer.parseInt(to, 16));
            }

            CATEGORIES.put(cat, new IntSetPredicate(irs));
        }

        String c = "CLMNPSZ";
        for (int i = 0; i < c.length(); i++) {
            char ch = c.charAt(i);
            IntPredicate ip = null;
            for (Map.Entry entry : CATEGORIES.entrySet()) {
                if (entry.getKey().charAt(0) == ch) {
                    ip = (ip == null ? entry.getValue() : new IntUnionPredicate(ip, entry.getValue()));
                }
            }
            CATEGORIES.put(ch + "", ip);
        }
    }

    public final static IntPredicate ESCAPE_s =
            new IntSetPredicate(IntArraySet.make(new int[]{9, 10, 13, 32}, 4));

    public final static IntPredicate ESCAPE_S = new IntComplementPredicate(ESCAPE_s);

    public final static IntPredicate ESCAPE_i = new IntPredicate() {
        public boolean matches(int value) {
            return XMLCharacterData.isNCNameStart11(value) || value == ':';
        }
    };

    public final static IntPredicate ESCAPE_I = new IntPredicate() {
        public boolean matches(int value) {
            return !(XMLCharacterData.isNCNameStart11(value) || value == ':');
        }
    };

    public final static IntPredicate ESCAPE_c = new IntPredicate() {
        public boolean matches(int value) {
            return XMLCharacterData.isNCName11(value) || value == ':';
        }
    };

    public final static IntPredicate ESCAPE_C = new IntPredicate() {
        public boolean matches(int value) {
            return !(XMLCharacterData.isNCName11(value) || value == ':');
        }
    };

    public final static IntPredicate ESCAPE_d = getCategory("Nd");

    public final static IntPredicate ESCAPE_D = new IntComplementPredicate(ESCAPE_d);

    static IntPredicate CATEGORY_P = getCategory("P");
    static IntPredicate CATEGORY_Z = getCategory("Z");
    static IntPredicate CATEGORY_C = getCategory("C");

    public final static IntPredicate ESCAPE_w = new IntPredicate() {
        public boolean matches(int value) {
            return !(CATEGORY_P.matches(value) || CATEGORY_Z.matches(value) || CATEGORY_C.matches(value));
        }
    };

    public final static IntPredicate ESCAPE_W = new IntComplementPredicate(ESCAPE_w);

    /**
     * Get a predicate to test characters for membership of one of the Unicode
     * character categories
     *
     * @param cat a one-character or two-character category name, for example L or Lu
     * @return a predicate that tests whether a given character belongs to the category
     */

    public static IntPredicate getCategory(String cat) {
        if (CATEGORIES == null) {
            build();
        }
        return CATEGORIES.get(cat);
    }


}

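// The following stylesheet was used to generate the categories.xml file from the Unicode 6.2.0 database.
// NOTE: the stylesheet's XML markup is missing from this copy of the file; only the empty
// comment lines that once held it remain below.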

//
//
//
//    
//
//    
//
//    
//
//    
//    
//
//    
//      
//        
//        
//            
//
//            
//              
//              
//
//              
//                
//                  
//                
//
//                
//                  
//                
//              
//
//              
//                
//              
//
//            
//        
//      
//    
//
//
//    
//      
//      
//      
//        
//          
//        
//        
//          
//          
//        
//      
//    
//
//    
//      
//      
//        
//          
//        
//        
//          
//          
//        
//      
//    
//
//




© 2015 - 2025 Weber Informatics LLC | Privacy Policy