All downloads are free. Search and download functionality uses the official Maven repository.

net.sf.saxon.regex.CaseVariants Maven / Gradle / Ivy

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2023 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.regex;

import net.sf.saxon.Configuration;
import net.sf.saxon.Version;
import net.sf.saxon.lib.ParseOptions;
import net.sf.saxon.lib.Validation;
import net.sf.saxon.om.AxisInfo;
import net.sf.saxon.om.NamespaceUri;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.pattern.NameTest;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.iter.AxisIterator;
import net.sf.saxon.type.Type;
import net.sf.saxon.z.IntArraySet;
import net.sf.saxon.z.IntHashMap;
import net.sf.saxon.z.IntToIntHashMap;
import net.sf.saxon.z.IntToIntMap;

import javax.xml.transform.stream.StreamSource;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;

/**
 * This singleton class holds data about the case-variants of Unicode characters. The data is held
 * in the resource file {@code casevariants.xml}, which is read on first use; the file has been automatically
 * generated from the Unicode database.
 */
public class CaseVariants {

    // Use one hashmap for characters with a single case variant, another for characters with multiple
    // case variants, to reduce the number of objects that need to be allocated

    /** Maps a codepoint to its case variant, for codepoints listed with exactly one variant. */
    private final IntToIntMap monoVariants = new IntToIntHashMap(2500);

    /** Maps a codepoint to the array of its case variants, for codepoints listed with several. */
    private final IntHashMap<int[]> polyVariants = new IntHashMap<>(100);

    /**
     * Private constructor: the single instance is created by {@link Holder} and
     * populates both maps from the resource file as part of construction.
     */
    private CaseVariants() {
        build();
    }

    private static class Holder {
        // See https://en.wikipedia.org/wiki/Initialization-on-demand_holder_idiom
        // The idea here is that the initialization occurs the first time getInstance() is called,
        // and it is automatically synchronized by virtue of the Java class loading rules.
        public static final CaseVariants INSTANCE = new CaseVariants();
    }

    /**
     * Get the singleton instance, loading the data on first use.
     *
     * @return the one and only instance of this class
     */
    private static CaseVariants getInstance() {
        return Holder.INSTANCE;
    }

    /**
     * Populate the two maps from {@code casevariants.xml}. Each {@code c} element in the
     * document supplies a codepoint in its {@code n} attribute and a comma-separated list of
     * case variants in its {@code v} attribute; all values are hexadecimal.
     *
     * @throws RuntimeException if the resource cannot be located or cannot be parsed
     */
    private void build() {
        Configuration config = new Configuration();
        NodeInfo doc = loadDocument(config);

        AxisIterator iter = doc.iterateAxis(
                AxisInfo.DESCENDANT,
                new NameTest(Type.ELEMENT, NamespaceUri.NULL, "c", config.getNamePool()));
        NodeInfo item;
        while ((item = iter.next()) != null) {
            String code = item.getAttributeValue("", "n");
            int icode = Integer.parseInt(code, 16);
            String variants = item.getAttributeValue("", "v");
            String[] vhex = variants.split(",");
            int[] vint = new int[vhex.length];
            for (int i = 0; i < vhex.length; i++) {
                vint[i] = Integer.parseInt(vhex[i], 16);
            }
            // A single variant is stored as a plain int to avoid allocating an array per entry
            if (vhex.length == 1) {
                monoVariants.put(icode, vint[0]);
            } else {
                polyVariants.put(icode, vint);
            }
        }
    }

    /**
     * Locate and parse {@code casevariants.xml}, with schema and DTD validation skipped.
     * The resource stream is closed before returning, whether or not parsing succeeded.
     *
     * @param config the configuration used to build the document tree
     * @return the root node of the parsed document
     * @throws RuntimeException if the resource cannot be located or cannot be parsed
     */
    private static NodeInfo loadDocument(Configuration config) {
        InputStream in = Version.platform.locateResource("casevariants.xml", new ArrayList<>());
        if (in == null) {
            throw new RuntimeException("Unable to read casevariants.xml file");
        }
        try {
            ParseOptions options = new ParseOptions()
                    .withSchemaValidationMode(Validation.SKIP)
                    .withDTDValidationMode(Validation.SKIP);
            return config.buildDocumentTree(new StreamSource(in, "casevariants.xml"), options).getRootNode();
        } catch (XPathException e) {
            throw new RuntimeException("Failed to build casevariants.xml", e);
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // Deliberately ignored: a failure to close this read-only stream must not
                // mask a parse failure, nor discard a tree that was built successfully.
            }
        }
    }

    /**
     * Get the case variants of a character
     *
     * @param code the character whose case variants are required
     * @return the case variants of the character, excluding the character itself; an empty
     * array if none are recorded. When the character has several variants the returned array
     * is the one held internally by this class, so callers must not modify it.
     */

    public static int[] getCaseVariants(int code) {
        CaseVariants variants = getInstance();
        IntToIntMap monoVariants = variants.monoVariants;
        int mono = monoVariants.get(code);
        // A result equal to the map's default value is treated as "no single-variant entry"
        if (mono != monoVariants.getDefaultValue()) {
            return new int[]{mono};
        }
        int[] result = variants.polyVariants.get(code);
        return result == null ? IntArraySet.EMPTY_INT_ARRAY : result;
    }

    /**
     * The case variants of roman letters (A-Z, a-z), other than the letters A-Z and a-z themselves.
     * The array is shared: callers must not modify it.
     */

    /*@NotNull*/ public static int[] ROMAN_VARIANTS = {0x0130, 0x0131, 0x212A, 0x017F};

    // The data file casevariants.xml was formed by applying the following query to the XML
    // version of the Unicode database (for Saxon 9.6, the Unicode 6.2.0 version was used)
    //
    // NOTE: the element constructors in the query below are reconstructed from what build() reads
    // (elements named c with attributes n and v); the name of the wrapper element is not
    // significant to the loader.

//    declare namespace u = "http://www.unicode.org/ns/2003/ucd/1.0";
//    <variants>{
//    let $chars := doc('ucd.all.flat.xml')/ * / * /u:char[@suc!='#' or @slc!='#']
//    for $c in $chars
//    let $variants := ($chars[(@cp, @suc[.!='#']) = $c/(@cp, @suc[.!='#'])] |
//                          $chars[(@cp, @slc[.!='#']) = $c/(@cp, @slc[.!='#'])]) except $c
//    return
//         if (count($variants) gt 0) then
//           <c n="{$c/@cp}" v="{string-join($variants/@cp, ',')}"/>
//         else ()
//
//    }</variants>

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy