net.sf.saxon.serialize.codenorm.UnicodeDataParserFromXML Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of saxon-he Show documentation
An OSGi bundle for Saxon-HE
There is a newer version: 10.5
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2013 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.serialize.codenorm;

import net.sf.saxon.Configuration;
import net.sf.saxon.lib.ParseOptions;
import net.sf.saxon.lib.Validation;
import net.sf.saxon.om.AxisInfo;
import net.sf.saxon.om.DocumentInfo;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.pattern.NodeKindTest;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.iter.AxisIterator;
import net.sf.saxon.z.IntHashMap;
import net.sf.saxon.z.IntToIntHashMap;
import net.sf.saxon.z.IntToIntMap;

import javax.xml.transform.stream.StreamSource;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import java.util.StringTokenizer;

/**
 * This class reads the data compiled into class UnicodeData, and builds hash tables
 * that can be used by the Unicode normalization routines. This operation is performed
 * once only, the first time normalization is attempted after Saxon is loaded.
 */

class UnicodeDataParserFromXML {

    // This class is never instantiated
    private UnicodeDataParserFromXML(){}

    /**
     * Called exactly once by NormalizerData to build the static data
     */

    static NormalizerData build(Configuration config) throws XPathException {

        InputStream in = Configuration.locateResource("normalizationData.xml", new ArrayList(), new ArrayList());
        if (in == null) {
            throw new XPathException("Unable to read normalizationData.xml file");
        }

        BitSet isExcluded = new BitSet(128000);
        BitSet isCompatibility = new BitSet(128000);

        ParseOptions options = new ParseOptions();
        options.setSchemaValidationMode(Validation.SKIP);
        DocumentInfo doc = config.buildDocument(new StreamSource(in, "normalizationData.xml"), options);
        NodeInfo canonicalClassKeys = null;
        NodeInfo canonicalClassValues = null;
        NodeInfo decompositionKeys = null;
        NodeInfo decompositionValues = null;

        AxisIterator iter = doc.iterateAxis(AxisInfo.DESCENDANT, NodeKindTest.ELEMENT);
        while (true) {
            NodeInfo item = (NodeInfo)iter.next();
            if (item == null) {
                break;
            }
            if (item.getLocalPart().equals("CanonicalClassKeys")) {
                canonicalClassKeys = item;
            } else if (item.getLocalPart().equals("CanonicalClassValues")) {
                canonicalClassValues = item;
            } else if (item.getLocalPart().equals("DecompositionKeys")) {
                decompositionKeys = item;
            } else if (item.getLocalPart().equals("DecompositionValues")) {
                decompositionValues = item;
            } else if (item.getLocalPart().equals("ExclusionList")) {
                readExclusionList(item.getStringValue(), isExcluded);
            } else if (item.getLocalPart().equals("CompatibilityList")) {
                readCompatibilityList(item.getStringValue(), isCompatibility);
            }
        }

        IntToIntMap canonicalClass = new IntToIntHashMap(400);
        canonicalClass.setDefaultValue(0);
        readCanonicalClassTable(canonicalClassKeys.getStringValue(), canonicalClassValues.getStringValue(), canonicalClass);


        IntHashMap decompose = new IntHashMap(18000);
        IntToIntMap compose = new IntToIntHashMap(15000);
        compose.setDefaultValue(NormalizerData.NOT_COMPOSITE);

        readDecompositionTable(decompositionKeys.getStringValue(), decompositionValues.getStringValue(),
                decompose, compose, isExcluded, isCompatibility);

        return new NormalizerData(canonicalClass, decompose, compose,
              isCompatibility, isExcluded);
    }

    /**
     * Reads exclusion list and stores the data
     */

    private static void readExclusionList(String s, BitSet isExcluded) {
        StringTokenizer st = new StringTokenizer(s);
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            int value = Integer.parseInt(tok, 32);
            isExcluded.set(value);
        }
    }

    /**
     * Reads compatibility list and stores the data
     */

    private static void readCompatibilityList(String s, BitSet isCompatible) {
        StringTokenizer st = new StringTokenizer(s);
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            int value = Integer.parseInt(tok, 32);
            isCompatible.set(value);
        }
    }

    /**
     * Read canonical class table (mapping from character codes to their canonical class)
     */

    private static void readCanonicalClassTable(String keyString, String valueString, IntToIntMap canonicalClasses) {
        ArrayList keys = new ArrayList(5000);

        StringTokenizer st = new StringTokenizer(keyString);
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            int value = Integer.parseInt(tok, 32);
            keys.add(Integer.valueOf(value));
        }

        int k = 0;
        st = new StringTokenizer(valueString);
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            int clss;
            int repeat = 1;
            int star = tok.indexOf('*');
            if (star < 0) {
                clss = Integer.parseInt(tok, 32);
            } else {
                repeat = Integer.parseInt(tok.substring(0, star));
                clss = Integer.parseInt(tok.substring(star+1), 32);
            }
            for (int i=0; i values = new ArrayList(1000);
        StringTokenizer st = new StringTokenizer(decompositionValuesString);
        while (st.hasMoreTokens()) {
            String tok = st.nextToken();
            String value = "";
            for (int c=0; c 1) {
                    first = second;
                    second = value.charAt(1);
                }

                // store composition pair in single integer

                int pair = (first << 16) | second;
                compose.put(pair, key);
            }
        }

        // Add algorithmic Hangul decompositions
        // This fragment code is copied from the normalization code published by Unicode consortium.
        // See module net.sf.saxon.serialize.codenorm.Normalizer for applicable copyright information.

        for (int SIndex = 0; SIndex < SCount; ++SIndex) {
            int TIndex = SIndex % TCount;
            char first, second;
            if (TIndex != 0) { // triple
                first = (char)(SBase + SIndex - TIndex);
                second = (char)(TBase + TIndex);
            } else {
                first = (char)(LBase + SIndex / NCount);
                second = (char)(VBase + (SIndex % NCount) / TCount);
            }
            int pair = (first << 16) | second;
            int key = SIndex + SBase;
            decompose.put(key, String.valueOf(first) + second);
            compose.put(pair, key);
        }
    }

    /**
     * Hangul composition constants
     */
    private static final int
        SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
        LCount = 19, VCount = 21, TCount = 28,
        NCount = VCount * TCount,   // 588
        SCount = LCount * NCount;   // 11172

    // end of Unicode consortium code

}

// * Copyright (c) 1991-2005 Unicode, Inc.
// * For terms of use, see http://www.unicode.org/terms_of_use.html
// * For documentation, see UAX#15.

// * The Unicode Consortium makes no expressed or implied warranty of any
// * kind, and assumes no liability for errors or omissions.
// * No liability is assumed for incidental and consequential damages
// * in connection with or arising out of the use of the information here.