All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.serialize.codenorm.NormalizerData Maven / Gradle / Ivy

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.serialize.codenorm;

import net.sf.saxon.tree.util.FastStringBuffer;
import net.sf.saxon.z.IntHashMap;
import net.sf.saxon.z.IntToIntMap;

import java.util.BitSet;

/**
 * Accesses the Normalization Data used for Forms C and D.
 * 

Copyright (c) 1998-1999 Unicode, Inc. All Rights Reserved.
* The Unicode Consortium makes no expressed or implied warranty of any * kind, and assumes no liability for errors or omissions. * No liability is assumed for incidental and consequential damages * in connection with or arising out of the use of the information here.

* * @author Mark Davis */ public class NormalizerData { static final String copyright = "Copyright (c) 1998-1999 Unicode, Inc."; /** * Constant for use in getPairwiseComposition */ public static final int NOT_COMPOSITE = '\uFFFF'; /** * Gets the combining class of a character from the * Unicode Character Database. * * @param ch the source character * @return value from 0 to 255 */ public int getCanonicalClass(int ch) { return canonicalClass.get(ch); } /** * Returns the composite of the two characters. If the two * characters don't combine, returns NOT_COMPOSITE. * Only has to worry about BMP characters, since those are the only ones that can ever compose. * * @param first first character (e.g. 'c') * @param second second character (e.g. '�' cedilla) * @return composite (e.g. '�') */ public char getPairwiseComposition(int first, int second) { if (first < 0 || first > 0x10FFFF || second < 0 || second > 0x10FFFF) return NOT_COMPOSITE; return (char) compose.get((first << 16) | second); } /** * Gets recursive decomposition of a character from the * Unicode Character Database. * * @param canonical If true * bit is on in this byte, then selects the recursive * canonical decomposition, otherwise selects * the recursive compatibility and canonical decomposition. * @param ch the source character * @param buffer buffer to be filled with the decomposition */ public void getRecursiveDecomposition(boolean canonical, int ch, FastStringBuffer buffer) { String decomp = (String) decompose.get(ch); if (decomp != null && !(canonical && isCompatibility.get(ch))) { for (int i = 0; i < decomp.length(); ++i) { getRecursiveDecomposition(canonical, decomp.charAt(i), buffer); } } else { // if no decomp, append buffer.appendWideChar(ch); } } // ================================================= // PRIVATES // ================================================= /** * Only accessed by NormalizerBuilder. */ NormalizerData(IntToIntMap canonicalClass, IntHashMap decompose, IntToIntMap compose, BitSet isCompatibility, BitSet isExcluded) { this.canonicalClass = canonicalClass; this.decompose = decompose; this.compose = compose; this.isCompatibility = isCompatibility; this.isExcluded = isExcluded; } /** * Just accessible for testing. */ boolean getExcluded(char ch) { return isExcluded.get(ch); } /** * Just accessible for testing. */ /*@NotNull*/ String getRawDecompositionMapping(char ch) { return (String) decompose.get(ch); } /** * For now, just use IntHashtable * Two-stage tables would be used in an optimized implementation. */ private IntToIntMap canonicalClass; /** * The main data table maps chars to a 32-bit int. * It holds either a pair: top = first, bottom = second * or singleton: top = 0, bottom = single. * If there is no decomposition, the value is 0. * Two-stage tables would be used in an optimized implementation. * An optimization could also map chars to a small index, then use that * index in a small array of ints. */ private IntHashMap decompose; /** * Maps from pairs of characters to single. * If there is no decomposition, the value is NOT_COMPOSITE. */ private IntToIntMap compose; /** * Tells whether decomposition is canonical or not. */ private BitSet isCompatibility; /** * Tells whether character is script-excluded or not. * Used only while building, and for testing. */ private BitSet isExcluded; } // * The class is derived from the sample program NormalizerData.java published by the // * Unicode consortium. That code has been modified so that instead of building the run-time // * data structures directly, they are written to a Java "source" module, which is then // * compiled. Also, the ability to construct a condensed version of the data tables has been // * removed.




© 2015 - 2025 Weber Informatics LLC | Privacy Policy