All downloads are free. Search and download functionality uses the official Maven repository.

com.ibm.icu.impl.ImplicitCEGenerator Maven / Gradle / Ivy

Go to download

International Components for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and globalization support.

There is a newer version: 76.1
Show newest version
/**
 *******************************************************************************
 * Copyright (C) 2004-2012, International Business Machines Corporation and         *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.impl;

/**
 * For generation of Implicit CEs.
 *
 * <p>A code point is first mapped to a "raw" integer by {@link #swapCJK(int)} plus one
 * (so that raw value 0 is never used), and the raw integer is then encoded as a
 * 3-byte or 4-byte primary weight packed into an {@code int}. Raw values below
 * {@code min4Boundary} use the 3-byte form; all others use the 4-byte form.
 *
 * <p>Cleaned up so that changes can be made more easily.
 * Old values:
 * <pre>
 * # First Implicit: E26A792D
 * # Last Implicit: E3DC70C0
 * # First CJK: E0030300
 * # Last CJK: E0A9DD00
 * # First CJK_A: E0A9DF00
 * # Last CJK_A: E0DE3100
 * </pre>
 *
 * @author Mark Davis
 * @internal
 */
public class ImplicitCEGenerator {

    /**
     * constants
     */
    static final boolean DEBUG = false;

    static final long topByte = 0xFF000000L;
    static final long bottomByte = 0xFFL;
    static final long fourBytes = 0xFFFFFFFFL;

    /** Largest raw value accepted by {@link #getImplicitFromRaw(int)}. */
    static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2

    // UnicodeData.txt range markers (Unicode 6.1):
    // 4E00;;Lo;0;L;;;;;N;;;;;
    // 9FCC;;Lo;0;L;;;;;N;;;;;
    public static final int CJK_BASE = 0x4E00;
    public static final int CJK_LIMIT = 0x9FCC + 1;

    public static final int CJK_COMPAT_USED_BASE = 0xFA0E;
    public static final int CJK_COMPAT_USED_LIMIT = 0xFA2F + 1;

    // 3400;;Lo;0;L;;;;;N;;;;;
    // 4DB5;;Lo;0;L;;;;;N;;;;;
    public static final int CJK_A_BASE = 0x3400;
    public static final int CJK_A_LIMIT = 0x4DB5 + 1;

    // 20000;;Lo;0;L;;;;;N;;;;;
    // 2A6D6;;Lo;0;L;;;;;N;;;;;
    public static final int CJK_B_BASE = 0x20000;
    public static final int CJK_B_LIMIT = 0x2A6D6 + 1;

    // 2A700;;Lo;0;L;;;;;N;;;;;
    // 2B734;;Lo;0;L;;;;;N;;;;;
    public static final int CJK_C_BASE = 0x2A700;
    public static final int CJK_C_LIMIT = 0x2B734 + 1;

    // 2B740;;Lo;0;L;;;;;N;;;;;
    // 2B81D;;Lo;0;L;;;;;N;;;;;
    public static final int CJK_D_BASE = 0x2B740;
    public static final int CJK_D_LIMIT = 0x2B81D + 1;

    // when adding to this list, look for all occurrences (in project) of CJK_C_BASE and CJK_C_LIMIT, etc.
    // to check for code that needs changing!!!!

    /** Amount added by {@link #swapCJK(int)} to every code point that is not in one of the CJK ranges. */
    static final int NON_CJK_OFFSET = 0x110000;

    /*
     * Precomputed by constructor
     */
    /** Step between consecutive final-byte values of the 3-byte form (gap3 + 1). */
    int final3Multiplier;
    /** Step between consecutive final-byte values of the 4-byte form (gap4 + 1). */
    int final4Multiplier;
    /** Number of distinct final-byte values used by the 3-byte form. */
    int final3Count;
    /** Number of distinct final-byte values used by the 4-byte form. */
    int final4Count;
    /** Number of values a medial byte can take (maxTrail - minTrail + 1). */
    int medialCount;
    /** First lead byte; lead bytes below {@link #min4Primary} carry 3-byte forms. */
    int min3Primary;
    /** First lead byte that carries 4-byte forms. */
    int min4Primary;
    /** Last lead byte. */
    int max4Primary;
    /** Smallest value allowed in a trailing byte. */
    int minTrail;
    /** Largest value allowed in a trailing byte. */
    int maxTrail;
    /** Largest final-byte value produced for the 3-byte form. */
    int max3Trail;
    /** Largest final-byte value produced for the 4-byte form. */
    int max4Trail;
    /** First raw value that is encoded with the 4-byte form. */
    int min4Boundary;

    /**
     * @return the number of unused final-byte values left between consecutive 4-byte weights
     */
    public int getGap4() {
        return final4Multiplier - 1;
    }

    /**
     * @return the number of unused final-byte values left between consecutive 3-byte weights
     */
    public int getGap3() {
        return final3Multiplier - 1;
    }

    // old comment
    // we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values
    // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
    // we shift so that HAN all has the same first primary, for compression.
    // for the 4 byte case, we make the gap as large as we can fit.

    /**
     * Supply parameters for generating implicit CEs, using trail bytes 04..FE,
     * a 3-byte gap of 1 and a single lead byte for the 3-byte form.
     *
     * @param minPrimary The minimum primary (lead byte) value.
     * @param maxPrimary The maximum primary (lead byte) value.
     * @throws IllegalArgumentException if the parameters are rejected by the full constructor
     */
    public ImplicitCEGenerator(int minPrimary, int maxPrimary) {
        this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1);
    }

    /**
     * Set up to generate implicits.
     *
     * @param minPrimary The minimum primary value.
     * @param maxPrimary The maximum primary value.
     * @param minTrail final byte
     * @param maxTrail final byte
     * @param gap3 the gap we leave for tailoring for 3-byte forms
     * @param primaries3count number of 3-byte primarys we can use (normally 1)
     * @throws IllegalArgumentException if a parameter is out of range, or if the resulting
     *         layout cannot represent every raw value up to {@code MAX_INPUT}
     */
    public ImplicitCEGenerator(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
        // some simple parameter checks
        if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
            throw new IllegalArgumentException("bad lead bytes");
        }
        if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) {
            throw new IllegalArgumentException("bad trail bytes");
        }
        // A negative gap makes final3Multiplier zero or negative (division by zero below).
        if (gap3 < 0) {
            throw new IllegalArgumentException("bad three-byte gap");
        }
        // At least one lead byte must remain for the 4-byte form, otherwise
        // primaries4count is <= 0 and the computations below divide by zero
        // or produce a meaningless layout.
        if (primaries3count < 1 || primaries3count > maxPrimary - minPrimary) {
            throw new IllegalArgumentException("bad three-byte primaries");
        }

        this.minTrail = minTrail;
        this.maxTrail = maxTrail;

        min3Primary = minPrimary;
        max4Primary = maxPrimary;
        // compute constants for use later.
        // number of values we can use in trailing bytes
        // leave room for empty values between AND above, e.g. if gap = 2
        // range 3..7 => +3 -4 -5 -6 -7: so 1 value
        // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
        // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
        final3Multiplier = gap3 + 1;
        final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
        max3Trail = minTrail + (final3Count - 1) * final3Multiplier;

        // medials can use full range
        medialCount = (maxTrail - minTrail + 1);
        // find out how many values fit in each form
        int threeByteCount = medialCount * final3Count;
        // now determine where the 3/4 boundary is.
        // we use 3 bytes below the boundary, and 4 above
        int primariesAvailable = maxPrimary - minPrimary + 1;
        int primaries4count = primariesAvailable - primaries3count;

        int min3ByteCoverage = primaries3count * threeByteCount;
        min4Primary = minPrimary + primaries3count;
        min4Boundary = min3ByteCoverage;

        // Now expand out the multiplier for the 4 bytes, and redo.
        int totalNeeded = MAX_INPUT - min4Boundary;
        int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
        if (DEBUG) {
            System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
        }

        int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
        if (DEBUG) {
            System.out.println("neededPerFinalByte: " + neededPerFinalByte);
        }

        int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
        if (DEBUG) {
            System.out.println("expandedGap: " + gap4);
        }
        if (gap4 < 1) {
            throw new IllegalArgumentException("must have larger gap4s");
        }

        final4Multiplier = gap4 + 1;
        final4Count = neededPerFinalByte;
        max4Trail = minTrail + (final4Count - 1) * final4Multiplier;

        if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) {
            throw new IllegalArgumentException("internal error");
        }
        if (DEBUG) {
            System.out.println("final4Count: " + final4Count);
            for (int counter = 0; counter < final4Count; ++counter) {
                // Same formula getImplicitFromRaw uses for the 4-byte final byte.
                int value = minTrail + counter * final4Multiplier;
                System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
            }
        }
    }

    /**
     * Integer division rounding up, computed as {@code 1 + (a - 1) / b}.
     * Intended for {@code a >= 1} and {@code b >= 1}; other inputs do not
     * produce a true ceiling.
     *
     * @param a dividend
     * @param b divisor
     * @return {@code 1 + (a - 1) / b}
     * @throws ArithmeticException if {@code b} is zero
     */
    public static int divideAndRoundUp(int a, int b) {
        return 1 + (a - 1) / b;
    }

    /**
     * Converts implicit CE into raw integer
     *
     * @param implicit The implicit value passed.
     * @return -1 if illegal format
     */
    public int getRawFromImplicit(int implicit) {
        // Split the weight into its four bytes, most significant first.
        int b0 = (implicit >>> 24) & 0xFF;
        int b1 = (implicit >>> 16) & 0xFF;
        int b2 = (implicit >>> 8) & 0xFF;
        int b3 = implicit & 0xFF;

        // simple parameter checks
        if (b0 < min3Primary || b0 > max4Primary
                || b1 < minTrail || b1 > maxTrail) {
            return -1;
        }
        // normal offsets
        b1 -= minTrail;

        // take care of the final values, and compose
        int result;
        if (b0 < min4Primary) {
            // 3-byte form: lead, medial, final; the fourth byte must be empty.
            if (b2 < minTrail || b2 > max3Trail || b3 != 0) {
                return -1;
            }
            b2 -= minTrail;
            // Only multiples of the multiplier are ever generated.
            if (b2 % final3Multiplier != 0) {
                return -1;
            }
            b0 -= min3Primary;
            b2 /= final3Multiplier;
            result = ((b0 * medialCount) + b1) * final3Count + b2;
        } else {
            // 4-byte form: lead, two medials, final.
            if (b2 < minTrail || b2 > maxTrail
                    || b3 < minTrail || b3 > max4Trail) {
                return -1;
            }
            b2 -= minTrail;
            b3 -= minTrail;
            if (b3 % final4Multiplier != 0) {
                return -1;
            }
            b3 /= final4Multiplier;
            b0 -= min4Primary;
            result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
        }
        // final check
        if (result < 0 || result > MAX_INPUT) {
            return -1;
        }
        return result;
    }

    /**
     * Generate the implicit CE, from raw integer.
     * Left shifted to put the first byte at the top of an int.
     *
     * @param cp raw value (see {@link #getRawFromCodePoint(int)}), 0..MAX_INPUT
     * @return Primary implicit weight
     * @throws IllegalArgumentException if {@code cp} is outside 0..MAX_INPUT or
     *         cannot be encoded with the available lead bytes
     */
    public int getImplicitFromRaw(int cp) {
        if (cp < 0 || cp > MAX_INPUT) {
            throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
        }
        int last0 = cp - min4Boundary;
        if (last0 < 0) {
            // 3-byte form: decompose cp in mixed radix (final3Count, medialCount).
            int last1 = cp / final3Count;
            last0 = cp % final3Count;

            int last2 = last1 / medialCount;
            last1 %= medialCount;

            last0 = minTrail + last0 * final3Multiplier; // spread out by the multiplier
            last1 = minTrail + last1; // offset
            last2 = min3Primary + last2; // offset

            if (last2 >= min4Primary) {
                throw new IllegalArgumentException("3-byte out of range: " +
                                                   Utility.hex(cp) + ", " + Utility.hex(last2));
            }

            return (last2 << 24) + (last1 << 16) + (last0 << 8);
        } else {
            // 4-byte form: decompose the offset above the boundary in mixed radix
            // (final4Count, medialCount, medialCount).
            int last1 = last0 / final4Count;
            last0 %= final4Count;

            int last2 = last1 / medialCount;
            last1 %= medialCount;

            int last3 = last2 / medialCount;
            last2 %= medialCount;

            last0 = minTrail + last0 * final4Multiplier; // spread out by the multiplier
            last1 = minTrail + last1; // offset
            last2 = minTrail + last2; // offset
            last3 = min4Primary + last3; // offset

            if (last3 > max4Primary) {
                throw new IllegalArgumentException("4-byte out of range: " +
                                                   Utility.hex(cp) + ", " + Utility.hex(last3));
            }

            return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
        }
    }

    /**
     * Gets an Implicit from a code point. Internally,
     * swaps (which, after adding 1, produces a raw value 1..220000 for
     * code points 0..10FFFF), then converts raw to implicit.
     *
     * @param cp The code point to convert to implicit.
     * @return Primary implicit weight
     * @throws IllegalArgumentException if the resulting raw value is rejected by
     *         {@link #getImplicitFromRaw(int)}
     */
    public int getImplicitFromCodePoint(int cp) {
        if (DEBUG) {
            System.out.println("Incoming: " + Utility.hex(cp));
        }

        // Produce Raw value
        // note, we add 1 so that the first value is always empty!!
        cp = ImplicitCEGenerator.swapCJK(cp) + 1;

        if (DEBUG) {
            System.out.println("CJK swapped: " + Utility.hex(cp));
        }

        return getImplicitFromRaw(cp);
    }

    /**
     * Function used to:
     * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
     * b) bump any non-CJK characters by 110000 (hex).
     * The relevant blocks are:
     * A:    4E00..9FFF; CJK Unified Ideographs
     *       F900..FAFF; CJK Compatibility Ideographs
     * B:    3400..4DBF; CJK Unified Ideographs Extension A
     *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
     * As long as
     *   no new B characters are allocated between 4E00 and FAFF, and
     *   no new A characters are outside of this range,
     * (very high probability) this simple code will work.
     * The reordered blocks are:
     * Block1 is CJK
     * Block2 is CJK_COMPAT_USED
     * Block3 is CJK_A
     * (all contiguous)
     * Any other CJK (the CJK_B, CJK_C and CJK_D ranges) gets its normal code point
     * Any non-CJK gets +110000 (hex)
     * When we reorder Block1, we make sure that it is at the very start,
     * so that it will use a 3-byte form.
     * Warning: we only pick up the compatibility characters that are
     * NOT decomposed, so that block is smaller!
     *
     * @param i code point
     * @return the swapped value (the raw value minus one)
     */
    public static int swapCJK(int i) {

        if (i >= CJK_BASE) {
            if (i < CJK_LIMIT) {
                return i - CJK_BASE;
            }
            if (i < CJK_COMPAT_USED_BASE) {
                return i + NON_CJK_OFFSET;
            }
            if (i < CJK_COMPAT_USED_LIMIT) {
                return i - CJK_COMPAT_USED_BASE
                        + (CJK_LIMIT - CJK_BASE);
            }
            if (i < CJK_B_BASE) {
                return i + NON_CJK_OFFSET;
            }
            if (i < CJK_B_LIMIT) {
                return i; // non-BMP-CJK
            }
            if (i < CJK_C_BASE) {
                return i + NON_CJK_OFFSET;
            }
            if (i < CJK_C_LIMIT) {
                return i; // non-BMP-CJK
            }
            if (i < CJK_D_BASE) {
                return i + NON_CJK_OFFSET;
            }
            if (i < CJK_D_LIMIT) {
                return i; // non-BMP-CJK
            }
            return i + NON_CJK_OFFSET;  // non-CJK
        }
        if (i < CJK_A_BASE) {
            return i + NON_CJK_OFFSET;
        }
        if (i < CJK_A_LIMIT) {
            return i - CJK_A_BASE
                    + (CJK_LIMIT - CJK_BASE)
                    + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
        }
        return i + NON_CJK_OFFSET; // non-CJK
    }

    /**
     * @return Minimal trail value
     */
    public int getMinTrail() {
        return minTrail;
    }

    /**
     * @return Maximal trail value
     */
    public int getMaxTrail() {
        return maxTrail;
    }

    /**
     * Inverse of {@link #getRawFromCodePoint(int)}: converts a raw value back
     * to the code point it was produced from.
     *
     * @param i raw value
     * @return the code point, or -1 if {@code i} is the reserved value 0, is negative,
     *         or lies between the compacted CJK blocks and {@code CJK_B_BASE}
     */
    public int getCodePointFromRaw(int i) {
        i--; // undo the +1 applied when the raw value was produced

        if (i >= NON_CJK_OFFSET) {
            return i - NON_CJK_OFFSET;
        }
        if (i >= CJK_B_BASE) {
            return i;
        }

        // rest of CJKs, compacted: [CJK][CJK_COMPAT_USED][CJK_A], starting at 0.
        final int cjkCount = CJK_LIMIT - CJK_BASE;
        final int compatCount = CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE;
        final int cjkACount = CJK_A_LIMIT - CJK_A_BASE;

        // The compacted region ends after the CJK_A block, so its size (not its
        // limit code point) bounds the valid range; raw 0 (i == -1) is never assigned.
        if (i < 0 || i >= cjkCount + compatCount + cjkACount) {
            return -1;
        }
        if (i < cjkCount) {
            return i + CJK_BASE;
        }
        if (i < cjkCount + compatCount) {
            return i + CJK_COMPAT_USED_BASE - cjkCount;
        }
        return i + CJK_A_BASE - cjkCount - compatCount;
    }

    /**
     * Converts a code point to its raw value: {@link #swapCJK(int)} plus one.
     *
     * @param i code point
     * @return the raw value
     */
    public int getRawFromCodePoint(int i) {
        return swapCJK(i) + 1;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy