All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.impl.Normalizer2Impl Maven / Gradle / Ivy

There is a newer version: 2.12.15
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 *   Copyright (C) 2009-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.impl;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Iterator;

import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.VersionInfo;

public final class Normalizer2Impl {
    public static final class Hangul {
        /* Korean Hangul and Jamo constants */
        public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
        public static final int JAMO_L_END=0x1112;
        public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
        public static final int JAMO_V_END=0x1175;
        public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
        public static final int JAMO_T_END=0x11c2;

        public static final int HANGUL_BASE=0xac00;
        public static final int HANGUL_END=0xd7a3;

        public static final int JAMO_L_COUNT=19;
        public static final int JAMO_V_COUNT=21;
        public static final int JAMO_T_COUNT=28;

        public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
        public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;

        public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;

        public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
        public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;

        public static boolean isHangul(int c) {
            return HANGUL_BASE<=c && c
     * If dest is a StringBuilder, then the buffer writes directly to it.
     * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
     * until no further changes are necessary and whole segments are appended.
     * append() methods that take combining-class values always write to the StringBuilder.
     * Other append() methods flush and append to the Appendable.
     */
    public static final class ReorderingBuffer implements Appendable {
        public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
            impl=ni;
            app=dest;
            if(app instanceof StringBuilder) {
                appIsStringBuilder=true;
                str=(StringBuilder)dest;
                // In Java, the constructor subsumes public void init(int destCapacity) {
                str.ensureCapacity(destCapacity);
                reorderStart=0;
                if(str.length()==0) {
                    lastCC=0;
                } else {
                    setIterator();
                    lastCC=previousCC();
                    // Set reorderStart after the last code point with cc<=1 if there is one.
                    if(lastCC>1) {
                        while(previousCC()>1) {}
                    }
                    reorderStart=codePointLimit;
                }
            } else {
                appIsStringBuilder=false;
                str=new StringBuilder();
                reorderStart=0;
                lastCC=0;
            }
        }

        public boolean isEmpty() { return str.length()==0; }
        public int length() { return str.length(); }
        public int getLastCC() { return lastCC; }

        public StringBuilder getStringBuilder() { return str; }

        public boolean equals(CharSequence s, int start, int limit) {
            return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
        }

        // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
        public void setLastChar(char c) {
            str.setCharAt(str.length()-1, c);
        }

        public void append(int c, int cc) {
            if(lastCC<=cc || cc==0) {
                str.appendCodePoint(c);
                lastCC=cc;
                if(cc<=1) {
                    reorderStart=str.length();
                }
            } else {
                insert(c, cc);
            }
        }
        // s must be in NFD, otherwise change the implementation.
        public void append(CharSequence s, int start, int limit,
                           int leadCC, int trailCC) {
            if(start==limit) {
                return;
            }
            if(lastCC<=leadCC || leadCC==0) {
                if(trailCC<=1) {
                    reorderStart=str.length()+(limit-start);
                } else if(leadCC<=1) {
                    reorderStart=str.length()+1;  // Ok if not a code point boundary.
                }
                str.append(s, start, limit);
                lastCC=trailCC;
            } else {
                int c=Character.codePointAt(s, start);
                start+=Character.charCount(c);
                insert(c, leadCC);  // insert first code point
                while(startcc;) {}
            // insert c at codePointLimit, after the character with prevCC<=cc
            if(c<=0xffff) {
                str.insert(codePointLimit, (char)c);
                if(cc<=1) {
                    reorderStart=codePointLimit+1;
                }
            } else {
                str.insert(codePointLimit, Character.toChars(c));
                if(cc<=1) {
                    reorderStart=codePointLimit+2;
                }
            }
        }

        private final Normalizer2Impl impl;
        private final Appendable app;
        private final StringBuilder str;
        private final boolean appIsStringBuilder;
        private int reorderStart;
        private int lastCC;

        // private backward iterator
        private void setIterator() { codePointStart=str.length(); }
        private void skipPrevious() {  // Requires 0=codePointStart) {
                return 0;
            }
            int c=str.codePointBefore(codePointStart);
            codePointStart-=Character.charCount(c);
            if(c(nextOffset-offset)) {
                throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
            }
            ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes

            // Read the composition and mapping data.
            offset=nextOffset;
            nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
            int numChars=(nextOffset-offset)/2;
            if(numChars!=0) {
                maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
                extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
            }

            // smallFCD: new in formatVersion 2
            offset=nextOffset;
            smallFCD=new byte[0x100];
            bytes.get(smallFCD);

            // Build tccc180[].
            // gennorm2 enforces lccc=0 for c>=1) {
                if((c&0xff)==0) {
                    bits=smallFCD[c>>8];  // one byte per 0x100 code points
                }
                if((bits&1)!=0) {
                    for(int i=0; i<0x20; ++i, ++c) {
                        tccc180[c]=getFCD16FromNormData(c)&0xff;
                    }
                } else {
                    c+=0x20;
                }
            }

            return this;
        } catch(IOException e) {
            throw new ICUUncheckedIOException(e);
        }
    }
    public Normalizer2Impl load(String name) {
        return load(ICUBinary.getRequiredData(name));
    }

    private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) {
        if(isAlgorithmicNoNo(norm16)) {
            // Range of code points with same-norm16-value algorithmic decompositions.
            // They might have different non-zero FCD16 values.
            do {
                int fcd16=getFCD16(start);
                if(fcd16>0xff) { set.add(start); }
            } while(++start<=end);
        } else {
            int fcd16=getFCD16(start);
            if(fcd16>0xff) { set.add(start, end); }
        }
    }

    private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
        /* add the start code point to the USet */
        set.add(start);
        if(start!=end && isAlgorithmicNoNo(value)) {
            // Range of code points with same-norm16-value algorithmic decompositions.
            // They might have different non-zero FCD16 values.
            int prevFCD16=getFCD16(start);
            while(++start<=end) {
                int fcd16=getFCD16(start);
                if(fcd16!=prevFCD16) {
                    set.add(start);
                    prevFCD16=fcd16;
                }
            }
        }
    }

    public void addLcccChars(UnicodeSet set) {
        /* add the start code point of each same-value range of each trie */
        Iterator trieIterator=normTrie.iterator();
        Trie2.Range range;
        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
            enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set);
        }
    }

    public void addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of each trie */
        Iterator trieIterator=normTrie.iterator();
        Trie2.Range range;
        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
            enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set);
        }

        /* add Hangul LV syllables and LV+1 because of skippables */
        for(int c=Hangul.HANGUL_BASE; c trieIterator=canonIterData.iterator(segmentStarterMapper);
        Trie2.Range range;
        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
            /* add the start code point to the USet */
            set.add(range.startCodePoint);
        }
    }
    private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {
        @Override
        public int map(int in) {
            return in&CANON_NOT_SEGMENT_STARTER;
        }
    };

    // low-level properties ------------------------------------------------ ***

    public Trie2_16 getNormTrie() { return normTrie; }

    // Note: Normalizer2Impl.java r30983 (2011-nov-27)
    // still had getFCDTrie() which built and cached an FCD trie.
    // That provided faster access to FCD data than getFCD16FromNormData()
    // but required synchronization and consumed some 10kB of heap memory
    // in any process that uses FCD (e.g., via collation).
    // tccc180[] and smallFCD[] are intended to help with any loss of performance,
    // at least for Latin & CJK.

    /**
     * Builds the canonical-iterator data for this instance.
     * This is required before any of {@link #isCanonSegmentStarter(int)} or
     * {@link #getCanonStartSet(int, UnicodeSet)} are called,
     * or else they crash.
     * @return this
     */
    public synchronized Normalizer2Impl ensureCanonIterData() {
        if(canonIterData==null) {
            Trie2Writable newData=new Trie2Writable(0, 0);
            canonStartSets=new ArrayList();
            Iterator trieIterator=normTrie.iterator();
            Trie2.Range range;
            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
                final int norm16=range.value;
                if(norm16==0 || (minYesNo<=norm16 && norm16=minMaybeYes) {
                        // not a segment starter if it occurs in a decomposition or has cc!=0
                        newValue|=CANON_NOT_SEGMENT_STARTER;
                        if(norm16=minNoNo) {
                                    while((norm16_2+=Character.charCount(c2))=MIN_NORMAL_MAYBE_YES) {
            return norm16&0xff;
        }
        if(norm16=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
    }

    /**
     * Returns the FCD data for code point c.
     * @param c A Unicode code point.
     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
     */
    public int getFCD16(int c) {
        if(c<0) {
            return 0;
        } else if(c<0x180) {
            return tccc180[c];
        } else if(c<=0xffff) {
            if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
        }
        return getFCD16FromNormData(c);
    }
    /** Returns the FCD data for U+0000<=c>8];
        if(bits==0) { return false; }
        return ((bits>>((lead>>5)&7))&1)!=0;
    }

    /** Gets the FCD value from the regular normalization data. */
    public int getFCD16FromNormData(int c) {
        // Only loops for 1:1 algorithmic mappings.
        for(;;) {
            int norm16=getNorm16(c);
            if(norm16<=minYesNo) {
                // no decomposition or Hangul syllable, all zeros
                return 0;
            } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
                // combining mark
                norm16&=0xff;
                return norm16|(norm16<<8);
            } else if(norm16>=minMaybeYes) {
                return 0;
            } else if(isDecompNoAlgorithmic(norm16)) {
                c=mapAlgorithmic(c, norm16);
            } else {
                // c decomposes, get everything from the variable-length extra data
                int firstUnit=extraData.charAt(norm16);
                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                    // A character that is deleted (maps to an empty string) must
                    // get the worst-case lccc and tccc values because arbitrary
                    // characters on both sides will become adjacent.
                    return 0x1ff;
                } else {
                    int fcd16=firstUnit>>8;  // tccc
                    if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                        fcd16|=extraData.charAt(norm16-1)&0xff00;  // lccc
                    }
                    return fcd16;
                }
            }
        }
    }

    /**
     * Gets the decomposition for one code point.
     * @param c code point
     * @return c's decomposition, if it has one; returns null if it does not have a decomposition
     */
    public String getDecomposition(int c) {
        int decomp=-1;
        int norm16;
        for(;;) {
            if(c>7)&1)-1;
                char rm0=extraData.charAt(rawMapping);
                if(rm0<=MAPPING_LENGTH_MASK) {
                    return extraData.substring(rawMapping-rm0, rawMapping);
                } else {
                    // Copy the normal mapping and replace its first two code units with rm0.
                    StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
                    norm16+=1+2;  // skip over the firstUnit and the first two mapping code units
                    return buffer.append(extraData, norm16, norm16+mLength-2).toString();
                }
            } else {
                norm16+=1;  // skip over the firstUnit
                return extraData.substring(norm16, norm16+mLength);
            }
        }
    }

    /**
     * Returns true if code point c starts a canonical-iterator string segment.
     * {@link #ensureCanonIterData()} must have been called before this method,
     * or else this method will crash.
     * @param c A Unicode code point.
     * @return true if c starts a canonical-iterator string segment.
     */
    public boolean isCanonSegmentStarter(int c) {
        return canonIterData.get(c)>=0;
    }
    /**
     * Returns true if there are characters whose decomposition starts with c.
     * If so, then the set is cleared and then filled with those characters.
     * {@link #ensureCanonIterData()} must have been called before this method,
     * or else this method will crash.
     * @param c A Unicode code point.
     * @param set A UnicodeSet to receive the characters whose decompositions
     *        start with c, if there are any.
     * @return true if there are characters whose decomposition starts with c.
     */
    public boolean getCanonStartSet(int c, UnicodeSet set) {
        int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
        if(canonValue==0) {
            return false;
        }
        set.clear();
        int value=canonValue&CANON_VALUE_MASK;
        if((canonValue&CANON_HAS_SET)!=0) {
            set.addAll(canonStartSets.get(value));
        } else if(value!=0) {
            set.add(value);
        }
        if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
            int norm16=getNorm16(c);
            if(norm16==JAMO_L) {
                int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
                set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
            } else {
                addComposites(getCompositionsList(norm16), set);
            }
        }
        return true;
    }

    public static final int MIN_CCC_LCCC_CP=0x300;

    public static final int MIN_YES_YES_WITH_CC=0xff01;
    public static final int JAMO_VT=0xff00;
    public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
    public static final int JAMO_L=1;
    public static final int MAX_DELTA=0x40;

    // Byte offsets from the start of the data, after the generic header.
    public static final int IX_NORM_TRIE_OFFSET=0;
    public static final int IX_EXTRA_DATA_OFFSET=1;
    public static final int IX_SMALL_FCD_OFFSET=2;
    public static final int IX_RESERVED3_OFFSET=3;
    public static final int IX_TOTAL_SIZE=7;

    // Code point thresholds for quick check codes.
    public static final int IX_MIN_DECOMP_NO_CP=8;
    public static final int IX_MIN_COMP_NO_MAYBE_CP=9;

    // Norm16 value thresholds for quick check combinations and types of extra data.
    // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
    public static final int IX_MIN_YES_NO=10;
    public static final int IX_MIN_NO_NO=11;
    public static final int IX_LIMIT_NO_NO=12;
    public static final int IX_MIN_MAYBE_YES=13;

    // Mappings only in [minYesNoMappingsOnly..minNoNo[.
    public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;

    public static final int IX_COUNT=16;

    public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
    public static final int MAPPING_HAS_RAW_MAPPING=0x40;
    public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20;
    public static final int MAPPING_LENGTH_MASK=0x1f;

    public static final int COMP_1_LAST_TUPLE=0x8000;
    public static final int COMP_1_TRIPLE=1;
    public static final int COMP_1_TRAIL_LIMIT=0x3400;
    public static final int COMP_1_TRAIL_MASK=0x7ffe;
    public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
    public static final int COMP_2_TRAIL_SHIFT=6;
    public static final int COMP_2_TRAIL_MASK=0xffc0;

    // higher-level functionality ------------------------------------------ ***

    // NFD without an NFD Normalizer2 instance.
    public Appendable decompose(CharSequence s, StringBuilder dest) {
        decompose(s, 0, s.length(), dest, s.length());
        return dest;
    }
    /**
     * Decomposes s[src, limit[ and writes the result to dest.
     * limit can be NULL if src is NUL-terminated.
     * destLengthEstimate is the initial dest buffer capacity and can be -1.
     */
    public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
                   int destLengthEstimate) {
        if(destLengthEstimate<0) {
            destLengthEstimate=limit-src;
        }
        dest.setLength(0);
        ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
        decompose(s, src, limit, buffer);
    }

    // Dual functionality:
    // buffer!=NULL: normalize
    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
    public int decompose(CharSequence s, int src, int limit,
                         ReorderingBuffer buffer) {
        int minNoCP=minDecompNoCP;

        int prevSrc;
        int c=0;
        int norm16=0;

        // only for quick check
        int prevBoundary=src;
        int prevCC=0;

        for(;;) {
            // count code units below the minimum or with irrelevant data for the quick check
            for(prevSrc=src; src!=limit;) {
                if( (c=s.charAt(src))=limit) {
                break;
            }
            c=Character.codePointAt(s, src);
            cc=getCC(getNorm16(c));
        };
        buffer.append(s, 0, src, firstCC, prevCC);
        buffer.append(s, src, limit);
    }
    // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
    // doCompose: normalize
    // !doCompose: isNormalized (buffer must be empty and initialized)
    public boolean compose(CharSequence s, int src, int limit,
                           boolean onlyContiguous,
                           boolean doCompose,
                           ReorderingBuffer buffer) {
        int minNoMaybeCP=minCompNoMaybeCP;

        /*
         * prevBoundary points to the last character before the current one
         * that has a composition boundary before it with ccc==0 and quick check "yes".
         * Keeping track of prevBoundary saves us looking for a composition boundary
         * when we find a "no" or "maybe".
         *
         * When we back out from prevSrc back to prevBoundary,
         * then we also remove those same characters (which had been simply copied
         * or canonically-order-inserted) from the ReorderingBuffer.
         * Therefore, at all times, the [prevBoundary..prevSrc[ source units
         * must correspond 1:1 to destination units at the end of the destination buffer.
         */
        int prevBoundary=src;
        int prevSrc;
        int c=0;
        int norm16=0;

        // only for isNormalized
        int prevCC=0;

        for(;;) {
            // count code units below the minimum or with irrelevant data for the quick check
            for(prevSrc=src; src!=limit;) {
                if( (c=s.charAt(src))=minNoNo.
             * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
             * or has ccc!=0.
             * Check for Jamo V/T, then for regular characters.
             * c is not a Hangul syllable or Jamo L because those have "yes" properties.
             */
            if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
                char prev=s.charAt(prevSrc-1);
                boolean needToDecompose=false;
                if(c=MIN_YES_YES_WITH_CC) {
                int cc=norm16&0xff;  // cc!=0
                if( onlyContiguous &&  // FCC
                    (doCompose ? buffer.getLastCC() : prevCC)==0 &&
                    prevBoundarycc
                ) {
                    // Fails FCD test, need to decompose and contiguously recompose.
                    if(!doCompose) {
                        return false;
                    }
                } else if(doCompose) {
                    buffer.append(c, cc);
                    continue;
                } else if(prevCC<=cc) {
                    prevCC=cc;
                    continue;
                } else {
                    return false;
                }
            } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
                return false;
            }

            /*
             * Find appropriate boundaries around this character,
             * decompose the source text from between the boundaries,
             * and recompose it.
             *
             * We may need to remove the last few characters from the ReorderingBuffer
             * to account for source text that was copied or appended
             * but needs to take part in the recomposition.
             */

            /*
             * Find the last composition boundary in [prevBoundary..src[.
             * It is either the decomposition of the current character (at prevSrc),
             * or prevBoundary.
             */
            if(hasCompBoundaryBefore(c, norm16)) {
                prevBoundary=prevSrc;
            } else if(doCompose) {
                buffer.removeSuffix(prevSrc-prevBoundary);
            }

            // Find the next composition boundary in [src..limit[ -
            // modifies src to point to the next starter.
            src=findNextCompBoundary(s, src, limit);

            // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
            int recomposeStartIndex=buffer.length();
            decomposeShort(s, prevBoundary, src, buffer);
            recompose(buffer, recomposeStartIndex, onlyContiguous);
            if(!doCompose) {
                if(!buffer.equals(s, prevBoundary, src)) {
                    return false;
                }
                buffer.remove();
                prevCC=0;
            }

            // Move to the next starter. We never need to look back before this point again.
            prevBoundary=src;
        }
        return true;
    }
    /**
     * Very similar to compose(): Make the same changes in both places if relevant.
     * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
     * !doSpan: quickCheck
     * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
     *         bit 0: set if "maybe"; otherwise, if the span length<s.length()
     *         then the quick check result is "no"
     */
    public int composeQuickCheck(CharSequence s, int src, int limit,
                                 boolean onlyContiguous, boolean doSpan) {
        int qcResult=0;
        int minNoMaybeCP=minCompNoMaybeCP;

        /*
         * prevBoundary points to the last character before the current one
         * that has a composition boundary before it with ccc==0 and quick check "yes".
         */
        int prevBoundary=src;
        int prevSrc;
        int c=0;
        int norm16=0;
        int prevCC=0;

        for(;;) {
            // count code units below the minimum or with irrelevant data for the quick check
            for(prevSrc=src;;) {
                if(src==limit) {
                    return (src<<1)|qcResult;  // "yes" or "maybe"
                }
                if( (c=s.charAt(src))=minNoNo.
             * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
             * or has ccc!=0.
             */
            if(isMaybeOrNonZeroCC(norm16)) {
                int cc=getCCFromYesOrMaybe(norm16);
                if( onlyContiguous &&  // FCC
                    cc!=0 &&
                    prevCC==0 &&
                    prevBoundarycc
                ) {
                    // Fails FCD test.
                } else if(prevCC<=cc || cc==0) {
                    prevCC=cc;
                    if(norm16appendZeroCC() because we track
        // the lead and trail combining classes here, rather than leaving it to
        // the ReorderingBuffer.
        // The exception is the call to decomposeShort() which uses the buffer
        // in the normal way.

        // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
        // Similar to the prevBoundary in the compose() implementation.
        int prevBoundary=src;
        int prevSrc;
        int c=0;
        int prevFCD16=0;
        int fcd16=0;

        for(;;) {
            // count code units with lccc==0
            for(prevSrc=src; src!=limit;) {
                if((c=s.charAt(src))1) {
                        --prevBoundary;
                    }
                } else {
                    int p=src-1;
                    if( Character.isLowSurrogate(s.charAt(p)) && prevSrc

1) { prevBoundary=p; } } if(buffer!=null) { // The last lccc==0 character is excluded from the // flush-and-append call in case it needs to be modified. buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); buffer.append(s, prevBoundary, src); } // The start of the current character (c). prevSrc=src; } else if(src==limit) { break; } src+=Character.charCount(c); // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. // Check for proper order, and decompose locally if necessary. if((prevFCD16&0xff)<=(fcd16>>8)) { // proper order: prev tccc <= current lccc if((fcd16&0xff)<=1) { prevBoundary=src; } if(buffer!=null) { buffer.appendZeroCC(c); } prevFCD16=fcd16; continue; } else if(buffer==null) { return prevBoundary; // quick check "no" } else { /* * Back out the part of the source that we copied or appended * already but is now going to be decomposed. * prevSrc is set to after what was copied/appended. */ buffer.removeSuffix(prevSrc-prevBoundary); /* * Find the part of the source that needs to be decomposed, * up to the next safe boundary. */ src=findNextFCDBoundary(s, src, limit); /* * The source text does not fulfill the conditions for FCD. * Decompose and reorder a limited piece of the text. */ decomposeShort(s, prevBoundary, src, buffer); prevBoundary=src; prevFCD16=0; } } return src; } public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) { int src=0, limit=s.length(); if(!buffer.isEmpty()) { int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit); if(0!=firstBoundaryInSrc) { int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(), buffer.length()); StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+ firstBoundaryInSrc+16); middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length()); buffer.removeSuffix(buffer.length()-lastBoundaryInDest); middle.append(s, 0, firstBoundaryInSrc); makeFCD(middle, 0, middle.length(), buffer); src=firstBoundaryInSrc; } } if(doMakeFCD) { makeFCD(s, src, limit, buffer); } else { buffer.append(s, src, limit); } } // Note: hasDecompBoundary() could be implemented as aliases to // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() // at the cost of building the FCD trie for a decomposition normalizer. public boolean hasDecompBoundary(int c, boolean before) { for(;;) { if(cMIN_NORMAL_MAYBE_YES) { return false; // ccc!=0 } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); } else { // c decomposes, get everything from the variable-length extra data int firstUnit=extraData.charAt(norm16); if((firstUnit&MAPPING_LENGTH_MASK)==0) { return false; } if(!before) { // decomp after-boundary: same as hasFCDBoundaryAfter(), // fcd16<=1 || trailCC==0 if(firstUnit>0x1ff) { return false; // trailCC>1 } if(firstUnit<=0xff) { return true; // trailCC==0 } // if(trailCC==1) test leadCC==0, same as checking for before-boundary } // true if leadCC==0 (hasFCDBoundaryBefore()) return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0; } } } public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } public boolean hasCompBoundaryBefore(int c) { return c= (testInert ? minNoNo : minMaybeYes)) { return false; } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); } else { // c decomposes, get everything from the variable-length extra data. // If testInert, then c must be a yesNo character which has lccc=0, // otherwise it could be a noNo. int firstUnit=extraData.charAt(norm16); // true if // not MAPPING_NO_COMP_BOUNDARY_AFTER // (which is set if // c is not deleted, and // it and its decomposition do not combine forward, and it has a starter) // and if FCC then trailCC<=1 return (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && (!onlyContiguous || firstUnit<=0x1ff); } } } public boolean hasFCDBoundaryBefore(int c) { return c=minMaybeYes; } private static boolean isInert(int norm16) { return norm16==0; } private static boolean isJamoL(int norm16) { return norm16==1; } private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } private boolean isHangul(int norm16) { return norm16==minYesNo; } private boolean isCompYesAndZeroCC(int norm16) { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } // For use with isCompYes(). // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. // static uint8_t getCCFromYes(uint16_t norm16) { // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; // } private int getCCFromNoNo(int norm16) { if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { return extraData.charAt(norm16-1)&0xff; } else { return 0; } } // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) { int c; if(cpStart==(cpLimit-1)) { c=s.charAt(cpStart); } else { c=Character.codePointAt(s, cpStart); } int prevNorm16=getNorm16(c); if(prevNorm16<=minYesNo) { return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 } else { return extraData.charAt(prevNorm16)>>8; // tccc from yesNo } } // Requires algorithmic-NoNo. private int mapAlgorithmic(int c, int norm16) { return c+norm16-(minMaybeYes-MAX_DELTA-1); } // Requires minYesNo>8; if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { leadCC=extraData.charAt(norm16-1)>>8; } else { leadCC=0; } ++norm16; // skip over the firstUnit buffer.append(extraData, norm16, norm16+length, leadCC, trailCC); } return; } } /** * Finds the recomposition result for * a forward-combining "lead" character, * specified with a pointer to its compositions list, * and a backward-combining "trail" character. * *

If the lead and trail characters combine, then this function returns * the following "compositeAndFwd" value: *

     * Bits 21..1  composite character
     * Bit      0  set if the composite is a forward-combining starter
     * 
* otherwise it returns -1. * *

The compositions list has (trail, compositeAndFwd) pair entries, * encoded as either pairs or triples of 16-bit units. * The last entry has the high bit of its first unit set. * *

The list is sorted by ascending trail characters (there are no duplicates). * A linear search is used. * *

See normalizer2impl.h for a more detailed description * of the compositions list format. */ private static int combine(String compositions, int list, int trail) { int key1, firstUnit; if(trail(firstUnit=compositions.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if((firstUnit&COMP_1_TRIPLE)!=0) { return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); } else { return compositions.charAt(list+1); } } } else { // trail character is 3400..10FFFF // result entry has 3 units key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); int key2=(trail<(firstUnit=compositions.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if(key2>(secondUnit=compositions.charAt(list+1))) { if((firstUnit&COMP_1_LAST_TUPLE)!=0) { break; } else { list+=3; } } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); } else { break; } } else { break; } } } return -1; } /** * @param list some character's compositions list * @param set recursively receives the composites from these compositions */ private void addComposites(int list, UnicodeSet set) { int firstUnit, compositeAndFwd; do { firstUnit=maybeYesCompositions.charAt(list); if((firstUnit&COMP_1_TRIPLE)==0) { compositeAndFwd=maybeYesCompositions.charAt(list+1); list+=2; } else { compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| maybeYesCompositions.charAt(list+2); list+=3; } int composite=compositeAndFwd>>1; if((compositeAndFwd&1)!=0) { addComposites(getCompositionsListForComposite(getNorm16(composite)), set); } set.add(composite); } while((firstUnit&COMP_1_LAST_TUPLE)==0); } /* * Recomposes the buffer text starting at recomposeStartIndex * (which is in NFD - decomposed and canonically ordered), * and truncates the buffer contents. * * Note that recomposition never lengthens the text: * Any character consists of either one or two code units; * a composition may contain at most one more code unit than the original starter, * while the combining mark that is removed has at least one code unit. */ private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous) { StringBuilder sb=buffer.getStringBuilder(); int p=recomposeStartIndex; if(p==sb.length()) { return; } int starter, pRemove; int compositionsList; int c, compositeAndFwd; int norm16; int cc, prevCC; boolean starterIsSupplementary; // Some of the following variables are not used until we have a forward-combining starter // and are only initialized now to avoid compiler warnings. compositionsList=-1; // used as indicator for whether we have a forward-combining starter starter=-1; starterIsSupplementary=false; prevCC=0; for(;;) { c=sb.codePointAt(p); p+=Character.charCount(c); norm16=getNorm16(c); cc=getCCFromYesOrMaybe(norm16); if( // this character combines backward and isMaybe(norm16) && // we have seen a starter that combines forward and compositionsList>=0 && // the backward-combining character is not blocked (prevCC=0) { // The starter and the combining mark (c) do combine. int composite=compositeAndFwd>>1; // Remove the combining mark. pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark sb.delete(pRemove, p); p=pRemove; // Replace the starter with the composite. if(starterIsSupplementary) { if(composite>0xffff) { // both are supplementary sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); } else { sb.setCharAt(starter, (char)c); sb.deleteCharAt(starter+1); // The composite is shorter than the starter, // move the intermediate characters forward one. starterIsSupplementary=false; --p; } } else if(composite>0xffff) { // The composite is longer than the starter, // move the intermediate characters back one. starterIsSupplementary=true; sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); ++p; } else { // both are on the BMP sb.setCharAt(starter, (char)composite); } // Keep prevCC because we removed the combining mark. if(p==sb.length()) { break; } // Is the composite a starter that combines forward? if((compositeAndFwd&1)!=0) { compositionsList= getCompositionsListForComposite(getNorm16(composite)); } else { compositionsList=-1; } // We combined; continue with looking for compositions. continue; } } // no combination this time prevCC=cc; if(p==sb.length()) { break; } // If c did not combine, then check if it is a starter. if(cc==0) { // Found a new starter. if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { // It may combine with something, prepare for it. if(c<=0xffff) { starterIsSupplementary=false; starter=p-1; } else { starterIsSupplementary=true; starter=p-2; } } } else if(onlyContiguous) { // FCC: no discontiguous compositions; any intervening character blocks. compositionsList=-1; } } buffer.flush(); } public int composePair(int a, int b) { int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 int list; if(isInert(norm16)) { return -1; } else if(norm16minYesNo) { // composite 'a' has both mapping & compositions list list+= // mapping pointer 1+ // +1 to skip the first unit with the mapping lenth (extraData.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length } // Turn the offset-into-extraData into an offset-into-maybeYesCompositions. list+=MIN_NORMAL_MAYBE_YES-minMaybeYes; } } else if(norm16>1; } /** * Does c have a composition boundary before it? * True if its decomposition begins with a character that has * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes * (isCompYesAndZeroCC()) so we need not decompose. */ private boolean hasCompBoundaryBefore(int c, int norm16) { for(;;) { if(isCompYesAndZeroCC(norm16)) { return true; } else if(isMaybeOrNonZeroCC(norm16)) { return false; } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); norm16=getNorm16(c); } else { // c decomposes, get everything from the variable-length extra data int firstUnit=extraData.charAt(norm16); if((firstUnit&MAPPING_LENGTH_MASK)==0) { return false; } if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) { return false; // non-zero leadCC } return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); } } } private int findPreviousCompBoundary(CharSequence s, int p) { while(p>0) { int c=Character.codePointBefore(s, p); p-=Character.charCount(c); if(hasCompBoundaryBefore(c)) { break; } // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, // but that's probably not worth the extra cost. } return p; } private int findNextCompBoundary(CharSequence s, int p, int limit) { while(p0) { int c=Character.codePointBefore(s, p); p-=Character.charCount(c); if(c canonStartSets; // bits in canonIterData private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000; private static final int CANON_HAS_COMPOSITIONS = 0x40000000; private static final int CANON_HAS_SET = 0x200000; private static final int CANON_VALUE_MASK = 0x1fffff; }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy