// com.ibm.icu.impl.Normalizer2Impl Maven / Gradle / Ivy
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2009-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Iterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.VersionInfo;
public final class Normalizer2Impl {
public static final class Hangul {
/* Korean Hangul and Jamo constants */
public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
public static final int JAMO_L_END=0x1112;
public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
public static final int JAMO_V_END=0x1175;
public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
public static final int JAMO_T_END=0x11c2;
public static final int HANGUL_BASE=0xac00;
public static final int HANGUL_END=0xd7a3;
public static final int JAMO_L_COUNT=19;
public static final int JAMO_V_COUNT=21;
public static final int JAMO_T_COUNT=28;
public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
public static boolean isHangul(int c) {
return HANGUL_BASE<=c && c
* If dest is a StringBuilder, then the buffer writes directly to it.
* Otherwise, the buffer maintains a StringBuilder for intermediate text segments
* until no further changes are necessary and whole segments are appended.
* append() methods that take combining-class values always write to the StringBuilder.
* Other append() methods flush and append to the Appendable.
*/
public static final class ReorderingBuffer implements Appendable {
/**
 * Creates a buffer that delivers its text to {@code dest}.
 * If {@code dest} is a StringBuilder, the buffer works directly on it and
 * continues after whatever text it already holds; otherwise a private
 * StringBuilder is used for the intermediate text.
 *
 * @param ni the normalization data to use
 * @param dest the destination
 * @param destCapacity capacity to ensure when dest is a StringBuilder;
 *        not used for other kinds of Appendable
 */
public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
    impl=ni;
    app=dest;
    if(!(dest instanceof StringBuilder)) {
        // Arbitrary Appendable: start with an empty intermediate StringBuilder.
        appIsStringBuilder=false;
        str=new StringBuilder();
        reorderStart=0;
        lastCC=0;
        return;
    }
    // Write straight into the caller's StringBuilder.
    appIsStringBuilder=true;
    str=(StringBuilder)dest;
    // In Java, the constructor subsumes public void init(int destCapacity) {
    str.ensureCapacity(destCapacity);
    reorderStart=0;
    if(str.length()==0) {
        lastCC=0;
        return;
    }
    // There is existing text: inspect the combining classes at its end.
    setIterator();
    lastCC=previousCC();
    // Set reorderStart after the last code point with cc<=1 if there is one.
    if(lastCC>1) {
        while(previousCC()>1) {}
    }
    reorderStart=codePointLimit;
}
/** Returns true if the StringBuilder backing this buffer holds no text. */
public boolean isEmpty() { return str.length()==0; }
/** Returns the length, in chars, of the StringBuilder backing this buffer. */
public int length() { return str.length(); }
/** Returns the combining class currently recorded for the end of the buffered text. */
public int getLastCC() { return lastCC; }
/** Returns the backing StringBuilder itself (not a copy). */
public StringBuilder getStringBuilder() { return str; }
/**
 * Compares the entire buffered text with s[start, limit[.
 * Note: this is an overload, not an override of Object.equals(Object).
 */
public boolean equals(CharSequence s, int start, int limit) {
    final int bufferLength=str.length();
    return UTF16Plus.equal(str, 0, bufferLength, s, start, limit);
}
/**
 * Overwrites the last char of the buffered text.
 * For Hangul composition, replacing the Leading consonant Jamo with the syllable.
 */
public void setLastChar(char c) {
    final int lastIndex=str.length()-1;
    str.setCharAt(lastIndex, c);
}
/**
 * Adds code point c with combining class cc to the buffered text.
 * If cc is non-zero and lower than the last combining class, c is passed to
 * insert(); otherwise it is appended at the end.
 */
public void append(int c, int cc) {
    if(cc!=0 && cc<lastCC) {
        // Out of order relative to the end of the text.
        insert(c, cc);
        return;
    }
    str.appendCodePoint(c);
    lastCC=cc;
    if(cc<=1) {
        reorderStart=str.length();
    }
}
// s must be in NFD, otherwise change the implementation.
public void append(CharSequence s, int start, int limit,
int leadCC, int trailCC) {
if(start==limit) {
return;
}
if(lastCC<=leadCC || leadCC==0) {
if(trailCC<=1) {
reorderStart=str.length()+(limit-start);
} else if(leadCC<=1) {
reorderStart=str.length()+1; // Ok if not a code point boundary.
}
str.append(s, start, limit);
lastCC=trailCC;
} else {
int c=Character.codePointAt(s, start);
start+=Character.charCount(c);
insert(c, leadCC); // insert first code point
while(startcc;) {}
// insert c at codePointLimit, after the character with prevCC<=cc
if(c<=0xffff) {
str.insert(codePointLimit, (char)c);
if(cc<=1) {
reorderStart=codePointLimit+1;
}
} else {
str.insert(codePointLimit, Character.toChars(c));
if(cc<=1) {
reorderStart=codePointLimit+2;
}
}
}
// The Normalizer2Impl passed to the constructor.
private final Normalizer2Impl impl;
// The destination passed to the constructor.
private final Appendable app;
// The text being built: the destination itself if it is a StringBuilder,
// otherwise a separate intermediate StringBuilder.
private final StringBuilder str;
// True if app is a StringBuilder (in which case str is the same object).
private final boolean appIsStringBuilder;
// Index in str; append(c, cc) moves it to the end of str whenever cc<=1.
private int reorderStart;
// Combining class recorded for the end of the text in str.
private int lastCC;
// private backward iterator
/** Positions the private backward iterator at the end of the buffered text. */
private void setIterator() {
    codePointStart=str.length();
}
private void skipPrevious() { // Requires 0=codePointStart) {
return 0;
}
int c=str.codePointBefore(codePointStart);
codePointStart-=Character.charCount(c);
if(c(nextOffset-offset)) {
throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
}
ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes
// Read the composition and mapping data.
offset=nextOffset;
nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
int numChars=(nextOffset-offset)/2;
if(numChars!=0) {
maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
}
// smallFCD: new in formatVersion 2
offset=nextOffset;
smallFCD=new byte[0x100];
bytes.get(smallFCD);
// Build tccc180[].
// gennorm2 enforces lccc=0 for c>=1) {
if((c&0xff)==0) {
bits=smallFCD[c>>8]; // one byte per 0x100 code points
}
if((bits&1)!=0) {
for(int i=0; i<0x20; ++i, ++c) {
tccc180[c]=getFCD16FromNormData(c)&0xff;
}
} else {
c+=0x20;
}
}
return this;
} catch(IOException e) {
throw new ICUUncheckedIOException(e);
}
}
/**
 * Fetches the required binary data with the given name via ICUBinary
 * and loads it into this instance.
 *
 * @param name name of the data item
 * @return the result of loading the fetched bytes
 */
public Normalizer2Impl load(String name) {
    final ByteBuffer bytes=ICUBinary.getRequiredData(name);
    return load(bytes);
}
/**
 * Adds to set the code points of [start, end] whose FCD16 value is above 0xff,
 * that is, whose lccc bits (15..8) are not all zero.
 * For algorithmic-noNo ranges each code point is tested on its own;
 * otherwise the test on start decides for the whole range.
 */
private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) {
    if(!isAlgorithmicNoNo(norm16)) {
        if(getFCD16(start)>0xff) {
            set.add(start, end);
        }
        return;
    }
    // Range of code points with same-norm16-value algorithmic decompositions.
    // They might have different non-zero FCD16 values.
    int cp=start;
    do {
        if(getFCD16(cp)>0xff) {
            set.add(cp);
        }
        ++cp;
    } while(cp<=end);
}
/**
 * Adds start to set, and for an algorithmic-noNo range with more than one
 * code point also adds every code point whose FCD16 value differs from
 * that of its predecessor.
 */
private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
    /* add the start code point to the USet */
    set.add(start);
    if(start==end || !isAlgorithmicNoNo(value)) {
        return;
    }
    // Range of code points with same-norm16-value algorithmic decompositions.
    // They might have different non-zero FCD16 values.
    int prevFCD16=getFCD16(start);
    for(int cp=start+1; cp<=end; ++cp) {
        final int fcd16=getFCD16(cp);
        if(fcd16!=prevFCD16) {
            set.add(cp);
            prevFCD16=fcd16;
        }
    }
}
/**
 * Adds to set the code points that enumLcccRange() selects, i.e. those whose
 * FCD16 value has non-zero lccc bits.
 *
 * @param set receives the code points; existing contents are kept
 */
public void addLcccChars(UnicodeSet set) {
    // Walk the same-value ranges of the normalization trie and stop at the
    // first range that is flagged as a lead-surrogate range.
    // The iterator must be typed: with a raw Iterator, next() returns Object
    // and cannot be assigned to a Trie2.Range.
    Iterator<Trie2.Range> trieIterator=normTrie.iterator();
    Trie2.Range range;
    while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
        enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set);
    }
}
public void addPropertyStarts(UnicodeSet set) {
/* add the start code point of each same-value range of each trie */
Iterator trieIterator=normTrie.iterator();
Trie2.Range range;
while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set);
}
/* add Hangul LV syllables and LV+1 because of skippables */
for(int c=Hangul.HANGUL_BASE; c trieIterator=canonIterData.iterator(segmentStarterMapper);
Trie2.Range range;
while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
/* add the start code point to the USet */
set.add(range.startCodePoint);
}
}
/** Maps a trie value to just its CANON_NOT_SEGMENT_STARTER bit. */
private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {
    @Override
    public int map(int trieValue) {
        final int starterBit=CANON_NOT_SEGMENT_STARTER&trieValue;
        return starterBit;
    }
};
// low-level properties ------------------------------------------------ ***
/** Returns the normalization trie held by this instance (the same object, not a copy). */
public Trie2_16 getNormTrie() {
    return normTrie;
}
// Note: Normalizer2Impl.java r30983 (2011-nov-27)
// still had getFCDTrie() which built and cached an FCD trie.
// That provided faster access to FCD data than getFCD16FromNormData()
// but required synchronization and consumed some 10kB of heap memory
// in any process that uses FCD (e.g., via collation).
// tccc180[] and smallFCD[] are intended to help with any loss of performance,
// at least for Latin & CJK.
/**
* Builds the canonical-iterator data for this instance.
* This is required before any of {@link #isCanonSegmentStarter(int)} or
* {@link #getCanonStartSet(int, UnicodeSet)} are called,
* or else they crash.
* @return this
*/
public synchronized Normalizer2Impl ensureCanonIterData() {
if(canonIterData==null) {
Trie2Writable newData=new Trie2Writable(0, 0);
canonStartSets=new ArrayList();
Iterator trieIterator=normTrie.iterator();
Trie2.Range range;
while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
final int norm16=range.value;
if(norm16==0 || (minYesNo<=norm16 && norm16=minMaybeYes) {
// not a segment starter if it occurs in a decomposition or has cc!=0
newValue|=CANON_NOT_SEGMENT_STARTER;
if(norm16=minNoNo) {
while((norm16_2+=Character.charCount(c2))=MIN_NORMAL_MAYBE_YES) {
return norm16&0xff;
}
if(norm16=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
}
/**
 * Returns the FCD data for code point c.
 * Negative input yields 0, code points below U+0180 are answered from the
 * tccc180[] table, and BMP code points are first screened with
 * singleLeadMightHaveNonZeroFCD16() before the normalization data is consulted.
 * @param c A Unicode code point.
 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
 */
public int getFCD16(int c) {
    if(c<0) {
        return 0;
    }
    if(c<0x180) {
        return tccc180[c];
    }
    if(c<=0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
        return 0;
    }
    return getFCD16FromNormData(c);
}
/** Returns the FCD data for U+0000<=c>8];
if(bits==0) { return false; }
return ((bits>>((lead>>5)&7))&1)!=0;
}
/**
 * Gets the FCD value from the regular normalization data.
 * @param c code point
 * @return lccc in bits 15..8 and tccc in bits 7..0
 */
public int getFCD16FromNormData(int c) {
    int cp=c;
    // Only loops for 1:1 algorithmic mappings.
    while(true) {
        final int norm16=getNorm16(cp);
        if(norm16<=minYesNo) {
            // no decomposition or Hangul syllable, all zeros
            return 0;
        }
        if(norm16>=MIN_NORMAL_MAYBE_YES) {
            // combining mark: the same class serves as lccc and tccc
            final int cc=norm16&0xff;
            return cc|(cc<<8);
        }
        if(norm16>=minMaybeYes) {
            return 0;
        }
        if(isDecompNoAlgorithmic(norm16)) {
            cp=mapAlgorithmic(cp, norm16);
            continue;
        }
        // c decomposes, get everything from the variable-length extra data
        final int firstUnit=extraData.charAt(norm16);
        if((firstUnit&MAPPING_LENGTH_MASK)==0) {
            // A character that is deleted (maps to an empty string) must
            // get the worst-case lccc and tccc values because arbitrary
            // characters on both sides will become adjacent.
            return 0x1ff;
        }
        int fcd16=firstUnit>>8;  // tccc
        if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
            fcd16|=extraData.charAt(norm16-1)&0xff00;  // lccc
        }
        return fcd16;
    }
}
/**
* Gets the decomposition for one code point.
* @param c code point
* @return c's decomposition, if it has one; returns null if it does not have a decomposition
*/
public String getDecomposition(int c) {
int decomp=-1;
int norm16;
for(;;) {
if(c>7)&1)-1;
char rm0=extraData.charAt(rawMapping);
if(rm0<=MAPPING_LENGTH_MASK) {
return extraData.substring(rawMapping-rm0, rawMapping);
} else {
// Copy the normal mapping and replace its first two code units with rm0.
StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
norm16+=1+2; // skip over the firstUnit and the first two mapping code units
return buffer.append(extraData, norm16, norm16+mLength-2).toString();
}
} else {
norm16+=1; // skip over the firstUnit
return extraData.substring(norm16, norm16+mLength);
}
}
}
/**
 * Tells whether code point c starts a canonical-iterator string segment,
 * which is the case when the CANON_NOT_SEGMENT_STARTER bit (the sign bit)
 * of its canonIterData value is clear.
 * {@link #ensureCanonIterData()} must have been called before this method,
 * or else this method will crash.
 * @param c A Unicode code point.
 * @return true if c starts a canonical-iterator string segment.
 */
public boolean isCanonSegmentStarter(int c) {
    final int canonValue=canonIterData.get(c);
    return (canonValue&CANON_NOT_SEGMENT_STARTER)==0;
}
/**
 * Reports whether any characters have a decomposition that starts with c,
 * and if so replaces the contents of the set with those characters.
 * If there are none, the set is left untouched.
 * {@link #ensureCanonIterData()} must have been called before this method,
 * or else this method will crash.
 * @param c A Unicode code point.
 * @param set A UnicodeSet to receive the characters whose decompositions
 *        start with c, if there are any.
 * @return true if there are characters whose decomposition starts with c.
 */
public boolean getCanonStartSet(int c, UnicodeSet set) {
    // Ignore the segment-starter bit; only the remaining bits matter here.
    final int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
    if(canonValue==0) {
        return false;
    }
    set.clear();
    final int value=canonValue&CANON_VALUE_MASK;
    final boolean valueIsSetIndex=(canonValue&CANON_HAS_SET)!=0;
    if(valueIsSetIndex) {
        set.addAll(canonStartSets.get(value));
    } else if(value!=0) {
        set.add(value);
    }
    if((canonValue&CANON_HAS_COMPOSITIONS)==0) {
        return true;
    }
    final int norm16=getNorm16(c);
    if(norm16!=JAMO_L) {
        addComposites(getCompositionsList(norm16), set);
        return true;
    }
    // Jamo L: add the whole block of syllables that begin with this consonant.
    final int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
    set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
    return true;
}
public static final int MIN_CCC_LCCC_CP=0x300;
public static final int MIN_YES_YES_WITH_CC=0xff01;
// The norm16 value that isJamoVT() tests for.
public static final int JAMO_VT=0xff00;
// norm16 values at or above this carry a combining class in their low byte
// (see getFCD16FromNormData()).
public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
// The norm16 value that getCanonStartSet() treats as a Hangul leading consonant Jamo.
public static final int JAMO_L=1;
// Used by mapAlgorithmic() when turning an algorithmic norm16 value into a code point.
public static final int MAX_DELTA=0x40;
// Indexes into the indexes array of the data (inIndexes[] where the data is loaded).
// Byte offsets from the start of the data, after the generic header.
public static final int IX_NORM_TRIE_OFFSET=0;
public static final int IX_EXTRA_DATA_OFFSET=1;
public static final int IX_SMALL_FCD_OFFSET=2;
public static final int IX_RESERVED3_OFFSET=3;
public static final int IX_TOTAL_SIZE=7;
// Code point thresholds for quick check codes.
public static final int IX_MIN_DECOMP_NO_CP=8;
public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
// Norm16 value thresholds for quick check combinations and types of extra data.
// Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
public static final int IX_MIN_YES_NO=10;
public static final int IX_MIN_NO_NO=11;
public static final int IX_LIMIT_NO_NO=12;
public static final int IX_MIN_MAYBE_YES=13;
// Mappings only in [minYesNoMappingsOnly..minNoNo[.
public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
public static final int IX_COUNT=16;
// Bits in the first unit of a mapping in extraData.
// If set, the unit before the first unit holds lccc in its high byte and ccc in its low byte.
public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
public static final int MAPPING_HAS_RAW_MAPPING=0x40;
public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20;
// Mask for the mapping length; a length of 0 means the character is deleted.
public static final int MAPPING_LENGTH_MASK=0x1f;
// Bits and masks for the units of a compositions list entry.
// Set in the first unit of the last entry of a list.
public static final int COMP_1_LAST_TUPLE=0x8000;
// Set in the first unit of an entry that has three units rather than two.
public static final int COMP_1_TRIPLE=1;
public static final int COMP_1_TRAIL_LIMIT=0x3400;
public static final int COMP_1_TRAIL_MASK=0x7ffe;
public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit
public static final int COMP_2_TRAIL_SHIFT=6;
public static final int COMP_2_TRAIL_MASK=0xffc0;
// higher-level functionality ------------------------------------------ ***
// NFD without an NFD Normalizer2 instance.
/**
 * Decomposes all of s into dest, replacing dest's previous contents,
 * with the length of s as the capacity estimate.
 *
 * @param s the text to decompose
 * @param dest receives the result
 * @return dest
 */
public Appendable decompose(CharSequence s, StringBuilder dest) {
    final int length=s.length();
    decompose(s, 0, length, dest, length);
    return dest;
}
/**
 * Decomposes s[src, limit[ and writes the result to dest,
 * after first emptying dest.
 *
 * @param s the source text
 * @param src start index in s
 * @param limit end index (exclusive) in s
 * @param dest receives the result; its previous contents are discarded
 * @param destLengthEstimate capacity to ensure in dest;
 *        if negative, limit-src is used instead
 */
public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
                      int destLengthEstimate) {
    final int capacity= destLengthEstimate<0 ? limit-src : destLengthEstimate;
    dest.setLength(0);
    decompose(s, src, limit, new ReorderingBuffer(this, dest, capacity));
}
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
public int decompose(CharSequence s, int src, int limit,
ReorderingBuffer buffer) {
int minNoCP=minDecompNoCP;
int prevSrc;
int c=0;
int norm16=0;
// only for quick check
int prevBoundary=src;
int prevCC=0;
for(;;) {
// count code units below the minimum or with irrelevant data for the quick check
for(prevSrc=src; src!=limit;) {
if( (c=s.charAt(src))=limit) {
break;
}
c=Character.codePointAt(s, src);
cc=getCC(getNorm16(c));
};
buffer.append(s, 0, src, firstCC, prevCC);
buffer.append(s, src, limit);
}
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
public boolean compose(CharSequence s, int src, int limit,
boolean onlyContiguous,
boolean doCompose,
ReorderingBuffer buffer) {
int minNoMaybeCP=minCompNoMaybeCP;
/*
* prevBoundary points to the last character before the current one
* that has a composition boundary before it with ccc==0 and quick check "yes".
* Keeping track of prevBoundary saves us looking for a composition boundary
* when we find a "no" or "maybe".
*
* When we back out from prevSrc back to prevBoundary,
* then we also remove those same characters (which had been simply copied
* or canonically-order-inserted) from the ReorderingBuffer.
* Therefore, at all times, the [prevBoundary..prevSrc[ source units
* must correspond 1:1 to destination units at the end of the destination buffer.
*/
int prevBoundary=src;
int prevSrc;
int c=0;
int norm16=0;
// only for isNormalized
int prevCC=0;
for(;;) {
// count code units below the minimum or with irrelevant data for the quick check
for(prevSrc=src; src!=limit;) {
if( (c=s.charAt(src))=minNoNo.
* c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
* or has ccc!=0.
* Check for Jamo V/T, then for regular characters.
* c is not a Hangul syllable or Jamo L because those have "yes" properties.
*/
if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
char prev=s.charAt(prevSrc-1);
boolean needToDecompose=false;
if(c=MIN_YES_YES_WITH_CC) {
int cc=norm16&0xff; // cc!=0
if( onlyContiguous && // FCC
(doCompose ? buffer.getLastCC() : prevCC)==0 &&
prevBoundarycc
) {
// Fails FCD test, need to decompose and contiguously recompose.
if(!doCompose) {
return false;
}
} else if(doCompose) {
buffer.append(c, cc);
continue;
} else if(prevCC<=cc) {
prevCC=cc;
continue;
} else {
return false;
}
} else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
return false;
}
/*
* Find appropriate boundaries around this character,
* decompose the source text from between the boundaries,
* and recompose it.
*
* We may need to remove the last few characters from the ReorderingBuffer
* to account for source text that was copied or appended
* but needs to take part in the recomposition.
*/
/*
* Find the last composition boundary in [prevBoundary..src[.
* It is either the decomposition of the current character (at prevSrc),
* or prevBoundary.
*/
if(hasCompBoundaryBefore(c, norm16)) {
prevBoundary=prevSrc;
} else if(doCompose) {
buffer.removeSuffix(prevSrc-prevBoundary);
}
// Find the next composition boundary in [src..limit[ -
// modifies src to point to the next starter.
src=findNextCompBoundary(s, src, limit);
// Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
int recomposeStartIndex=buffer.length();
decomposeShort(s, prevBoundary, src, buffer);
recompose(buffer, recomposeStartIndex, onlyContiguous);
if(!doCompose) {
if(!buffer.equals(s, prevBoundary, src)) {
return false;
}
buffer.remove();
prevCC=0;
}
// Move to the next starter. We never need to look back before this point again.
prevBoundary=src;
}
return true;
}
/**
* Very similar to compose(): Make the same changes in both places if relevant.
* doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
* !doSpan: quickCheck
* @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
* bit 0: set if "maybe"; otherwise, if the span length<s.length()
* then the quick check result is "no"
*/
public int composeQuickCheck(CharSequence s, int src, int limit,
boolean onlyContiguous, boolean doSpan) {
int qcResult=0;
int minNoMaybeCP=minCompNoMaybeCP;
/*
* prevBoundary points to the last character before the current one
* that has a composition boundary before it with ccc==0 and quick check "yes".
*/
int prevBoundary=src;
int prevSrc;
int c=0;
int norm16=0;
int prevCC=0;
for(;;) {
// count code units below the minimum or with irrelevant data for the quick check
for(prevSrc=src;;) {
if(src==limit) {
return (src<<1)|qcResult; // "yes" or "maybe"
}
if( (c=s.charAt(src))=minNoNo.
* c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
* or has ccc!=0.
*/
if(isMaybeOrNonZeroCC(norm16)) {
int cc=getCCFromYesOrMaybe(norm16);
if( onlyContiguous && // FCC
cc!=0 &&
prevCC==0 &&
prevBoundarycc
) {
// Fails FCD test.
} else if(prevCC<=cc || cc==0) {
prevCC=cc;
if(norm16appendZeroCC() because we track
// the lead and trail combining classes here, rather than leaving it to
// the ReorderingBuffer.
// The exception is the call to decomposeShort() which uses the buffer
// in the normal way.
// Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
// Similar to the prevBoundary in the compose() implementation.
int prevBoundary=src;
int prevSrc;
int c=0;
int prevFCD16=0;
int fcd16=0;
for(;;) {
// count code units with lccc==0
for(prevSrc=src; src!=limit;) {
if((c=s.charAt(src))1) {
--prevBoundary;
}
} else {
int p=src-1;
if( Character.isLowSurrogate(s.charAt(p)) && prevSrc1) {
prevBoundary=p;
}
}
if(buffer!=null) {
// The last lccc==0 character is excluded from the
// flush-and-append call in case it needs to be modified.
buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
buffer.append(s, prevBoundary, src);
}
// The start of the current character (c).
prevSrc=src;
} else if(src==limit) {
break;
}
src+=Character.charCount(c);
// The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
// Check for proper order, and decompose locally if necessary.
if((prevFCD16&0xff)<=(fcd16>>8)) {
// proper order: prev tccc <= current lccc
if((fcd16&0xff)<=1) {
prevBoundary=src;
}
if(buffer!=null) {
buffer.appendZeroCC(c);
}
prevFCD16=fcd16;
continue;
} else if(buffer==null) {
return prevBoundary; // quick check "no"
} else {
/*
* Back out the part of the source that we copied or appended
* already but is now going to be decomposed.
* prevSrc is set to after what was copied/appended.
*/
buffer.removeSuffix(prevSrc-prevBoundary);
/*
* Find the part of the source that needs to be decomposed,
* up to the next safe boundary.
*/
src=findNextFCDBoundary(s, src, limit);
/*
* The source text does not fulfill the conditions for FCD.
* Decompose and reorder a limited piece of the text.
*/
decomposeShort(s, prevBoundary, src, buffer);
prevBoundary=src;
prevFCD16=0;
}
}
return src;
}
/**
 * Appends s to the buffer. If the buffer already has text and s does not
 * start at an FCD boundary, the end of the buffered text (from its last FCD
 * boundary) and the start of s (up to its first FCD boundary) are joined
 * and run through makeFCD() together.
 * The remainder of s goes through makeFCD() if doMakeFCD is true and is
 * appended unchanged otherwise.
 */
public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
    final int limit=s.length();
    int src=0;
    if(!buffer.isEmpty()) {
        final int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
        if(firstBoundaryInSrc!=0) {
            final StringBuilder destText=buffer.getStringBuilder();
            final int destLength=buffer.length();
            final int lastBoundaryInDest=findPreviousFCDBoundary(destText, destLength);
            final int suffixLength=destLength-lastBoundaryInDest;
            // Text around the seam: end of the destination + start of the source.
            StringBuilder middle=new StringBuilder(suffixLength+firstBoundaryInSrc+16);
            middle.append(destText, lastBoundaryInDest, destLength);
            buffer.removeSuffix(suffixLength);
            middle.append(s, 0, firstBoundaryInSrc);
            makeFCD(middle, 0, middle.length(), buffer);
            src=firstBoundaryInSrc;
        }
    }
    if(!doMakeFCD) {
        buffer.append(s, src, limit);
        return;
    }
    makeFCD(s, src, limit, buffer);
}
// Note: hasDecompBoundary() could be implemented as aliases to
// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
// at the cost of building the FCD trie for a decomposition normalizer.
public boolean hasDecompBoundary(int c, boolean before) {
for(;;) {
if(cMIN_NORMAL_MAYBE_YES) {
return false; // ccc!=0
} else if(isDecompNoAlgorithmic(norm16)) {
c=mapAlgorithmic(c, norm16);
} else {
// c decomposes, get everything from the variable-length extra data
int firstUnit=extraData.charAt(norm16);
if((firstUnit&MAPPING_LENGTH_MASK)==0) {
return false;
}
if(!before) {
// decomp after-boundary: same as hasFCDBoundaryAfter(),
// fcd16<=1 || trailCC==0
if(firstUnit>0x1ff) {
return false; // trailCC>1
}
if(firstUnit<=0xff) {
return true; // trailCC==0
}
// if(trailCC==1) test leadCC==0, same as checking for before-boundary
}
// true if leadCC==0 (hasFCDBoundaryBefore())
return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
}
}
}
/** Returns true if the norm16 value of c passes isDecompYesAndZeroCC(). */
public boolean isDecompInert(int c) {
    final int norm16=getNorm16(c);
    return isDecompYesAndZeroCC(norm16);
}
public boolean hasCompBoundaryBefore(int c) {
return c= (testInert ? minNoNo : minMaybeYes)) {
return false;
} else if(isDecompNoAlgorithmic(norm16)) {
c=mapAlgorithmic(c, norm16);
} else {
// c decomposes, get everything from the variable-length extra data.
// If testInert, then c must be a yesNo character which has lccc=0,
// otherwise it could be a noNo.
int firstUnit=extraData.charAt(norm16);
// true if
// not MAPPING_NO_COMP_BOUNDARY_AFTER
// (which is set if
// c is not deleted, and
// it and its decomposition do not combine forward, and it has a starter)
// and if FCC then trailCC<=1
return
(firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
(!onlyContiguous || firstUnit<=0x1ff);
}
}
}
public boolean hasFCDBoundaryBefore(int c) { return c=minMaybeYes; }
/** Returns true if norm16 is 0. */
private static boolean isInert(int norm16) { return norm16==0; }
/**
 * Returns true if norm16 is the JAMO_L value.
 * Uses the named constant rather than a literal so that this test cannot
 * drift apart from the other users of JAMO_L.
 */
private static boolean isJamoL(int norm16) { return norm16==JAMO_L; }
/** Returns true if norm16 is the JAMO_VT value. */
private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
/** Returns true if norm16 equals this instance's minYesNo value. */
private boolean isHangul(int norm16) { return norm16==minYesNo; }
private boolean isCompYesAndZeroCC(int norm16) { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; }
// For use with isCompYes().
// Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
// static uint8_t getCCFromYes(uint16_t norm16) {
// return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
// }
/**
 * Returns the combining class stored with the mapping at extraData[norm16]:
 * the low byte of the preceding unit if the first unit has
 * MAPPING_HAS_CCC_LCCC_WORD set, otherwise 0.
 */
private int getCCFromNoNo(int norm16) {
    final boolean hasCccLcccWord=(extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0;
    return hasCccLcccWord ? extraData.charAt(norm16-1)&0xff : 0;
}
/**
 * Returns the trail combining class of the character at s[cpStart..cpLimit[.
 * Requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC().
 */
int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
    // A one-unit character can be read directly; otherwise decode the code point.
    final boolean isSingleUnit=(cpLimit-cpStart)==1;
    final int c= isSingleUnit ? s.charAt(cpStart) : Character.codePointAt(s, cpStart);
    final int prevNorm16=getNorm16(c);
    // yesYes and Hangul LV/LVT have ccc=tccc=0; otherwise take tccc from yesNo.
    return prevNorm16<=minYesNo ? 0 : extraData.charAt(prevNorm16)>>8;
}
/**
 * Maps c to another code point by adding an offset derived from norm16.
 * Requires algorithmic-NoNo.
 */
private int mapAlgorithmic(int c, int norm16) {
    final int bias=minMaybeYes-MAX_DELTA-1;
    return c+norm16-bias;
}
// Requires minYesNo>8;
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
leadCC=extraData.charAt(norm16-1)>>8;
} else {
leadCC=0;
}
++norm16; // skip over the firstUnit
buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
}
return;
}
}
/**
* Finds the recomposition result for
* a forward-combining "lead" character,
* specified with a pointer to its compositions list,
* and a backward-combining "trail" character.
*
* If the lead and trail characters combine, then this function returns
* the following "compositeAndFwd" value:
*
* Bits 21..1 composite character
* Bit 0 set if the composite is a forward-combining starter
*
* otherwise it returns -1.
*
* The compositions list has (trail, compositeAndFwd) pair entries,
* encoded as either pairs or triples of 16-bit units.
* The last entry has the high bit of its first unit set.
*
* <p>The list is sorted by ascending trail characters (there are no duplicates).
* A linear search is used.
*
* <p>See normalizer2impl.h for a more detailed description
* of the compositions list format.
*/
private static int combine(String compositions, int list, int trail) {
int key1, firstUnit;
if(trail(firstUnit=compositions.charAt(list))) {
list+=2+(firstUnit&COMP_1_TRIPLE);
}
if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
if((firstUnit&COMP_1_TRIPLE)!=0) {
return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
} else {
return compositions.charAt(list+1);
}
}
} else {
// trail character is 3400..10FFFF
// result entry has 3 units
key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
int key2=(trail<(firstUnit=compositions.charAt(list))) {
list+=2+(firstUnit&COMP_1_TRIPLE);
} else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
if(key2>(secondUnit=compositions.charAt(list+1))) {
if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
break;
} else {
list+=3;
}
} else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
} else {
break;
}
} else {
break;
}
}
}
return -1;
}
/**
 * Adds every composite found in a compositions list to the set, and descends
 * into the list of each composite that is flagged as combining forward.
 * @param list some character's compositions list
 * @param set recursively receives the composites from these compositions
 */
private void addComposites(int list, UnicodeSet set) {
    int pos=list;
    for(;;) {
        final int firstUnit=maybeYesCompositions.charAt(pos);
        final int compositeAndFwd;
        if((firstUnit&COMP_1_TRIPLE)!=0) {
            // Three-unit entry: the composite spans the second and third units.
            final int high=maybeYesCompositions.charAt(pos+1)&~COMP_2_TRAIL_MASK;
            compositeAndFwd=(high<<16)|maybeYesCompositions.charAt(pos+2);
            pos+=3;
        } else {
            compositeAndFwd=maybeYesCompositions.charAt(pos+1);
            pos+=2;
        }
        final int composite=compositeAndFwd>>1;
        if((compositeAndFwd&1)!=0) {
            addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
        }
        set.add(composite);
        if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
            return;  // that was the last entry of this list
        }
    }
}
/*
* Recomposes the buffer text starting at recomposeStartIndex
* (which is in NFD - decomposed and canonically ordered),
* and truncates the buffer contents.
*
* Note that recomposition never lengthens the text:
* Any character consists of either one or two code units;
* a composition may contain at most one more code unit than the original starter,
* while the combining mark that is removed has at least one code unit.
*/
private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
boolean onlyContiguous) {
StringBuilder sb=buffer.getStringBuilder();
int p=recomposeStartIndex;
if(p==sb.length()) {
return;
}
int starter, pRemove;
int compositionsList;
int c, compositeAndFwd;
int norm16;
int cc, prevCC;
boolean starterIsSupplementary;
// Some of the following variables are not used until we have a forward-combining starter
// and are only initialized now to avoid compiler warnings.
compositionsList=-1; // used as indicator for whether we have a forward-combining starter
starter=-1;
starterIsSupplementary=false;
prevCC=0;
for(;;) {
c=sb.codePointAt(p);
p+=Character.charCount(c);
norm16=getNorm16(c);
cc=getCCFromYesOrMaybe(norm16);
if( // this character combines backward and
isMaybe(norm16) &&
// we have seen a starter that combines forward and
compositionsList>=0 &&
// the backward-combining character is not blocked
(prevCC=0) {
// The starter and the combining mark (c) do combine.
int composite=compositeAndFwd>>1;
// Remove the combining mark.
pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark
sb.delete(pRemove, p);
p=pRemove;
// Replace the starter with the composite.
if(starterIsSupplementary) {
if(composite>0xffff) {
// both are supplementary
sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
} else {
sb.setCharAt(starter, (char)c);
sb.deleteCharAt(starter+1);
// The composite is shorter than the starter,
// move the intermediate characters forward one.
starterIsSupplementary=false;
--p;
}
} else if(composite>0xffff) {
// The composite is longer than the starter,
// move the intermediate characters back one.
starterIsSupplementary=true;
sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
++p;
} else {
// both are on the BMP
sb.setCharAt(starter, (char)composite);
}
// Keep prevCC because we removed the combining mark.
if(p==sb.length()) {
break;
}
// Is the composite a starter that combines forward?
if((compositeAndFwd&1)!=0) {
compositionsList=
getCompositionsListForComposite(getNorm16(composite));
} else {
compositionsList=-1;
}
// We combined; continue with looking for compositions.
continue;
}
}
// no combination this time
prevCC=cc;
if(p==sb.length()) {
break;
}
// If c did not combine, then check if it is a starter.
if(cc==0) {
// Found a new starter.
if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
// It may combine with something, prepare for it.
if(c<=0xffff) {
starterIsSupplementary=false;
starter=p-1;
} else {
starterIsSupplementary=true;
starter=p-2;
}
}
} else if(onlyContiguous) {
// FCC: no discontiguous compositions; any intervening character blocks.
compositionsList=-1;
}
}
buffer.flush();
}
public int composePair(int a, int b) {
int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
int list;
if(isInert(norm16)) {
return -1;
} else if(norm16minYesNo) { // composite 'a' has both mapping & compositions list
list+= // mapping pointer
1+ // +1 to skip the first unit with the mapping length
(extraData.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length
}
// Turn the offset-into-extraData into an offset-into-maybeYesCompositions.
list+=MIN_NORMAL_MAYBE_YES-minMaybeYes;
}
} else if(norm16>1;
}
/**
 * Does c have a composition boundary before it?
 * True if its decomposition begins with a character that has
 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
 * (isCompYesAndZeroCC()) so we need not decompose.
 * Algorithmic mappings are followed until one of the other cases applies.
 */
private boolean hasCompBoundaryBefore(int c, int norm16) {
    int cp=c;
    int data=norm16;
    while(true) {
        if(isCompYesAndZeroCC(data)) {
            return true;
        }
        if(isMaybeOrNonZeroCC(data)) {
            return false;
        }
        if(isDecompNoAlgorithmic(data)) {
            cp=mapAlgorithmic(cp, data);
            data=getNorm16(cp);
            continue;
        }
        // c decomposes, get everything from the variable-length extra data
        final int firstUnit=extraData.charAt(data);
        if((firstUnit&MAPPING_LENGTH_MASK)==0) {
            return false;  // empty mapping
        }
        final boolean hasCccLcccWord=(firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0;
        if(hasCccLcccWord && (extraData.charAt(data-1)&0xff00)!=0) {
            return false;  // non-zero leadCC
        }
        // Decide by the first code point of the mapping.
        final int firstMapped=Character.codePointAt(extraData, data+1);
        return isCompYesAndZeroCC(getNorm16(firstMapped));
    }
}
/**
 * Scans backward from index p and returns the start index of the nearest
 * preceding code point that has a composition boundary before it,
 * or 0 if the scan reaches the start of s.
 */
private int findPreviousCompBoundary(CharSequence s, int p) {
    int pos=p;
    while(pos>0) {
        final int c=Character.codePointBefore(s, pos);
        pos-=Character.charCount(c);
        if(hasCompBoundaryBefore(c)) {
            return pos;
        }
        // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
        // but that's probably not worth the extra cost.
    }
    return pos;
}
private int findNextCompBoundary(CharSequence s, int p, int limit) {
while(p0) {
int c=Character.codePointBefore(s, p);
p-=Character.charCount(c);
if(c canonStartSets;
// bits in canonIterData
// Sign bit; isCanonSegmentStarter() returns true only when it is clear (value>=0).
private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
// If set, getCanonStartSet() also adds composites: those from the character's
// compositions list, or a range of Hangul syllables when norm16==JAMO_L.
private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
// If set, the value bits are an index into canonStartSets;
// otherwise a non-zero value is added to the set as a single code point.
private static final int CANON_HAS_SET = 0x200000;
// Mask for the value bits (code point or canonStartSets index).
private static final int CANON_VALUE_MASK = 0x1fffff;
}