org.apache.fop.complexscripts.bidi.UnicodeBidiAlgorithm Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of org.apache.fop Show documentation
Show all versions of org.apache.fop Show documentation
The core maven build properties
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id$ */
package org.apache.fop.complexscripts.bidi;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.fop.traits.Direction;
import org.apache.fop.util.CharUtilities;
// CSOFF: LineLengthCheck
/**
 * The <code>UnicodeBidiAlgorithm</code> class implements functionality prescribed by
 * the Unicode Bidirectional Algorithm, Unicode Standard Annex #9.
 *
 * <p>All entry points are static; the class cannot be instantiated.</p>
 *
 * <p>This work was originally authored by Glenn Adams ([email protected]).</p>
 */
public final class UnicodeBidiAlgorithm implements BidiConstants {

    /**
     * logging instance
     */
    private static final Log log = LogFactory.getLog(UnicodeBidiAlgorithm.class);

    private UnicodeBidiAlgorithm() {
    }

    /**
     * Resolve the directionality levels of each character in a character sequence.
     * If some character is encoded in the character sequence as a Unicode Surrogate Pair,
     * then the directionality level of each of the two members of the pair will be identical.
     * @return null if bidirectional processing is not required; otherwise, returns an array
     * of integers, where each integer corresponds to exactly one UTF-16
     * encoding element present in the input character sequence, and where each integer denotes
     * the directionality level of the corresponding encoding element
     * @param cs input character sequence representing a UTF-16 encoded string
     * @param defaultLevel the default paragraph direction; {@code Direction.RL} selects
     * paragraph level one, any other value selects paragraph level zero
     * @throws IllegalArgumentException if the input sequence is not a valid UTF-16 string, e.g.,
     * if it contains an isolated UTF-16 surrogate
     */
    public static int[] resolveLevels(CharSequence cs, Direction defaultLevel) {
        int[] chars = new int[cs.length()];
        boolean rightToLeft = (defaultLevel == Direction.RL);
        if (!convertToScalar(cs, chars) && !rightToLeft) {
            return null;
        }
        int[] levels = resolveLevels(chars, rightToLeft ? 1 : 0, new int[chars.length]);
        // The slot following a converted surrogate pair holds the placeholder -1 and is
        // classified as SURROGATE, a class the implicit rules never adjust. Copy the level
        // of the pair's first slot so that both UTF-16 elements report the same level.
        for (int i = 1; i < chars.length; i++) {
            if (chars[i] < 0) {
                levels[i] = levels[i - 1];
            }
        }
        return levels;
    }

    /**
     * Resolve the directionality levels of each character in a character sequence.
     * Bidi classes are looked up per character (negative entries are classified as
     * {@code SURROGATE} placeholders) and rule L1 is not applied.
     * @return the <code>levels</code> array, filled with one resolved level per input character
     * @param chars array of input characters represented as unicode scalar values
     * @param defaultLevel the default paragraph level, which must be zero (LR) or one (RL)
     * @param levels array to receive levels, one for each character in chars array
     */
    public static int[] resolveLevels(int[] chars, int defaultLevel, int[] levels) {
        return resolveLevels(chars, getClasses(chars), defaultLevel, levels, false);
    }

    /**
     * Resolve the directionality levels of each character in a character sequence.
     * @return the <code>levels</code> array, filled with one resolved level per entry of
     * the classes array
     * @param chars array of input characters represented as unicode scalar values; used only
     * for debug output and may be null
     * @param classes array containing one bidi class per character in chars array; it is
     * not modified
     * @param defaultLevel the default paragraph level, which must be zero (LR) or one (RL)
     * @param levels array to receive levels, one for each character in chars array
     * @param useRuleL1 true if rule L1 should be used
     * @throws IllegalArgumentException if levels and classes differ in length
     */
    public static int[] resolveLevels(int[] chars, int[] classes, int defaultLevel, int[] levels, boolean useRuleL1) {
        // validate before any work is done so a short levels array fails with a clear message
        if (levels.length != classes.length) {
            throw new IllegalArgumentException("levels sequence length must match classes sequence length");
        }
        int[] ica = classes;                    /* input classes, left untouched */
        int[] wca = copySequence(ica);          /* working classes, rewritten by the rules */
        int[] ea = new int[levels.length];      /* embeddings, i.e. level plus override flag */
        resolveExplicit(wca, defaultLevel, ea);
        resolveRuns(wca, defaultLevel, ea, levelsFromEmbeddings(ea, levels));
        if (useRuleL1) {
            resolveSeparators(ica, wca, defaultLevel, levels);
        }
        // building the dump costs one string per character, so skip it unless it will be logged
        if (log.isDebugEnabled()) {
            dump("RL: CC(" + ((chars != null) ? chars.length : -1) + ")", chars, classes, defaultLevel, levels);
        }
        return levels;
    }

    /**
     * Obtain an independent copy of an integer array.
     * @param ta array to copy
     * @return a new array with the same length and content as <code>ta</code>
     */
    private static int[] copySequence(int[] ta) {
        return ta.clone();
    }

    /**
     * Apply the explicit embedding and override codes (LRE, RLE, LRO, RLO, PDF) and paragraph
     * separators found in the working class array. Explicit codes are rewritten to BN, classes
     * of characters under an override are rewritten to the override direction, and the
     * embedding (level, possibly combined with the OVERRIDE flag) of each position is recorded.
     * @param wca working class array, modified in place
     * @param defaultLevel the default paragraph level
     * @param ea array that receives one embedding per position of <code>wca</code>
     */
    private static void resolveExplicit(int[] wca, int defaultLevel, int[] ea) {
        int[] es = new int[MAX_LEVELS];         /* embeddings stack */
        int ei = 0;                             /* embeddings stack index */
        int ec = defaultLevel;                  /* current embedding level */
        for (int i = 0, n = wca.length; i < n; i++) {
            int bc = wca[i];                    /* bidi class of current char */
            int el;                             /* embedding level to assign to current char */
            switch (bc) {
            case LRE:                           // start left-to-right embedding
            case RLE:                           // start right-to-left embedding
            case LRO:                           // start left-to-right override
            case RLO: {                         // start right-to-left override
                int en;                         /* new embedding level */
                if ((bc == RLE) || (bc == RLO)) {
                    en = ((ec & ~OVERRIDE) + 1) | 1;    // next greater odd level
                } else {
                    en = ((ec & ~OVERRIDE) + 2) & ~1;   // next greater even level
                }
                if (en < (MAX_LEVELS + 1)) {
                    es[ei++] = ec;
                    if ((bc == LRO) || (bc == RLO)) {
                        ec = en | OVERRIDE;
                    } else {
                        ec = en & ~OVERRIDE;
                    }
                } else {
                    // max levels exceeded, so don't change level or override
                }
                el = ec;
                break;
            }
            case PDF:                           // pop directional formatting
                el = ec;                        // the PDF itself keeps the level it terminates
                if (ei > 0) {
                    ec = es[--ei];
                } else {
                    // ignore isolated PDF
                }
                break;
            case B:                             // paragraph separator
                el = ec = defaultLevel;
                ei = 0;
                break;
            default:
                el = ec;
                break;
            }
            switch (bc) {
            case BN:
                break;
            case LRE: case RLE: case LRO: case RLO: case PDF:
                wca[i] = BN;
                break;
            default:
                if ((el & OVERRIDE) != 0) {
                    wca[i] = directionOfLevel(el);
                }
                break;
            }
            ea[i] = el;
        }
    }

    /**
     * Map a level to a strong direction class.
     * @param level an embedding level
     * @return R if the level is odd, otherwise L
     */
    private static int directionOfLevel(int level) {
        return ((level & 1) != 0) ? R : L;
    }

    /**
     * Strip the OVERRIDE flag from an embedding.
     * @param embedding an embedding as recorded by explicit resolution
     * @return the embedding with the OVERRIDE bits cleared
     */
    private static int levelOfEmbedding(int embedding) {
        return embedding & ~OVERRIDE;
    }

    /**
     * Fill a levels array with the levels of the corresponding embeddings.
     * @param ea array of embeddings
     * @param la array to receive levels, of the same length as <code>ea</code>
     * @return the <code>la</code> array
     */
    private static int[] levelsFromEmbeddings(int[] ea, int[] la) {
        assert ea != null;
        assert la != null;
        assert la.length == ea.length;
        for (int i = 0, n = la.length; i < n; i++) {
            la[i] = levelOfEmbedding(ea[i]);
        }
        return la;
    }

    /**
     * Partition the sequence into runs and resolve each run in turn. A run extends over
     * positions whose level equals the run level, and also absorbs level runs that consist
     * entirely of retained formatting (BN) positions.
     * @param wca working class array, modified in place
     * @param defaultLevel the default paragraph level
     * @param ea array of embeddings
     * @param la array of levels, modified in place
     * @throws IllegalArgumentException if the three arrays differ in length
     */
    private static void resolveRuns(int[] wca, int defaultLevel, int[] ea, int[] la) {
        if (la.length != wca.length) {
            throw new IllegalArgumentException("levels sequence length must match classes sequence length");
        } else if (la.length != ea.length) {
            throw new IllegalArgumentException("levels sequence length must match embeddings sequence length");
        } else {
            for (int i = 0, n = ea.length, lPrev = defaultLevel; i < n; ) {
                int s = i;
                int e = s;
                int l = findNextNonRetainedFormattingLevel(wca, ea, s, lPrev);
                while (e < n) {
                    if (la[e] != l) {
                        if (startsWithRetainedFormattingRun(wca, ea, e)) {
                            e += getLevelRunLength(ea, e);
                        } else {
                            break;
                        }
                    } else {
                        e++;
                    }
                }
                lPrev = resolveRun(wca, defaultLevel, ea, la, s, e, l, lPrev);
                i = e;
            }
        }
    }

    /**
     * Skip over level runs made up entirely of retained formatting and report the level of
     * the first position after them.
     * @param wca working class array
     * @param ea array of embeddings
     * @param start index at which to start looking
     * @param lPrev level to report if the end of the sequence is reached
     * @return the level found, or <code>lPrev</code> if none remains
     */
    private static int findNextNonRetainedFormattingLevel(int[] wca, int[] ea, int start, int lPrev) {
        int s = start;
        int e = wca.length;
        while ((s < e) && startsWithRetainedFormattingRun(wca, ea, s)) {
            s += getLevelRunLength(ea, s);
        }
        return (s < e) ? levelOfEmbedding(ea[s]) : lPrev;
    }

    /**
     * Count the consecutive positions, beginning at <code>start</code>, whose level equals
     * the level at <code>start</code>.
     * @param ea array of embeddings
     * @param start first index of the level run
     * @return length of the level run
     */
    private static int getLevelRunLength(int[] ea, int start) {
        assert start < ea.length;
        int l0 = levelOfEmbedding(ea[start]);
        int s = start;
        while ((s < ea.length) && (levelOfEmbedding(ea[s]) == l0)) {
            s++;
        }
        return s - start;
    }

    /**
     * Determine whether the level run beginning at <code>start</code> consists solely of
     * BN positions.
     * @param wca working class array
     * @param ea array of embeddings
     * @param start first index of the level run
     * @return true if the BN run at <code>start</code> is at least as long as the level run
     */
    private static boolean startsWithRetainedFormattingRun(int[] wca, int[] ea, int start) {
        int nl = getLevelRunLength(ea, start);
        if (nl > 0) {
            int nc = getRetainedFormattingRunLength(wca, start);
            return (nc >= nl);
        } else {
            return false;
        }
    }

    /**
     * Count the consecutive BN positions beginning at <code>start</code>.
     * @param wca working class array
     * @param start first index to examine
     * @return number of consecutive BN positions
     */
    private static int getRetainedFormattingRunLength(int[] wca, int start) {
        assert start < wca.length;
        int s = start;
        while ((s < wca.length) && (wca[s] == BidiConstants.BN)) {
            s++;
        }
        return s - start;
    }

    /**
     * Resolve one run: determine its start-of-run and end-of-run directions, then apply the
     * weak, neutral and implicit rules to positions <code>start</code> (inclusive) through
     * <code>end</code> (exclusive).
     * @param wca working class array, modified in place
     * @param defaultLevel the default paragraph level
     * @param ea array of embeddings
     * @param la array of levels, modified in place
     * @param start first index of run
     * @param end index following last index of run
     * @param level level of run
     * @param levelPrev level of the preceding run
     * @return <code>levelPrev</code> if the run is all retained formatting, otherwise
     * <code>level</code>
     */
    private static int resolveRun(int[] wca, int defaultLevel, int[] ea, int[] la, int start, int end, int level, int levelPrev) {
        // determine start of run direction
        int sor = directionOfLevel(max(levelPrev, level));
        // determine end of run direction from the first non-BN position following the run,
        // falling back to the paragraph level if there is none
        int le = -1;
        for (int i = end; i < wca.length; i++) {
            if (wca[i] != BidiConstants.BN) {
                le = max(level, la[i]);
                break;
            }
        }
        if (le < 0) {
            le = max(level, defaultLevel);
        }
        int eor = directionOfLevel(le);
        if (log.isDebugEnabled()) {
            log.debug("BR[" + padLeft(start, 3) + "," + padLeft(end, 3) + "] :" + padLeft(level, 2) + ": SOR(" + getClassName(sor) + "), EOR(" + getClassName(eor) + ")");
        }
        resolveWeak(wca, defaultLevel, ea, la, start, end, level, sor, eor);
        resolveNeutrals(wca, defaultLevel, ea, la, start, end, level, sor, eor);
        resolveImplicit(wca, defaultLevel, ea, la, start, end, level, sor, eor);
        // if this run is all retained formatting, then return prior level, otherwise this run's level
        return isRetainedFormatting(wca, start, end) ? levelPrev : level;
    }

    /**
     * Apply the weak type rules W1 through W7 to one run, skipping over BN positions.
     * @param wca working class array, modified in place
     * @param defaultLevel the default paragraph level (unused)
     * @param ea array of embeddings (unused)
     * @param la array of levels (unused)
     * @param start first index of run
     * @param end index following last index of run
     * @param level level of run (unused)
     * @param sor start-of-run direction class
     * @param eor end-of-run direction class
     */
    private static void resolveWeak(int[] wca, int defaultLevel, int[] ea, int[] la, int start, int end, int level, int sor, int eor) {
        // W1 - X BN* NSM -> X BN* X
        for (int i = start, n = end, bcPrev = sor; i < n; i++) {
            int bc = wca[i];
            if (bc == NSM) {
                wca[i] = bcPrev;
            } else if (bc != BN) {
                bcPrev = bc;
            }
        }
        // W2 - AL ... EN -> AL ... AN
        for (int i = start, n = end, bcPrev = sor; i < n; i++) {
            int bc = wca[i];
            if (bc == EN) {
                if (bcPrev == AL) {
                    wca[i] = AN;
                }
            } else if (isStrong(bc)) {
                bcPrev = bc;
            }
        }
        // W3 - AL -> R
        for (int i = start, n = end; i < n; i++) {
            if (wca[i] == AL) {
                wca[i] = R;
            }
        }
        // W4 - EN BN* ES BN* EN -> EN BN* EN BN* EN; XN BN* CS BN* XN -> XN BN* XN BN* XN
        for (int i = start, n = end, bcPrev = sor; i < n; i++) {
            int bc = wca[i];
            if ((bc == ES) || (bc == CS)) {
                // look ahead to the next non-BN class, defaulting to end-of-run
                int bcNext = eor;
                for (int j = i + 1; j < n; j++) {
                    int bcAhead = wca[j];
                    if (bcAhead != BN) {
                        bcNext = bcAhead;
                        break;
                    }
                }
                if ((bcPrev == EN) && (bcNext == EN)) {
                    wca[i] = EN;
                } else if ((bc == CS) && (bcPrev == AN) && (bcNext == AN)) {
                    wca[i] = AN;
                }
                bcPrev = wca[i];
            } else if (bc != BN) {
                bcPrev = bc;
            }
        }
        // W5 - EN (ET|BN)* -> EN (EN|BN)*; (ET|BN)* EN -> (EN|BN)* EN
        for (int i = start, n = end, bcPrev = sor; i < n; i++) {
            int bc = wca[i];
            if (bc == ET) {
                // look ahead past any further ET or BN, defaulting to end-of-run
                int bcNext = eor;
                for (int j = i + 1; j < n; j++) {
                    int bcAhead = wca[j];
                    if ((bcAhead != BN) && (bcAhead != ET)) {
                        bcNext = bcAhead;
                        break;
                    }
                }
                if ((bcPrev == EN) || (bcNext == EN)) {
                    wca[i] = EN;
                }
            } else if (bc != BN) {
                bcPrev = bc;
            }
        }
        // W6 - BN* (ET|ES|CS) BN* -> ON* ON ON*
        for (int i = start, n = end; i < n; i++) {
            int bc = wca[i];
            if ((bc == ET) || (bc == ES) || (bc == CS)) {
                wca[i] = ON;
                resolveAdjacentBoundaryNeutrals(wca, start, end, i, ON);
            }
        }
        // W7 - L ... EN -> L ... L
        for (int i = start, n = end, bcPrev = sor; i < n; i++) {
            int bc = wca[i];
            if (bc == EN) {
                if (bcPrev == L) {
                    wca[i] = L;
                }
            } else if ((bc == L) || (bc == R)) {
                bcPrev = bc;
            }
        }
    }

    /**
     * Apply the neutral type rules N1 and N2 to one run.
     * @param wca working class array, modified in place
     * @param defaultLevel the default paragraph level (unused)
     * @param ea array of embeddings, consulted by N2
     * @param la array of levels (unused)
     * @param start first index of run
     * @param end index following last index of run
     * @param level level of run (unused)
     * @param sor start-of-run direction class
     * @param eor end-of-run direction class
     */
    private static void resolveNeutrals(int[] wca, int defaultLevel, int[] ea, int[] la, int start, int end, int level, int sor, int eor) {
        // N1 - (L|R) N+ (L|R) -> L L+ L | R R+ R; (AN|EN) N+ R -> (AN|EN) R+ R; R N+ (AN|EN) -> R R+ (AN|EN)
        for (int i = start, n = end, bcPrev = sor; i < n; i++) {
            int bc = wca[i];
            if (isNeutral(bc)) {
                // find the direction following the neutral sequence; numbers count as R
                int bcNext = eor;
                for (int j = i + 1; j < n; j++) {
                    int bcAhead = wca[j];
                    if ((bcAhead == L) || (bcAhead == R)) {
                        bcNext = bcAhead;
                        break;
                    } else if ((bcAhead == AN) || (bcAhead == EN)) {
                        bcNext = R;
                        break;
                    } else if (!isNeutral(bcAhead) && !isRetainedFormatting(bcAhead)) {
                        break;
                    }
                }
                if (bcPrev == bcNext) {
                    wca[i] = bcPrev;
                    resolveAdjacentBoundaryNeutrals(wca, start, end, i, bcPrev);
                }
            } else if ((bc == L) || (bc == R)) {
                bcPrev = bc;
            } else if ((bc == AN) || (bc == EN)) {
                bcPrev = R;
            }
        }
        // N2 - N -> embedding level
        for (int i = start, n = end; i < n; i++) {
            int bc = wca[i];
            if (isNeutral(bc)) {
                int bcEmbedding = directionOfLevel(levelOfEmbedding(ea[i]));
                wca[i] = bcEmbedding;
                resolveAdjacentBoundaryNeutrals(wca, start, end, i, bcEmbedding);
            }
        }
    }

    /**
     * Rewrite the BN positions immediately adjacent to <code>index</code>, on both sides and
     * within the run, to a new class.
     * @param wca working class array, modified in place
     * @param start first index of run
     * @param end index following last index of run
     * @param index position whose BN neighbors are to be rewritten
     * @param bcNew class to assign to the adjacent BN positions
     * @throws IllegalArgumentException if <code>index</code> lies outside the run
     */
    private static void resolveAdjacentBoundaryNeutrals(int[] wca, int start, int end, int index, int bcNew) {
        if ((index < start) || (index >= end)) {
            throw new IllegalArgumentException();
        }
        for (int i = index - 1; (i >= start) && (wca[i] == BN); i--) {
            wca[i] = bcNew;
        }
        for (int i = index + 1; (i < end) && (wca[i] == BN); i++) {
            wca[i] = bcNew;
        }
    }

    /**
     * Apply the implicit level rules to one run, raising the level of each position according
     * to its resolved class and the parity of its current level.
     * @param wca working class array
     * @param defaultLevel the default paragraph level (unused)
     * @param ea array of embeddings (unused)
     * @param la array of levels, modified in place
     * @param start first index of run
     * @param end index following last index of run
     * @param level level of run (unused)
     * @param sor start-of-run direction class (unused)
     * @param eor end-of-run direction class (unused)
     */
    private static void resolveImplicit(int[] wca, int defaultLevel, int[] ea, int[] la, int start, int end, int level, int sor, int eor) {
        for (int i = start, n = end; i < n; i++) {
            int bc = wca[i];                    // bidi class
            int el = la[i];                     // embedding level
            int ed = 0;                         // embedding level delta
            if ((el & 1) == 0) {                // even
                if (bc == R) {
                    ed = 1;
                } else if ((bc == AN) || (bc == EN)) {
                    ed = 2;
                }
            } else {                            // odd
                if ((bc == L) || (bc == EN) || (bc == AN)) {
                    ed = 1;
                }
            }
            la[i] = el + ed;
        }
    }

    /**
     * Resolve separators and boundary neutral levels to account for UAX#9 3.4 L1 while taking into
     * account retention of formatting codes (5.2).
     * @param ica original input class array (sequence)
     * @param wca working copy of original input class array (sequence), as modified by prior steps
     * @param dl default paragraph level
     * @param la array of output levels to be adjusted, as produced by bidi algorithm
     */
    private static void resolveSeparators(int[] ica, int[] wca, int dl, int[] la) {
        // steps (1) through (3)
        for (int i = 0, n = ica.length; i < n; i++) {
            int ic = ica[i];
            if ((ic == BidiConstants.S) || (ic == BidiConstants.B)) {
                la[i] = dl;
                // reset whitespace preceding the separator, looking through retained formatting
                for (int k = i - 1; k >= 0; k--) {
                    int pc = ica[k];
                    if (isRetainedFormatting(pc)) {
                        continue;
                    } else if (pc == BidiConstants.WS) {
                        la[k] = dl;
                    } else {
                        break;
                    }
                }
            }
        }
        // step (4) - consider end of input sequence to be end of line, but skip any trailing boundary neutrals and retained formatting codes
        for (int k = ica.length - 1; k >= 0; k--) {
            int ic = ica[k];
            if (isRetainedFormatting(ic)) {
                continue;
            } else if (ic == BidiConstants.WS) {
                la[k] = dl;
            } else {
                break;
            }
        }
        // step (5) - per section 5.2
        for (int i = 0, n = ica.length; i < n; i++) {
            if (isRetainedFormatting(ica[i])) {
                la[i] = (i == 0) ? dl : la[i - 1];
            }
        }
    }

    /**
     * Determine if a bidi class is a strong class.
     * @param bc a bidi class
     * @return true if the class is L, R or AL
     */
    private static boolean isStrong(int bc) {
        switch (bc) {
        case L:
        case R:
        case AL:
            return true;
        default:
            return false;
        }
    }

    /**
     * Determine if a bidi class is a neutral class.
     * @param bc a bidi class
     * @return true if the class is WS, ON, S or B
     */
    private static boolean isNeutral(int bc) {
        switch (bc) {
        case WS:
        case ON:
        case S:
        case B:
            return true;
        default:
            return false;
        }
    }

    /**
     * Determine if a bidi class denotes a retained formatting code.
     * @param bc a bidi class
     * @return true if the class is LRE, LRO, RLE, RLO, PDF or BN
     */
    private static boolean isRetainedFormatting(int bc) {
        switch (bc) {
        case LRE:
        case LRO:
        case RLE:
        case RLO:
        case PDF:
        case BN:
            return true;
        default:
            return false;
        }
    }

    /**
     * Determine if every class in a sub-sequence denotes a retained formatting code.
     * @param ca class array
     * @param s first index of sub-sequence
     * @param e index following last index of sub-sequence
     * @return true if all classes in [s,e) are retained formatting (vacuously true if empty)
     */
    private static boolean isRetainedFormatting(int[] ca, int s, int e) {
        for (int i = s; i < e; i++) {
            if (!isRetainedFormatting(ca[i])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Obtain the greater of two integers.
     * @param x first value
     * @param y second value
     * @return the greater of x and y
     */
    private static int max(int x, int y) {
        return (x > y) ? x : y;
    }

    /**
     * Obtain the bidi class of each character in an array of scalar values. Negative entries
     * are classified as SURROGATE.
     * @param chars array of unicode scalar values
     * @return a new array holding one bidi class per entry of <code>chars</code>
     */
    private static int[] getClasses(int[] chars) {
        int[] classes = new int[chars.length];
        for (int i = 0, n = chars.length; i < n; i++) {
            int ch = chars[i];
            classes[i] = (ch >= 0) ? BidiClass.getBidiClass(ch) : SURROGATE;
        }
        return classes;
    }

    /**
     * Convert character sequence (a UTF-16 encoded string) to an array of unicode scalar values
     * expressed as integers. If a valid UTF-16 surrogate pair is encountered, it is converted to
     * two integers, the first being the equivalent unicode scalar value, and the second being
     * negative one (-1). This special mechanism is used to track the use of surrogate pairs while
     * working with unicode scalar values, and permits maintaining indices that apply both to the
     * input UTF-16 and output scalar value sequences.
     * @return a boolean indicating that content is present that triggers bidirectional processing
     * @param cs a UTF-16 encoded character sequence
     * @param chars an integer array to accept the converted scalar values, where the length of the
     * array must be the same as the length of the input character sequence
     * @throws IllegalArgumentException if the input sequence is not a valid UTF-16 string, e.g.,
     * if it contains an isolated UTF-16 surrogate
     */
    private static boolean convertToScalar(CharSequence cs, int[] chars) throws IllegalArgumentException {
        boolean triggered = false;
        if (chars.length != cs.length()) {
            throw new IllegalArgumentException("characters array length must match input sequence length");
        }
        for (int i = 0, n = chars.length; i < n; ) {
            int chIn = cs.charAt(i);
            int chOut;
            if (chIn < 0xD800) {
                chOut = chIn;
            } else if (chIn < 0xDC00) {
                // high surrogate, which must be followed by a low surrogate
                int chHi = chIn;
                int chLo;
                if ((i + 1) < n) {
                    chLo = cs.charAt(i + 1);
                    if ((chLo >= 0xDC00) && (chLo <= 0xDFFF)) {
                        chOut = convertToScalar(chHi, chLo);
                    } else {
                        throw new IllegalArgumentException("isolated high surrogate");
                    }
                } else {
                    throw new IllegalArgumentException("truncated surrogate pair");
                }
            } else if (chIn < 0xE000) {
                throw new IllegalArgumentException("isolated low surrogate");
            } else {
                chOut = chIn;
            }
            if (!triggered && triggersBidi(chOut)) {
                triggered = true;
            }
            chars[i++] = chOut;
            if ((chOut & 0xFF0000) != 0) {
                // scalar came from a surrogate pair, so mark the second slot as a placeholder
                chars[i++] = -1;
            }
        }
        return triggered;
    }

    /**
     * Convert UTF-16 surrogate pair to unicode scalar value.
     * @return a unicode scalar value
     * @param chHi high (most significant or first) surrogate
     * @param chLo low (least significant or second) surrogate
     * @throws IllegalArgumentException if one of the input surrogates is not valid
     */
    private static int convertToScalar(int chHi, int chLo) {
        if ((chHi < 0xD800) || (chHi > 0xDBFF)) {
            throw new IllegalArgumentException("bad high surrogate");
        } else if ((chLo < 0xDC00) || (chLo > 0xDFFF)) {
            throw new IllegalArgumentException("bad low surrogate");
        } else {
            return (((chHi & 0x03FF) << 10) | (chLo & 0x03FF)) + 0x10000;
        }
    }

    /**
     * Determine if character CH triggers bidirectional processing. Bidirectional
     * processing is deemed triggerable if CH is a strong right-to-left character,
     * an arabic letter or number, or is a right-to-left embedding or override
     * character.
     * @return true if character triggers bidirectional processing
     * @param ch a unicode scalar value
     */
    private static boolean triggersBidi(int ch) {
        switch (BidiClass.getBidiClass(ch)) {
        case R:
        case AL:
        case AN:
        case RLE:
        case RLO:
            return true;
        default:
            return false;
        }
    }

    /**
     * Write the resolved class and level of each position to the debug log, one line per
     * position. Does nothing unless debug logging is enabled.
     * @param header line to log first
     * @param chars array of unicode scalar values, or null if only classes are available
     * @param classes array of bidi classes
     * @param defaultLevel the default paragraph level
     * @param levels array of resolved levels
     */
    private static void dump(String header, int[] chars, int[] classes, int defaultLevel, int[] levels) {
        if (!log.isDebugEnabled()) {
            return;
        }
        log.debug(header);
        log.debug("BD: default level(" + defaultLevel + ")");
        StringBuilder sb = new StringBuilder();
        int n = (chars != null) ? chars.length : classes.length;
        for (int i = 0; i < n; i++) {
            sb.setLength(0);
            if (chars != null) {
                int ch = chars[i];
                if ((ch > 0x20) && (ch < 0x7F)) {
                    sb.append((char) ch);
                } else {
                    sb.append(CharUtilities.charToNCRef(ch));
                }
            }
            // pad the character column to a fixed width
            for (int k = sb.length(); k < 12; k++) {
                sb.append(' ');
            }
            sb.append(": ").append(padRight(getClassName(classes[i]), 4)).append(' ').append(levels[i]);
            // log an immutable snapshot, since the builder is reused on the next iteration
            log.debug(sb.toString());
        }
    }

    /**
     * Obtain a short printable name for a bidi class.
     * @param bc a bidi class
     * @return the name of the class, or "?" if the class is not recognized
     */
    private static String getClassName(int bc) {
        switch (bc) {
        case L:                                 // left-to-right
            return "L";
        case LRE:                               // left-to-right embedding
            return "LRE";
        case LRO:                               // left-to-right override
            return "LRO";
        case R:                                 // right-to-left
            return "R";
        case AL:                                // right-to-left arabic
            return "AL";
        case RLE:                               // right-to-left embedding
            return "RLE";
        case RLO:                               // right-to-left override
            return "RLO";
        case PDF:                               // pop directional formatting
            return "PDF";
        case EN:                                // european number
            return "EN";
        case ES:                                // european number separator
            return "ES";
        case ET:                                // european number terminator
            return "ET";
        case AN:                                // arabic number
            return "AN";
        case CS:                                // common number separator
            return "CS";
        case NSM:                               // non-spacing mark
            return "NSM";
        case BN:                                // boundary neutral
            return "BN";
        case B:                                 // paragraph separator
            return "B";
        case S:                                 // segment separator
            return "S";
        case WS:                                // whitespace
            return "WS";
        case ON:                                // other neutrals
            return "ON";
        case SURROGATE:                         // placeholder for low surrogate
            return "SUR";
        default:
            return "?";
        }
    }

    /**
     * Format an integer right-aligned in a field of a minimum width.
     * @param n value to format
     * @param width minimum field width
     * @return the decimal representation of n, preceded by spaces as needed
     */
    private static String padLeft(int n, int width) {
        return padLeft(Integer.toString(n), width);
    }

    /**
     * Right-align a string in a field of a minimum width.
     * @param s string to pad
     * @param width minimum field width
     * @return s preceded by spaces as needed; s itself is never truncated
     */
    private static String padLeft(String s, int width) {
        StringBuilder sb = new StringBuilder();
        for (int i = s.length(); i < width; i++) {
            sb.append(' ');
        }
        sb.append(s);
        return sb.toString();
    }

    /* not used yet
    private static String padRight ( int n, int width ) {
        return padRight ( Integer.toString ( n ), width );
    }
    */

    /**
     * Left-align a string in a field of a minimum width.
     * @param s string to pad
     * @param width minimum field width
     * @return s followed by spaces as needed; s itself is never truncated
     */
    private static String padRight(String s, int width) {
        StringBuilder sb = new StringBuilder(s);
        for (int i = sb.length(); i < width; i++) {
            sb.append(' ');
        }
        return sb.toString();
    }

}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy