com.adobe.fontengine.CombiningSequenceResourceBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
/*
* File: CombiningSequenceResourceBuilder.java
*
* ADOBE CONFIDENTIAL
* ___________________
*
* Copyright 2004-2005 Adobe Systems Incorporated
* All Rights Reserved.
*
* NOTICE: All information contained herein is, and remains the property of
* Adobe Systems Incorporated and its suppliers, if any. The intellectual
* and technical concepts contained herein are proprietary to Adobe Systems
* Incorporated and its suppliers and may be covered by U.S. and Foreign
* Patents, patents in process, and are protected by trade secret or
* copyright law. Dissemination of this information or reproduction of this
* material is strictly forbidden unless prior written permission is obtained
* from Adobe Systems Incorporated.
*
*/
package com.adobe.fontengine;
import java.io.File;
import java.io.FileOutputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import com.adobe.agl.lang.UCharacter;
import com.adobe.agl.lang.UProperty;
import com.adobe.agl.text.CanonicalIterator;
import com.adobe.agl.text.Normalizer;
import com.adobe.agl.text.UCharacterIterator;
import com.adobe.agl.text.UTF16;
/*
* Consider a font such as MinionPro, which supports a selected subset of
* accented latin characters, does have entries in its cmap for the
* corresponding precomposed characters, but does not have entries for the
* combining marks. For an input such as <U+0065 LATIN SMALL LETTER E, U+0300
* COMBINING GRAVE ACCENT>, if we only map the characters one by one, we get a
* .notdef for the second. This is clearly not desirable; we actually want to
* look at the cmap entry for U+00E8 LATIN SMALL LETTER E WITH GRAVE as well. Of
* course, we do that only if the characters given in input cannot all be mapped
* to non-.notdef glyphs.
*
* This situation is actually more common than it may seem. On the font side,
* all OpenType fonts without OpenType layout are similar to Minion, and most
* Adobe OpenType fonts shipped to this date similarly do not support combining
* marks. On the text side, the Microsoft keyboard for Vietnamese is such that
* text normally contains combining sequences; while there are keys for the
* vowels made of a base letter and a diacritic mark, there are separate keys
* for the tone marks, resulting in combining marks in the text.
*
* What we are after is some mapping from sequences to canonically equivalent
* single coded characters. It could seem that simply using the NFC form of the
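* sequences would do the trick. However, there are precomposed characters which
* are not in NFC form. For example, the NFC form of U+2ADC FORKING is <U+2ADD
* NONFORKING, U+0338 COMBINING LONG SOLIDUS OVERLAY>. The NFC form of U+FB1F
* HEBREW LIGATURE YIDDISH YOD YOD PATAH is <U+05F2 HEBREW LIGATURE YIDDISH
* DOUBLE YOD, U+05B7 HEBREW POINT PATAH>. So even if the interface of the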
* layout engine specifies that its input is NFC, there is still a need for the
* mapping in question.
*
* Furthermore, requiring the input to be NFC is problematic, because of the CJK
* compatibility characters. For example,
* JIS X 0213 2000 contains two distinct characters, 1-41-78 and 1-14-24, which
* in the Unicode world are unified and are coded by U+4FAE. To enable
* round-tripping, Unicode also encoded U+FA30, and made it canonically
* equivalent to U+4FAE (round-tripping is guaranteed only if
* no normalization happens on the way). A number of fonts have actually
* leveraged the existence of U+FA30, and map different glyphs for U+4FAE and
* U+FA30. This is clearly not a sound approach (instead, one should use
* markup on the text and intelligent layout), but it is there. Normalizing the
* input to a layout engine, when markup is not used to carry the difference,
* does not match the user expectations.
*
* The mapping we need is relatively easy to construct: iterate over all the
* characters which have a canonical decomposition, except the CJK compatibility
* characters; for each such character, generate all the sequences which are
* canonically equivalent to it and add each resulting <sequence, character>
* pair to the mapping.
*
* So far, we discussed only combining sequences, made of multiple characters.
*
* There is also the case of single characters which are canonically equivalent
* to another character, e.g. U+03A9 GREEK CAPITAL LETTER OMEGA and U+2126 OHM
* SIGN. On the one hand, when presented with the input U+2126 and a font that
* does not map it, we could fall back to U+03A9. This would have the advantage
* of regularity: we try canonically equivalent single characters regardless of
* whether the input is a single character or a sequence. A manifestation of
* this regularity can be seen for a font that maps U+03A9 and U+038F
* (canonically equivalent to <U+03A9, U+0301>) but does not map U+2126; for the
* input <U+2126, U+0301>, the glyph mapped from U+038F is used, and for the
* input <U+2126>, the glyph mapped from U+03A9 is used. If we don't have
* fallback on inputs made of a single character, we have a difference between
* a U+2126 that appears alone in the input and a U+2126 that appears in a
* sequence. On the other hand, our goal is really to make fonts without
* OpenType layout useful, and the fallback is not necessary for that: we can
* instead require that fonts which map U+03A9 also map U+2126.
*
* Yet another case is Hangul Jamos, which in some specific sequences are
* canonically equivalent to Hangul syllables. At the very least, we need to
* treat them separately, since the mapping is much better suited to an
* algorithmic implementation, as opposed to a table implementation.
*
* [looking up a cmap vs. creating a cmap vs. probing for CSS selection]
*
*/
final public class CombiningSequenceResourceBuilder {
public final static Integer done = new Integer (UCharacterIterator.DONE);
public final static Map stateMap = new HashMap ();
final static class State {
final String base;
int resolveTo = Integer.MAX_VALUE;
SortedSet mapsFrom = new TreeSet ();
SortedMap m = new TreeMap ();
public State (String base) {
this.base = base;
stateMap.put (base, this);
}
public void insert (UCharacterIterator it, int usv) {
int next = it.nextCodePoint ();
if (next == UCharacterIterator.DONE) {
if (usv < resolveTo) {
resolveTo = usv; }
mapsFrom.add (new Integer (usv)); }
else {
String nextBase = Normalizer.decompose (base + UCharacter.toString (next), false);
State s = (State) stateMap.get (nextBase);
if (s == null) {
s = new State (nextBase);}
m.put (new Integer (next), s);
s.insert (it, usv); }
}
public void print () {
System.out.print ("------------ ");
if (base.length () == 0) {
System.out.print ("root"); }
for (int i = 0; i < base.length (); i = UTF16.moveCodePointOffset (base, i, 1)) {
System.out.print (Integer.toHexString (UTF16.charAt (base, i)) + " "); }
System.out.println (".");
if (resolveTo != Integer.MAX_VALUE) {
System.out.println (" -> " + Integer.toHexString (resolveTo)); }
if (mapsFrom.size () > 0) {
System.out.print (" <-" + (mapsFrom.size () > 1 ? "* " : " "));
for (Iterator it = mapsFrom.iterator (); it.hasNext (); ) {
Integer usv = (Integer) it.next ();
System.out.print (Integer.toHexString (usv.intValue ()) + " "); }
System.out.println (""); }
for (Iterator it = m.keySet().iterator (); it.hasNext (); ) {
Integer usv = (Integer) it.next ();
State s= (State) m.get (usv);
System.out.print (" " + Integer.toHexString (usv.intValue ()) + " -> ");
for (int i = 0; i < s.base.length (); i = UTF16.moveCodePointOffset (s.base, i, 1)) {
System.out.print (Integer.toHexString (UTF16.charAt (s.base, i)) + " "); }
System.out.println (""); }
if (m.keySet ().size () > 50) {
int[] groups = new int [0x10ff];
for (int i = 0; i < groups.length; i++) {
groups [i] = 0; }
for (Iterator it = m.keySet ().iterator (); it.hasNext (); ) {
int usv = ((Integer) it.next ()).intValue ();
groups [usv >> 8]++; }
for (int i = 0; i < groups.length; i++) {
if (groups [i] > 0) {
System.out.println (" @ " + Integer.toHexString (i) +
" " + groups [i]); }}
System.out.println (" total: " + m.keySet ().size ()); }
}
private CombiningSequence composedCharMachine;
public CombiningSequence toComposedCharsMachine () {
if (composedCharMachine == null) {
int i;
int[] mapsFrom2 = new int [mapsFrom.size()];
i = 0;
for (Iterator it = mapsFrom.iterator (); it.hasNext (); ) {
mapsFrom2 [i++] = ((Integer) it.next ()).intValue (); }
int[] usvs = new int [m.size ()];
CombiningSequence[] nextMachine = new CombiningSequence [m.size ()];
i = 0;
for (Iterator it = m.keySet ().iterator (); it.hasNext (); ) {
Integer usv = (Integer) it.next ();
State s = (State) m.get (usv);
usvs [i] = usv.intValue ();
nextMachine [i] = s.toComposedCharsMachine ();
i++; }
composedCharMachine = new CombiningSequence (this.resolveTo, mapsFrom2, usvs, nextMachine); }
return composedCharMachine;
}
}
static State root = new State ("");
public static void main (String[] args) throws Exception {
int[][] ranges = new int[][] {
{0x0000, 0xac00},
// Hangul syllables
{0xd7a4, 0xd800},
// Surrogates
{0xe000, 0xf900},
// CJK compat, IBM & JIS
{0xfa6b, 0x2f800},
// CJK compat, CNS
{0x2fa1e, 0x10ffff}};
for (int r = 0; r < ranges.length; r++) {
for (int usv = ranges [r][0]; usv < ranges [r][1]; usv++) {
int dt = UCharacter.getIntPropertyValue (usv, UProperty.DECOMPOSITION_TYPE);
if (dt == UCharacter.DecompositionType.CANONICAL) {
CanonicalIterator it = new CanonicalIterator (UCharacter.toString (usv));
for (String s = ""; (s = it.next ()) != null;) {
root.insert (UCharacterIterator.getInstance (s), usv);
if (UTF16.countCodePoint (s) == 1) {
root.insert (UCharacterIterator.getInstance (UCharacter.toString (usv)), UTF16.charAt (s, 0)); }}}}}
// for (Iterator it = stateMap.values ().iterator(); it.hasNext (); ) {
// State state = (State) it.next ();
// state.print (); }
CombiningSequence m = root.toComposedCharsMachine();
File f = new File (args[1] + File.separator + "com/adobe/fontengine/composer");
System.out.println ("--- building composer resource");
f.delete ();
OutputStream is = new FileOutputStream (f);
ObjectOutputStream ois = new ObjectOutputStream (is);
ois.writeObject (m);
ois.close ();
System.out.println (" " + f.length () + " bytes.");
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy