com.adobe.fontengine.CombiningSequenceResourceBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
There is a newer version: 2024.11.18751.20241128T090041Z-241100
/*
 * File: CombiningSequenceResourceBuilder.java
 * 
 *	ADOBE CONFIDENTIAL
 *	___________________
 *
 *	Copyright 2004-2005 Adobe Systems Incorporated
 *	All Rights Reserved.
 *
 *	NOTICE: All information contained herein is, and remains the property of
 *	Adobe Systems Incorporated and its suppliers, if any. The intellectual
 *	and technical concepts contained herein are proprietary to Adobe Systems
 *	Incorporated and its suppliers and may be covered by U.S. and Foreign
 *	Patents, patents in process, and are protected by trade secret or
 *	copyright law. Dissemination of this information or reproduction of this
 *	material is strictly forbidden unless prior written permission is obtained
 *	from Adobe Systems Incorporated.
 *
 */
package com.adobe.fontengine;

import java.io.File;
import java.io.FileOutputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import com.adobe.agl.lang.UCharacter;
import com.adobe.agl.lang.UProperty;
import com.adobe.agl.text.CanonicalIterator;
import com.adobe.agl.text.Normalizer;
import com.adobe.agl.text.UCharacterIterator;
import com.adobe.agl.text.UTF16;

/*
 * Consider a font such as MinionPro, which supports a selected subset of
 * accented latin characters, does have entries in its cmap for the
 * corresponding precomposed characters, but does not have entries for the
 * combining marks. For an input such as <U+0065 LATIN SMALL LETTER E, U+0300
 * COMBINING GRAVE ACCENT>, if we only map the characters one by one, we get a
 * .notdef for the second. This is clearly not desirable; we actually want to
 * look at the cmap entry for U+00E8 LATIN SMALL LETTER E WITH GRAVE as well. Of
 * course, we do that only if the character given in input cannot all be mapped
 * to non-.notdef glyphs.
 * 
 * This situation is actually more common than it may seem. On the font side,
 * all OpenType fonts without OpenType layout are similar to Minion, and most
 * Adobe OpenType fonts shipped to this date similarly do not support combining
 * marks. On the text side, the Microsoft keyboard for Vietnamese is such that
 * text normally contains combining sequences; while there are keys for the
 * vowels made of a base letter and a diacritic mark, there are separate keys
 * for the tone marks, resulting in combining marks in the text.
 * 
 * What we are after is some mapping from sequences to canonically equivalent
 * single coded characters. It could seem that simply using the NFC form of the
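 * sequences would do the trick. However, there are precomposed characters which
 * are not in NFC form. For example, the NFC form of U+2ADC FORKING is <U+2ADD
 * NONFORKING, U+0338 COMBINING LONG SOLIDUS OVERLAY>. The NFC form of U+FB1F
 * HEBREW LIGATURE YIDDISH YOD YOD PATAH is <U+05F2 HEBREW LIGATURE YIDDISH
 * DOUBLE YOD, U+05B7 HEBREW POINT PATAH>. So even if the interface of the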
 * layout engine specifies that its input is NFC, there is still a need for the
 * mapping in question.
 * 
 * Furthermore, requiring the input to be NFC is problematic, because of the CJK
 * compatibility characters. For example, 
 * JIS X 0213 2000 contains two distinct characters, 1-41-78 and 1-14-24, which
 * in the Unicode world are unified and are coded by U+4FAE. To enable 
 * round-tripping, Unicode also encoded U+FA30, and made it canonically
 * equivalent to U+4FAE (round-tripping is guaranteed only if 
 * no normalization happens on the way). A number of fonts have actually 
 * leveraged the existence of U+FA30, and map different glyphs for U+4FAE and
 * U+FA30. This is clearly not a sound approach (instead, one should use
 * markup on the text and intelligent layout), but it is there. Normalizing the
 * input to a layout engine, when markup is not used to carry the difference,
 * does not match the user expectations.
 * 
 * The mapping we need is relatively easy to construct: iterate over all the
 * characters which have a canonical decomposition, except the CJK compatibility
 * characters; for each such character, generate all the sequences which are
 * canonically equivalent to it and add each resulting pair to the
 * mapping.
 * 
 * So far, we discussed only combining sequences, made of multiple characters.
 * 
 * There is also the case of single characters which are canonically equivalent
 * to another character, e.g. U+03A9 GREEK CAPITAL LETTER OMEGA and U+2126 OHM
 * SIGN. On the one hand, when presented with the input U+2126 and a font that
 * does not map it, we could fall back to U+03A9. This would have the advantage
 * of regularity: we try canonically equivalent single characters regardless of
 * whether the input is a single character or a sequence. A manifestation of
 * this regularity can be seen for a font that maps U+03A9 and U+038F
 * (canonically equivalent to <U+03A9, U+0301>) but does not map U+2126; for the
 * input <U+2126, U+0301>, the glyph mapped from U+038F is used, and for the
 * input <U+2126>, the glyph mapped from U+03A9 is used. If we don't have 
 * fallback on inputs made of a single character, we have a difference between
 * a U+2126 that appears alone in the input and a U+2126 that appears in a 
 * sequence. On the other hand, our goal is really to make fonts without
 * OpenType layout useful, and the fallback is not necessary for that: we can
 * instead require that fonts which map U+03A9 also map U+2126.
 * 
 * Yet another case is Hangul Jamos, which in some specific sequences are
 * canonically equivalent to Hangul syllables. At the very least, we need to
 * treat them separately, since the mapping is much better suited to an
 * algorithmic implementation, as opposed to a table implementation.
 * 
 * [looking up a cmap vs. creating a cmap vs. probing for CSS selection]
 *  
 */
/**
 * Build-time tool: constructs the state machine that maps sequences of code
 * points to the single characters they are canonically equivalent to, then
 * serializes the resulting {@code CombiningSequence} to the "composer"
 * resource file.
 */
final public class CombiningSequenceResourceBuilder {

  /** {@link UCharacterIterator#DONE}, boxed. */
  public final static Integer done = new Integer (UCharacterIterator.DONE);

  /**
   * Every state created so far, keyed by its base string. This must be
   * initialized before {@code root}, because the State constructor registers
   * the new state here.
   */
  public final static Map stateMap = new HashMap ();


  /**
   * One node of the machine under construction. A state is identified by its
   * base string; transitions are labelled by code points.
   */
  final static class State {
    /** Key of this state in {@code stateMap}; the empty string for the root. */
    final String base;

    /**
     * Smallest character recorded at this state by {@link #insert}, or
     * {@code Integer.MAX_VALUE} when none has been recorded.
     */
    int resolveTo = Integer.MAX_VALUE;

    /** All characters recorded at this state, as sorted {@code Integer}s. */
    SortedSet mapsFrom = new TreeSet ();

    /** Transitions: code point ({@code Integer}) to the {@code State} reached. */
    SortedMap m = new TreeMap ();

    /** Cached result of {@link #toComposedCharsMachine}; null until first built. */
    private CombiningSequence composedCharMachine;

    /**
     * Creates a state and registers it in {@code stateMap} under its base.
     *
     * @param base the string identifying this state
     */
    public State (String base) {
      this.base = base;
      stateMap.put (base, this);
    }

    /**
     * Threads the remaining code points of {@code it} through the machine,
     * starting at this state, and records {@code usv} at the state where the
     * iterator is exhausted.
     *
     * @param it  iterator over the sequence; consumed by this call
     * @param usv the character the whole sequence stands for
     */
    public void insert (UCharacterIterator it, int usv) {
      int next = it.nextCodePoint ();

      if (next == UCharacterIterator.DONE) {
        // End of the sequence: this state stands for usv. Keep the smallest
        // such character as the preferred resolution.
        if (usv < resolveTo) {
          resolveTo = usv;
        }
        mapsFrom.add (new Integer (usv));
        return;
      }

      // The target state is keyed by the decomposition of what has been read
      // so far, so different orderings that decompose identically share it.
      String nextBase = Normalizer.decompose (base + UCharacter.toString (next), false);
      State target = (State) stateMap.get (nextBase);
      if (target == null) {
        target = new State (nextBase);
      }
      m.put (new Integer (next), target);
      target.insert (it, usv);
    }

    /** Prints the code points of {@code s} in hexadecimal, each followed by a space. */
    private static void printCodePoints (String s) {
      for (int i = 0; i < s.length (); i = UTF16.moveCodePointOffset (s, i, 1)) {
        System.out.print (Integer.toHexString (UTF16.charAt (s, i)) + " ");
      }
    }

    /**
     * Dumps this state to standard output: its base, the character it resolves
     * to, the characters recorded at it, and its transitions. States with more
     * than 50 transitions also get a count of transitions per 256 code point
     * group.
     */
    public void print () {
      System.out.print ("------------ ");
      if (base.length () == 0) {
        System.out.print ("root");
      }
      printCodePoints (base);
      System.out.println (".");

      if (resolveTo != Integer.MAX_VALUE) {
        System.out.println ("    -> " + Integer.toHexString (resolveTo));
      }

      if (mapsFrom.size () > 0) {
        System.out.print ("    <-" + (mapsFrom.size () > 1 ? "* " : "  "));
        for (Iterator it = mapsFrom.iterator (); it.hasNext (); ) {
          Integer usv = (Integer) it.next ();
          System.out.print (Integer.toHexString (usv.intValue ()) + " ");
        }
        System.out.println ("");
      }

      for (Iterator it = m.entrySet ().iterator (); it.hasNext (); ) {
        Map.Entry transition = (Map.Entry) it.next ();
        Integer usv = (Integer) transition.getKey ();
        State target = (State) transition.getValue ();
        System.out.print ("    " + Integer.toHexString (usv.intValue ()) + " -> ");
        printCodePoints (target.base);
        System.out.println ("");
      }

      if (m.size () > 50) {
        // Groups are indexed by usv >> 8. The largest code point, 0x10ffff,
        // lands in group 0x10ff, so 0x1100 slots are needed. Java zero-fills
        // the array on allocation.
        int[] groups = new int [0x1100];
        for (Iterator it = m.keySet ().iterator (); it.hasNext (); ) {
          int usv = ((Integer) it.next ()).intValue ();
          groups [usv >> 8]++;
        }
        for (int i = 0; i < groups.length; i++) {
          if (groups [i] > 0) {
            System.out.println ("                     @ " + Integer.toHexString (i) +
                " " + groups [i]);
          }
        }
        System.out.println ("                    total: " + m.size ());
      }
    }

    /**
     * Converts this state, and recursively every state reachable from it, to a
     * {@code CombiningSequence}. The result is cached, so a state shared by
     * several paths is converted once and the same instance is reused.
     *
     * @return the machine rooted at this state
     */
    public CombiningSequence toComposedCharsMachine () {
      if (composedCharMachine == null) {
        int[] sources = new int [mapsFrom.size ()];
        int i = 0;
        for (Iterator it = mapsFrom.iterator (); it.hasNext (); ) {
          sources [i++] = ((Integer) it.next ()).intValue ();
        }

        // Parallel arrays: usvs [k] labels the transition to nextMachine [k].
        // m is sorted, so usvs comes out in increasing order.
        int[] usvs = new int [m.size ()];
        CombiningSequence[] nextMachine = new CombiningSequence [m.size ()];
        i = 0;
        for (Iterator it = m.entrySet ().iterator (); it.hasNext (); ) {
          Map.Entry transition = (Map.Entry) it.next ();
          usvs [i] = ((Integer) transition.getKey ()).intValue ();
          nextMachine [i] = ((State) transition.getValue ()).toComposedCharsMachine ();
          i++;
        }

        composedCharMachine = new CombiningSequence (resolveTo, sources, usvs, nextMachine);
      }

      return composedCharMachine;
    }
  }


  /** Start state of the machine; its base is the empty string. */
  static State root = new State ("");

  /**
   * Builds the machine from every character with a canonical decomposition in
   * the scanned ranges, and writes it, serialized, to
   * {@code args[1]/com/adobe/fontengine/composer}.
   *
   * @param args {@code args[1]} is the directory under which the resource is
   *             written; {@code args[0]} is not read by this class
   * @throws IllegalArgumentException if fewer than two arguments are given
   * @throws Exception if the resource cannot be written
   */
  public static void main (String[] args) throws Exception {
    // Fail before doing the expensive construction rather than after it.
    if (args.length < 2) {
      throw new IllegalArgumentException (
          "expected the output directory as the second argument");
    }

    // Half-open ranges [start, end) to scan; the comments name what is
    // skipped between consecutive ranges.
    int[][] ranges = new int[][] {
        {0x0000, 0xac00},
        // Hangul syllables
        {0xd7a4, 0xd800},
        // Surrogates
        {0xe000, 0xf900},
        // CJK compat, IBM & JIS
        {0xfa6b, 0x2f800},
        // CJK compat, CNS
        {0x2fa1e, 0x10ffff}};

    for (int r = 0; r < ranges.length; r++) {
      for (int usv = ranges [r][0]; usv < ranges [r][1]; usv++) {
        int dt = UCharacter.getIntPropertyValue (usv, UProperty.DECOMPOSITION_TYPE);
        if (dt != UCharacter.DecompositionType.CANONICAL) {
          continue;
        }

        CanonicalIterator it = new CanonicalIterator (UCharacter.toString (usv));
        for (String s = it.next (); s != null; s = it.next ()) {
          root.insert (UCharacterIterator.getInstance (s), usv);

          // An equivalent made of a single code point is also registered in
          // the other direction: usv itself leads to that code point.
          if (UTF16.countCodePoint (s) == 1) {
            root.insert (UCharacterIterator.getInstance (UCharacter.toString (usv)),
                         UTF16.charAt (s, 0));
          }
        }
      }
    }

//    for (Iterator it = stateMap.values ().iterator(); it.hasNext (); ) {
//      State state = (State) it.next ();
//      state.print (); }

    CombiningSequence m = root.toComposedCharsMachine ();

    File f = new File (args[1] + File.separator + "com/adobe/fontengine/composer");

    System.out.println ("--- building composer resource");

    f.delete ();

    OutputStream fileOut = new FileOutputStream (f);
    try {
      ObjectOutputStream objectOut = new ObjectOutputStream (fileOut);
      objectOut.writeObject (m);
      // Flushes the serialized data and closes fileOut as well.
      objectOut.close ();
    } finally {
      // Releases the file handle if serialization failed; closing an already
      // closed FileOutputStream has no effect.
      fileOut.close ();
    }

    System.out.println ("  " + f.length () + " bytes.");
  }
}