All downloads are free. Search and download functionality uses the official Maven repository.

com.ibm.icu.text.AnyTransliterator Maven / Gradle / Ivy

Go to download

International Components for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and globalization support.

The newest version!
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*****************************************************************
* Copyright (c) 2002-2014, International Business Machines Corporation
* and others.  All Rights Reserved.
*****************************************************************
* Date        Name        Description
* 06/06/2002  aliu        Creation.
*****************************************************************
*/
package com.ibm.icu.text;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.MissingResourceException;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import com.ibm.icu.lang.UScript;
/**
 * A transliterator that translates multiple input scripts to a single
 * output script.  It is named Any-T or Any-T/V, where T is the target
 * and V is the optional variant.  The target T is a script.
 *
 * 

An AnyTransliterator partitions text into runs of the same * script, together with adjacent COMMON or INHERITED characters. * After determining the script of each run, it transliterates from * that script to the given target/variant. It does so by * instantiating a transliterator from the source script to the * target/variant. If a run consists only of the target script, * COMMON, or INHERITED characters, then the run is not changed. * *

At startup, all possible AnyTransliterators are registered with * the system, as determined by examining the registered script * transliterators. * * @since ICU 2.2 * @author Alan Liu */ class AnyTransliterator extends Transliterator { //------------------------------------------------------------ // Constants static final char TARGET_SEP = '-'; static final char VARIANT_SEP = '/'; static final String ANY = "Any"; static final String NULL_ID = "Null"; static final String LATIN_PIVOT = "-Latin;Latin-"; /** * Cache mapping UScriptCode values to Transliterator*. */ private ConcurrentHashMap cache; /** * The target or target/variant string. */ private String target; /** * The target script code. Never USCRIPT_INVALID_CODE. */ private int targetScript; /** * Lazily initialize a special Transliterator for handling width characters. */ private static class WidthFix { private static final String ID = "[[:dt=Nar:][:dt=Wide:]] nfkd"; static final Transliterator INSTANCE = Transliterator.getInstance(ID); } /** * Implements {@link Transliterator#handleTransliterate}. */ @Override protected void handleTransliterate(Replaceable text, Position pos, boolean isIncremental) { int allStart = pos.start; int allLimit = pos.limit; ScriptRunIterator it = new ScriptRunIterator(text, pos.contextStart, pos.contextLimit); while (it.next()) { // Ignore runs in the ante context if (it.limit <= allStart) continue; // Try to instantiate transliterator from it.scriptCode to // our target or target/variant Transliterator t = getTransliterator(it.scriptCode); if (t == null) { // We have no transliterator. Do nothing, but keep // pos.start up to date. pos.start = it.limit; continue; } // If the run end is before the transliteration limit, do // a non-incremental transliteration. Otherwise do an // incremental one. 
boolean incremental = isIncremental && (it.limit >= allLimit); pos.start = Math.max(allStart, it.start); pos.limit = Math.min(allLimit, it.limit); int limit = pos.limit; t.filteredTransliterate(text, pos, incremental); int delta = pos.limit - limit; allLimit += delta; it.adjustLimit(delta); // We're done if we enter the post context if (it.limit >= allLimit) break; } // Restore limit. pos.start is fine where the last transliterator // left it, or at the end of the last run. pos.limit = allLimit; } /** * Private constructor * @param id the ID of the form S-T or S-T/V, where T is theTarget * and V is theVariant. Must not be empty. * @param theTarget the target name. Must not be empty, and must * name a script corresponding to theTargetScript. * @param theVariant the variant name, or the empty string if * there is no variant * @param theTargetScript the script code corresponding to * theTarget. */ private AnyTransliterator(String id, String theTarget, String theVariant, int theTargetScript) { super(id, null); targetScript = theTargetScript; cache = new ConcurrentHashMap(); target = theTarget; if (theVariant.length() > 0) { target = theTarget + VARIANT_SEP + theVariant; } } /** * @param id the ID of the form S-T or S-T/V, where T is theTarget * and V is theVariant. Must not be empty. * @param filter The Unicode filter. * @param target2 the target name. * @param targetScript2 the script code corresponding to theTarget. * @param widthFix2 Not used. This parameter is deprecated. * @param cache2 The Map object for cache. */ public AnyTransliterator(String id, UnicodeFilter filter, String target2, int targetScript2, Transliterator widthFix2, ConcurrentHashMap cache2) { super(id, filter); targetScript = targetScript2; cache = cache2; target = target2; } /** * Returns a transliterator from the given source to our target or * target/variant. Returns NULL if the source is the same as our * target script, or if the source is USCRIPT_INVALID_CODE. 
* Caches the result and returns the same transliterator the next * time. The caller does NOT own the result and must not delete * it. */ private Transliterator getTransliterator(int source) { if (source == targetScript || source == UScript.INVALID_CODE) { if (isWide(targetScript)) { return null; } else { return WidthFix.INSTANCE; } } Integer key = source; Transliterator t = cache.get(key); if (t == null) { String sourceName = UScript.getName(source); String id = sourceName + TARGET_SEP + target; try { t = Transliterator.getInstance(id, FORWARD); } catch (RuntimeException e) { } if (t == null) { // Try to pivot around Latin, our most common script id = sourceName + LATIN_PIVOT + target; try { t = Transliterator.getInstance(id, FORWARD); } catch (RuntimeException e) { } } if (t != null) { if (!isWide(targetScript)) { List v = new ArrayList(); v.add(WidthFix.INSTANCE); v.add(t); t = new CompoundTransliterator(v); } Transliterator prevCachedT = cache.putIfAbsent(key, t); if (prevCachedT != null) { t = prevCachedT; } } else if (!isWide(targetScript)) { return WidthFix.INSTANCE; } } return t; } /** * @param targetScript2 * @return */ private boolean isWide(int script) { return script == UScript.BOPOMOFO || script == UScript.HAN || script == UScript.HANGUL || script == UScript.HIRAGANA || script == UScript.KATAKANA; } /** * Registers standard transliterators with the system. Called by * Transliterator during initialization. Scan all current targets * and register those that are scripts T as Any-T/V. */ static void register() { HashMap> seen = new HashMap>(); // old code used set, but was dependent on order for (Enumeration s = Transliterator.getAvailableSources(); s.hasMoreElements(); ) { String source = s.nextElement(); // Ignore the "Any" source if (source.equalsIgnoreCase(ANY)) continue; for (Enumeration t = Transliterator.getAvailableTargets(source); t.hasMoreElements(); ) { String target = t.nextElement(); // Get the script code for the target. 
If not a script, ignore. int targetScript = scriptNameToCode(target); if (targetScript == UScript.INVALID_CODE) { continue; } Set seenVariants = seen.get(target); if (seenVariants == null) { seen.put(target, seenVariants = new HashSet()); } for (Enumeration v = Transliterator.getAvailableVariants(source, target); v.hasMoreElements(); ) { String variant = v.nextElement(); // Only process each target/variant pair once if (seenVariants.contains(variant)) { continue; } seenVariants.add(variant); String id; id = TransliteratorIDParser.STVtoID(ANY, target, variant); AnyTransliterator trans = new AnyTransliterator(id, target, variant, targetScript); Transliterator.registerInstance(trans); Transliterator.registerSpecialInverse(target, NULL_ID, false); } } } } /** * Return the script code for a given name, or * UScript.INVALID_CODE if not found. */ private static int scriptNameToCode(String name) { try{ int[] codes = UScript.getCode(name); return codes != null ? codes[0] : UScript.INVALID_CODE; }catch( MissingResourceException e){ ///CLOVER:OFF return UScript.INVALID_CODE; ///CLOVER:ON } } //------------------------------------------------------------ // ScriptRunIterator /** * Returns a series of ranges corresponding to scripts. They will be * of the form: * * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second * | | - first run (start, limit) * | | - second run (start, limit) * * That is, the runs will overlap. The reason for this is so that a * transliterator can consider common characters both before and after * the scripts. */ private static class ScriptRunIterator { private Replaceable text; private int textStart; private int textLimit; /** * The code of the current run, valid after next() returns. May * be UScript.INVALID_CODE if and only if the entire text is * COMMON/INHERITED. */ public int scriptCode; /** * The start of the run, inclusive, valid after next() returns. 
*/ public int start; /** * The end of the run, exclusive, valid after next() returns. */ public int limit; /** * Constructs a run iterator over the given text from start * (inclusive) to limit (exclusive). */ public ScriptRunIterator(Replaceable text, int start, int limit) { this.text = text; this.textStart = start; this.textLimit = limit; this.limit = start; } /** * Returns true if there are any more runs. true is always * returned at least once. Upon return, the caller should * examine scriptCode, start, and limit. */ public boolean next() { int ch; int s; scriptCode = UScript.INVALID_CODE; // don't know script yet start = limit; // Are we done? if (start == textLimit) { return false; } // Move start back to include adjacent COMMON or INHERITED // characters while (start > textStart) { ch = text.char32At(start - 1); // look back s = UScript.getScript(ch); if (s == UScript.COMMON || s == UScript.INHERITED) { --start; } else { break; } } // Move limit ahead to include COMMON, INHERITED, and characters // of the current script. while (limit < textLimit) { ch = text.char32At(limit); // look ahead s = UScript.getScript(ch); if (s != UScript.COMMON && s != UScript.INHERITED) { if (scriptCode == UScript.INVALID_CODE) { scriptCode = s; } else if (s != scriptCode) { break; } } ++limit; } // Return true even if the entire text is COMMON / INHERITED, in // which case scriptCode will be UScript.INVALID_CODE. return true; } /** * Adjusts internal indices for a change in the limit index of the * given delta. A positive delta means the limit has increased. */ public void adjustLimit(int delta) { limit += delta; textLimit += delta; } } /** * Temporary hack for registry problem. Needs to be replaced by better architecture. 
*/ public Transliterator safeClone() { UnicodeFilter filter = getFilter(); if (filter != null && filter instanceof UnicodeSet) { filter = new UnicodeSet((UnicodeSet)filter); } return new AnyTransliterator(getID(), filter, target, targetScript, null, cache); } /* (non-Javadoc) * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) */ @Override public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); // Assume that it can modify any character to any other character sourceSet.addAll(myFilter); if (myFilter.size() != 0) { targetSet.addAll(0, 0x10FFFF); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy