All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.daisy.dotify.common.text.SimpleUCharReplacer Maven / Gradle / Ivy

There is a newer version: 1.0.7
Show newest version
package org.daisy.dotify.common.text;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.text.Normalizer;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Properties;
import java.util.Set;
import java.util.logging.Logger;

/**
 * <p>Provides substitution for unicode characters with replacement strings.</p>
 *
 * <p>This is a much simplified version of UCharReplacer by Markus Gylling from the
 * org.daisy.util package.</p>
 *
 * <p>The use of this class <strong>may</strong> result in a change in unicode character
 * composition between input and output. If you need a certain normalization
 * form, normalize after the use of this class.</p>
 *
 * <p>Usage example:</p>
 * <pre>
 * SimpleUCharReplacer ucr = new SimpleUCharReplacer();
 * ucr.addSubstitutionTable(fileURL);
 * ucr.addSubstitutionTable(fileURL2);
 * String ret = ucr.replace(input).toString();
 * </pre>
 *
 * <p>The translation table file is using the same xml format as that of
 * java.util.Properties [1][2], using the HEX representation (without the
 * characteristic 0x-prefix!) of a unicode character as the <code>key</code>
 * attribute and the replacement string as value of the <code>entry</code> element.</p>
 *
 * <p>If the <code>key</code> attribute contains exactly one unicode codepoint (one
 * character) it will be treated literally. It will not be interpreted as a HEX
 * representation of another character, even if theoretically possible. E.g. if
 * the key is "a", it will be treated as 0x0061 rather than as 0x000a.</p>
 *
 * <p>Note - there is a significant difference between a unicode codepoint (32 bit
 * int) and a UTF16 codeunit (=char) - a codepoint consists of one or two
 * codeunits.</p>
 *
 * <p>To make sure an int represents a codepoint and not a codeunit, use for
 * example <code>com.ibm.icu.text.Normalizer</code> to NFC compose, followed by
 * <code>com.ibm.icu.text.UCharacterIterator</code> to retrieve possibly non-BMP
 * codepoints from a string.</p>
 *
 * <ul>
 * <li>see [1] http://java.sun.com/j2se/1.5.0/docs/api/java/util/Properties.html</li>
 * <li>see [2] http://java.sun.com/dtd/properties.dtd</li>
 * </ul>
 *
 * @author Joel Håkansson
 * @author Markus Gylling (UCharReplacer)
 */
public class SimpleUCharReplacer extends HashMap<Integer, String> {
    private static final Logger logger = Logger.getLogger(SimpleUCharReplacer.class.getCanonicalName());

    private static final long serialVersionUID = -3238811228931823883L;

    /**
     * Creates a new instance with no substitutions.
     */
    public SimpleUCharReplacer() {
        super();
    }

    /**
     * Adds a substitution table to this instance. See the class description for
     * the format. Entries are added to this map, so a later table overrides
     * code points already defined by an earlier one.
     *
     * @param table the url to the substitution table.
     * @throws IOException if the table could not be added; any failure while
     *         loading is wrapped and available as the cause
     */
    public void addSubstitutionTable(URL table) throws IOException {
        try {
            loadTable(table);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    /**
     * Replaces characters in the input according to this object's current configuration.
     *
     * <p>The input is first composed to NFC, then processed one code point at a
     * time. A code point that maps to a non-empty replacement string is replaced
     * by it; all other code points (including those mapped to an empty string)
     * are copied unchanged.</p>
     *
     * @param input the input, must not be null
     * @return returns a modified string
     */
    public CharSequence replace(String input) {
        // Strings are immutable, so the normalized value must be captured;
        // otherwise decomposed sequences never match the composed keys in the table.
        String normalized = Normalizer.normalize(input, Normalizer.Form.NFC);
        StringBuilder sb = new StringBuilder(normalized.length());

        // iterate over each code point (not each char) so that characters
        // outside the BMP are looked up as a single key
        for (int offset = 0; offset < normalized.length(); ) {
            int codePoint = normalized.codePointAt(offset);
            String substitution = get(codePoint);
            if (null != substitution && substitution.length() > 0) {
                // a replacement occurred
                sb.append(substitution);
            } else {
                // a replacement didn't occur
                sb.appendCodePoint(codePoint);
            }
            offset += Character.charCount(codePoint);
        }
        return sb;
    }

    /**
     * Loads a table using the Properties class and adds its entries to this map.
     *
     * <p>A key consisting of exactly one code point is used literally; any other
     * key is parsed as a hexadecimal code point. Keys that are not valid hex
     * numbers are logged and skipped.</p>
     *
     * @param tableURL the location of the table, in java.util.Properties xml format
     * @throws IOException if the table cannot be read or parsed
     */
    private void loadTable(URL tableURL) throws IOException {
        Properties props = new Properties();
        InputStream in = tableURL.openStream();
        try {
            props.loadFromXML(in);
        } finally {
            // loadFromXML is documented to close the stream when it returns;
            // closing here as well makes sure it is released if parsing fails.
            in.close();
        }
        for (Object rawKey : props.keySet()) {
            String key = (String) rawKey;
            String value = props.getProperty(key);
            if (key.codePointCount(0, key.length()) == 1) {
                put(key.codePointAt(0), value);
            } else {
                try {
                    put(Integer.decode("0x" + key), value);
                } catch (NumberFormatException e) {
                    logger.severe(
                        "error in translation table "
                        + tableURL.toString() + ": attribute key=\"" + key + "\" is not a hex number."
                    );
                }
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy