// org.daisy.dotify.common.text.SimpleUCharReplacer Maven / Gradle / Ivy
package org.daisy.dotify.common.text;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.text.Normalizer;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Properties;
import java.util.Set;
/**
*
* Provides substitution for unicode characters with replacement strings.
*
*
*
* This is a much simplified version of UCharReplacer by Markus Gylling from the
* org.daisy.util package.
*
*
*
* The use of this class may result in a change in unicode character
* composition between input and output. If you need a certain normalization
* form, normalize after the use of this class.
*
*
*
* Usage example:
*
*
* SimpleUCharReplacer ucr = new SimpleUCharReplacer();
* ucr.addSubstitutionTable(fileURL);
* ucr.addSubstitutionTable(fileURL2);
* String ret = ucr.replace(input);
*
*
*
* The translation table file uses the same XML format as that of
* java.util.Properties [1][2], using the HEX representation (without the
* characteristic 0x-prefix!) of a unicode character as the <code>key</code>
* attribute and the replacement string as the value of the <code>entry</code>
* element.
*
*
*
* If the <code>key</code> attribute contains exactly one unicode codepoint (one
* character) it will be treated literally. It will not be interpreted as a HEX
* representation of another character, even if theoretically possible. E.g. if
* the <code>key</code> is "a", it will be treated as 0x0061 rather than as 0x000a.
*
*
*
* Note - there is a significant difference between a unicode codepoint (32 bit
* int) and a UTF16 codeunit (=char) - a codepoint consists of one or two
* codeunits.
*
*
* To make sure an int represents a codepoint and not a codeunit, use for
* example <code>com.ibm.icu.text.Normalizer</code> to NFC compose, followed by
* <code>com.ibm.icu.text.UCharacterIterator</code> to retrieve possibly non-BMP
* codepoints from a string.
*
*
* - see [1] http://java.sun.com/j2se/1.5.0/docs/api/java/util/Properties.html
* - see [2] http://java.sun.com/dtd/properties.dtd
*
* @author Joel Håkansson
* @author Markus Gylling (UCharReplacer)
*/
public class SimpleUCharReplacer extends HashMap<Integer, String> {
    // Map contract used by this class: each key is a Unicode code point and
    // each value is the string that replaces that code point in replace().

    /**
     * Serialization version identifier.
     */
    private static final long serialVersionUID = -3238811228931823883L;

    /**
     * Creates a new instance with no substitutions configured.
     */
    public SimpleUCharReplacer() {
        super();
    }

    /**
     * Adds a substitution table to this instance. See the class description for
     * the format. Entries from the table are put into this map, so an entry for
     * a code point that is already present overwrites the earlier one.
     *
     * @param table the url to the substitution table.
     * @throws IOException if the table could not be added; the underlying
     *         failure is available as the cause
     */
    public void addSubstitutionTable(URL table) throws IOException {
        try {
            loadTable(table);
        } catch (Exception e) {
            // Every failure (I/O, malformed XML, unexpected runtime errors such
            // as a null URL) is reported to the caller as an IOException.
            throw new IOException(e);
        }
    }

    /**
     * Replaces characters in the input according to this object's current
     * configuration.
     *
     * <p>The input is first normalized to NFC and then processed one code
     * point at a time. A code point that has a non-empty replacement string in
     * this map is replaced by that string; every other code point (including
     * one mapped to {@code null} or to the empty string) is copied unchanged.</p>
     *
     * @param input the input
     * @return returns a modified string
     * @throws NullPointerException if {@code input} is null
     */
    public CharSequence replace(String input) {
        // Strings are immutable: the normalized text is the return value of
        // normalize(), so it has to be captured and used for the iteration.
        String normalized = Normalizer.normalize(input, Normalizer.Form.NFC);
        StringBuilder sb = new StringBuilder(normalized.length());

        // Iterate over code points rather than chars so that characters
        // outside the BMP (surrogate pairs) are looked up as a single key.
        for (int offset = 0; offset < normalized.length();) {
            int codePoint = normalized.codePointAt(offset);
            String substitution = get(codePoint);
            if (substitution != null && substitution.length() > 0) {
                // a replacement occurred
                sb.append(substitution);
            } else {
                // a replacement didn't occur
                sb.appendCodePoint(codePoint);
            }
            offset += Character.charCount(codePoint);
        }
        return sb;
    }

    /**
     * Loads a table using the Properties class and adds its entries to this map.
     *
     * <p>A key consisting of exactly one code point is used literally as the
     * code point to replace. Any other key is parsed as a hexadecimal number
     * (without the 0x prefix); keys that cannot be parsed are reported on
     * {@code System.err} and skipped.</p>
     *
     * @param tableURL the location of the table, in java.util.Properties XML format
     * @throws IOException if the table cannot be opened, read or parsed
     */
    private void loadTable(URL tableURL) throws IOException {
        Properties props = new Properties();
        InputStream in = tableURL.openStream();
        try {
            props.loadFromXML(in);
        } finally {
            // Close explicitly so the stream is also released when
            // loadFromXML throws; closing an already closed stream is harmless.
            in.close();
        }
        for (Object rawKey : props.keySet()) {
            String key = (String) rawKey;
            if (key.codePointCount(0, key.length()) == 1) {
                // A single character is taken literally, never as a hex digit.
                put(key.codePointAt(0), props.getProperty(key));
            } else {
                try {
                    put(Integer.decode("0x" + key), props.getProperty(key));
                } catch (NumberFormatException e) {
                    System.err.println("error in translation table " + tableURL.toString() + ": attribute key=\"" + key + "\" is not a hex number.");
                }
            }
        }
    }
}