// net.sf.okapi.common.DecodeUtil (Maven / Gradle / Ivy)
package net.sf.okapi.common;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class DecodeUtil {
private static final Pattern NCR = Pattern.compile("(\\S+?);");
private static final Pattern CER = Pattern.compile("(&\\w*?;)");
private static final HTMLCharacterEntities entities = new HTMLCharacterEntities();
static {
entities.ensureInitialization(false);
}
/**
* Converts an HTML plain text into a plain text one.
* This method assumes there is no elements in the text, just escaped characters.
* The methods supported NCRs as well as HTML CERs.
* @param text the HTML string to convert.
* @return the un-escaped text.
*/
public static String fromPlainTextHTML (String text) {
if ( Util.isEmpty(text) ) {
return "";
}
text = text.replace("'", "'");
text = text.replace("<", "<");
text = text.replace(">", ">");
text = text.replace(""", "\"");
StringBuilder sb = new StringBuilder();
sb.append(text.replace("&", "&"));
// Un-escape character entity references
Matcher m;
while ( true ) {
m = CER.matcher(sb.toString());
if ( !m.find() ) break;
int val = entities.lookupReference(m.group(0));
if ( val != -1 ) {
sb.replace(m.start(0), m.end(0), String.valueOf((char) val));
}
else { // Unknown entity
// TODO: replace by something meaningful to allow continuing the replacements
break; // Temporary, to avoid infinite loop
}
}
// Un-escape numeric character references
m = NCR.matcher(sb.toString());
while ( m.find() ) {
String val = m.group(1);
int n = '?'; // Default
try {
if ( val.charAt(0) == 'x' ) { // Hexadecimal
n = Integer.valueOf(m.group(1).substring(1), 16);
}
else { // Decimal
n = Integer.parseInt(m.group(1));
}
}
catch (NumberFormatException e) {
// Just use default
}
sb.replace(m.start(0), m.end(0), String.valueOf((char) n));
m = NCR.matcher(sb.toString());
}
return sb.toString();
}
}