All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.okapi.common.DecodeUtil Maven / Gradle / Ivy

There is a newer version: 1.47.0
Show newest version
package net.sf.okapi.common;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DecodeUtil {
	private static final Pattern NCR = Pattern.compile("&#(\\S+?);");
	private static final Pattern CER = Pattern.compile("(&\\w*?;)");
	private static final HTMLCharacterEntities entities = new HTMLCharacterEntities();
	static {
		entities.ensureInitialization(false);
	}

	/**
	 * Converts an HTML plain text into a plain text one.
	 * 

This method assumes there is no elements in the text, just escaped characters. * The methods supported NCRs as well as HTML CERs. * @param text the HTML string to convert. * @return the un-escaped text. */ public static String fromPlainTextHTML (String text) { if ( Util.isEmpty(text) ) { return ""; } text = text.replace("'", "'"); text = text.replace("<", "<"); text = text.replace(">", ">"); text = text.replace(""", "\""); StringBuilder sb = new StringBuilder(); sb.append(text.replace("&", "&")); // Un-escape character entity references Matcher m; while ( true ) { m = CER.matcher(sb.toString()); if ( !m.find() ) break; int val = entities.lookupReference(m.group(0)); if ( val != -1 ) { sb.replace(m.start(0), m.end(0), String.valueOf((char) val)); } else { // Unknown entity // TODO: replace by something meaningful to allow continuing the replacements break; // Temporary, to avoid infinite loop } } // Un-escape numeric character references m = NCR.matcher(sb.toString()); while ( m.find() ) { String val = m.group(1); int n = '?'; // Default try { if ( val.charAt(0) == 'x' ) { // Hexadecimal n = Integer.valueOf(m.group(1).substring(1), 16); } else { // Decimal n = Integer.parseInt(m.group(1)); } } catch (NumberFormatException e) { // Just use default } sb.replace(m.start(0), m.end(0), String.valueOf((char) n)); m = NCR.matcher(sb.toString()); } return sb.toString(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy