net.sf.okapi.common.DecodeUtil Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of okapi-lib Show documentation

There is a newer version: 1.47.0

package net.sf.okapi.common;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DecodeUtil {
	private static final Pattern NCR = Pattern.compile("&#(\\S+?);");
	private static final Pattern CER = Pattern.compile("(&\\w*?;)");
	private static final HTMLCharacterEntities entities = new HTMLCharacterEntities();
	static {
		entities.ensureInitialization(false);
	}

	/**
	 * Converts an HTML plain text into a plain text one.
	 * This method assumes there is no elements in the text, just escaped characters.
	 * The methods supported NCRs as well as HTML CERs.
	 * @param text the HTML string to convert.
	 * @return the un-escaped text.
	 */
	public static String fromPlainTextHTML (String text) {
		if ( Util.isEmpty(text) ) {
			return "";
		}
		text = text.replace("'", "'");
		text = text.replace("<", "<");
		text = text.replace(">", ">");
		text = text.replace(""", "\"");
		StringBuilder sb = new StringBuilder();
		sb.append(text.replace("&", "&"));

		// Un-escape character entity references
		Matcher m;
		while ( true ) {
			m = CER.matcher(sb.toString());
			if ( !m.find() ) break;
			int val = entities.lookupReference(m.group(0));
			if ( val != -1 ) {
				sb.replace(m.start(0), m.end(0), String.valueOf((char) val));
			}
			else { // Unknown entity
				// TODO: replace by something meaningful to allow continuing the replacements
				break; // Temporary, to avoid infinite loop
			}
		}

		// Un-escape numeric character references
		m = NCR.matcher(sb.toString());
		while ( m.find() ) {
			String val = m.group(1);
			int n = '?'; // Default
			try {
				if ( val.charAt(0) == 'x' ) { // Hexadecimal
					n = Integer.valueOf(m.group(1).substring(1), 16);
				}
				else { // Decimal
					n = Integer.parseInt(m.group(1));
				}
			}
			catch (NumberFormatException e) {
				// Just use default
			}
			sb.replace(m.start(0), m.end(0), String.valueOf((char) n));
			m = NCR.matcher(sb.toString());
		}

		return sb.toString();
	}
}