
/*
 * org.clapper.util.html.HTMLUtil, from the "javautil" artifact (published for
 * Maven / Gradle / Ivy). javautil is a general-purpose Java utility library.
 * This listing is taken from the newest published version of the artifact.
 */
package org.clapper.util.html;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.clapper.util.text.TextUtil;
import org.clapper.util.text.Unicode;
import org.clapper.util.text.XStringBuffer;
import org.clapper.util.text.XStringBuilder;
/**
 * Static class containing miscellaneous HTML-related utility methods:
 * stripping tags, escaping markup characters, and converting between
 * Unicode characters and HTML character entity references.
 *
 * <p>Symbolic entity names are looked up in a resource bundle whose keys
 * have the form <code>html_<i>name</i></code> (for example, the key for
 * the entity name {@code amp} is {@code html_amp}). The bundle is loaded
 * lazily, the first time a method needs it.</p>
 */
public final class HTMLUtil
{
    /*----------------------------------------------------------------------*\
                             Private Constants
    \*----------------------------------------------------------------------*/

    /**
     * Base name of the resource bundle containing the character entity
     * code mappings.
     */
    private static final String BUNDLE_NAME = "org.clapper.util.html.HTMLUtil";

    /**
     * Prefix that turns an entity name into a resource bundle key.
     */
    private static final String ENTITY_KEY_PREFIX = "html_";

    /**
     * Matches an entity reference: an ampersand, a name (optionally
     * starting with '#') and an optional terminating semicolon. Group 1
     * is the text between the '&' and the ';'. Pattern objects are
     * immutable, so a single shared instance is sufficient.
     */
    private static final Pattern ENTITY_PATTERN =
        Pattern.compile("&(#?[^;\\s&]+);?");

    /*----------------------------------------------------------------------*\
                            Private Data Items
    \*----------------------------------------------------------------------*/

    /**
     * The entity bundle. Loaded on first use; guarded by the class monitor.
     */
    private static ResourceBundle resourceBundle = null;

    /**
     * Reverse mapping (character to entity name) derived from the bundle.
     * Built on first use; guarded by the class monitor and never modified
     * once it has been published.
     */
    private static Map<Character, String> charToEntityName = null;

    /*----------------------------------------------------------------------*\
                                Constructor
    \*----------------------------------------------------------------------*/

    private HTMLUtil()
    {
        // Can't be instantiated
    }

    /*----------------------------------------------------------------------*\
                              Public Methods
    \*----------------------------------------------------------------------*/

    /**
     * Removes all HTML element tags from a string, leaving just the
     * character data. Everything from a '&lt;' up to and including the
     * next '&gt;' is dropped; a '&gt;' that does not close an element is
     * kept. If a '&lt;' is never closed, the rest of the string is
     * dropped. This method does not touch any inline HTML character
     * entity codes. Use
     * {@link #convertCharacterEntities convertCharacterEntities()}
     * to convert HTML character entity codes.
     *
     * @param s  the string to adjust
     *
     * @return the resulting, possibly modified, string
     *
     * @see #convertCharacterEntities
     */
    public static String stripHTMLTags(String s)
    {
        StringBuilder buf = new StringBuilder(s.length());
        boolean inElement = false;

        for (char c : s.toCharArray())
        {
            if (c == '<')
            {
                inElement = true;
            }
            else if ((c == '>') && inElement)
            {
                inElement = false;
            }
            else if (! inElement)
            {
                // Ordinary character data, or a stray '>' outside a tag.
                buf.append(c);
            }
        }

        return buf.toString();
    }

    /**
     * Escape characters that are special in HTML, so that the resulting
     * string can be included in HTML (or XML). For instance, this method
     * will convert an embedded {@code "&"} to {@code "&amp;"}. The
     * characters handled are {@code &}, {@code <} and {@code >}; quote
     * characters are left alone.
     *
     * @param s  the string to convert
     *
     * @return the converted string
     */
    public static String escapeHTML(String s)
    {
        StringBuilder buf = new StringBuilder(s.length());

        for (char c : s.toCharArray())
        {
            switch (c)
            {
                case '&':
                    buf.append("&amp;");
                    break;

                case '<':
                    buf.append("&lt;");
                    break;

                case '>':
                    buf.append("&gt;");
                    break;

                default:
                    buf.append(c);
                    break;
            }
        }

        return buf.toString();
    }

    /**
     * Converts all inline HTML character entities (c.f.,
     * http://www.w3.org/TR/REC-html40/sgml/entities.html)
     * to their Unicode character counterparts, if possible. References
     * that lack the terminating semicolon, and references that cannot be
     * resolved, are copied to the result unchanged.
     *
     * @param s  the string to convert
     *
     * @return the resulting, possibly modified, string
     *
     * @see #stripHTMLTags
     * @see #makeCharacterEntities
     */
    public static String convertCharacterEntities(String s)
    {
        Matcher matcher = ENTITY_PATTERN.matcher(s);
        StringBuilder buf = new StringBuilder(s.length());
        int copiedUpTo = 0;

        // Single pass: copy the text between matches verbatim, and replace
        // (or echo) each match.
        while (matcher.find())
        {
            buf.append(s, copiedUpTo, matcher.start());

            if (s.charAt(matcher.end() - 1) == ';')
            {
                // Well-formed entity.
                buf.append(convertEntity(matcher.group(1)));
            }
            else
            {
                // Not a well-formed entity. Copy into the buffer.
                buf.append(matcher.group());
            }

            copiedUpTo = matcher.end();
        }

        buf.append(s, copiedUpTo, s.length());
        return buf.toString();
    }

    /**
     * Converts appropriate Unicode characters to their HTML character entity
     * counterparts (c.f.,
     * http://www.w3.org/TR/REC-html40/sgml/entities.html). A character
     * with a symbolic entity name in the resource bundle becomes
     * <code>&amp;<i>name</i>;</code>. Any other character that
     * {@code TextUtil.isPrintable()} rejects becomes a decimal numeric
     * reference. Everything else is copied unchanged.
     *
     * @param s  the string to convert
     *
     * @return the resulting, possibly modified, string
     *
     * @see #stripHTMLTags
     * @see #convertCharacterEntities
     */
    public static String makeCharacterEntities(String s)
    {
        Map<Character, String> entityNames = getCharToEntityNameMap();
        StringBuilder buf = new StringBuilder(s.length());

        // NOTE(review): the string is examined one UTF-16 unit at a time,
        // so a supplementary character is seen as two surrogates. Confirm
        // how TextUtil.isPrintable() treats surrogates.
        for (char c : s.toCharArray())
        {
            String entity = entityNames.get(c);

            if (entity != null)
            {
                buf.append('&').append(entity).append(';');
            }
            else if (! TextUtil.isPrintable(c))
            {
                buf.append("&#").append((int) c).append(';');
            }
            else
            {
                buf.append(c);
            }
        }

        return buf.toString();
    }

    /**
     * Convenience method to convert embedded HTML to text. This method:
     *
     * <ul>
     *   <li>Strips embedded HTML tags via a call to
     *       {@link #stripHTMLTags stripHTMLTags()}</li>
     *   <li>Uses {@link #convertCharacterEntities convertCharacterEntities()}
     *       to convert HTML entity codes to appropriate Unicode
     *       characters.</li>
     *   <li>Converts certain Unicode characters in a string to plain text
     *       sequences: typographic quotes become straight quotes, dashes
     *       and the non-breaking hyphen become hyphens, special spaces
     *       become a blank, the trademark sign becomes "[TM]", and
     *       zero-width joiners are removed.</li>
     * </ul>
     *
     * @param s  the string to parse
     *
     * @return the resulting, possibly modified, string
     *
     * @see #convertCharacterEntities
     * @see #stripHTMLTags
     */
    public static String textFromHTML(String s)
    {
        String plain = convertCharacterEntities(stripHTMLTags(s));
        StringBuilder buf = new StringBuilder(plain.length());

        for (char c : plain.toCharArray())
        {
            switch (c)
            {
                case Unicode.LEFT_SINGLE_QUOTE:
                case Unicode.RIGHT_SINGLE_QUOTE:
                    buf.append('\'');
                    break;

                case Unicode.LEFT_DOUBLE_QUOTE:
                case Unicode.RIGHT_DOUBLE_QUOTE:
                    buf.append('"');
                    break;

                case Unicode.EM_DASH:
                    buf.append("--");
                    break;

                case Unicode.EN_DASH:
                case Unicode.NON_BREAKING_HYPHEN:
                    buf.append('-');
                    break;

                case Unicode.ZERO_WIDTH_JOINER:
                case Unicode.ZERO_WIDTH_NON_JOINER:
                    // Invisible characters: drop them.
                    break;

                case Unicode.TRADEMARK:
                    buf.append("[TM]");
                    break;

                case Unicode.NBSP:
                case Unicode.THIN_SPACE:
                case Unicode.HAIR_SPACE:
                case Unicode.EM_SPACE:
                case Unicode.EN_SPACE:
                    buf.append(' ');
                    break;

                default:
                    buf.append(c);
                    break;
            }
        }

        return buf.toString();
    }

    /*----------------------------------------------------------------------*\
                              Private Methods
    \*----------------------------------------------------------------------*/

    /**
     * Convert a single entity reference to the text it stands for.
     *
     * @param s  the entity, minus the leading "&amp;" and trailing ";"
     *           characters; never empty
     *
     * @return the replacement text, or the original reference
     *         (<code>&amp;<i>s</i>;</code>) if it cannot be resolved
     */
    private static String convertEntity(String s)
    {
        if (s.charAt(0) != '#')
        {
            // Not a numeric entity. Try to find a matching symbolic
            // entity. The bundle is fetched outside the try block so that
            // a missing bundle is still reported to the caller; only a
            // missing key is treated as "unknown entity".
            ResourceBundle bundle = getResourceBundle();
            try
            {
                return bundle.getString(ENTITY_KEY_PREFIX + s);
            }
            catch (MissingResourceException ex)
            {
                return "&" + s + ";";
            }
        }

        if (s.length() == 1)
        {
            // A bare "&#;" is reduced to "#".
            return "#";
        }

        // It might be a numeric entity code. Try to parse it as a number.
        // If the parse fails, just put the whole reference in the result,
        // as is. Be sure to handle both the decimal form (e.g., &#8482;)
        // and the hexadecimal form (e.g., &#x2122; or &#X2122;).
        boolean isHex = (s.length() > 2) &&
                        ((s.charAt(1) == 'x') || (s.charAt(1) == 'X'));
        try
        {
            int cc = isHex ? Integer.parseInt(s.substring(2), 16)
                           : Integer.parseInt(s.substring(1));

            // It parsed. Is it a valid Unicode character? Work with the
            // int code point rather than a char, so that values above
            // U+FFFF are neither truncated nor mistaken for another
            // character.
            if (Character.isValidCodePoint(cc) && Character.isDefined(cc))
                return new String(Character.toChars(cc));
        }
        catch (NumberFormatException ex)
        {
            // Not a number: fall through and echo the reference.
        }

        // "s" already carries the '#' (and the 'x', if any), so only the
        // delimiters need to be put back.
        return "&" + s + ";";
    }

    /**
     * Get the character-to-entity-name map, building it from the resource
     * bundle if that hasn't already been done. Bundle entries whose value
     * is empty are skipped. If several entity names map to the same
     * character, the one enumerated last wins.
     *
     * @return the map; callers must not modify it
     */
    private static Map<Character, String> getCharToEntityNameMap()
    {
        synchronized (HTMLUtil.class)
        {
            if (charToEntityName == null)
            {
                ResourceBundle bundle = getResourceBundle();
                Map<Character, String> map = new HashMap<Character, String>();
                Enumeration<String> keys = bundle.getKeys();

                while (keys.hasMoreElements())
                {
                    String key = keys.nextElement();
                    String value = bundle.getString(key);

                    if (value.length() == 0)
                        continue;

                    // Transform the bundle key into an entity name by
                    // removing the "html_" prefix.
                    String name = key.startsWith(ENTITY_KEY_PREFIX)
                                ? key.substring(ENTITY_KEY_PREFIX.length())
                                : key;
                    map.put(value.charAt(0), name);
                }

                charToEntityName = map;
            }

            return charToEntityName;
        }
    }

    /**
     * Load the resource bundle, if it hasn't already been loaded.
     *
     * @return the entity resource bundle
     */
    private static ResourceBundle getResourceBundle()
    {
        synchronized (HTMLUtil.class)
        {
            if (resourceBundle == null)
                resourceBundle = ResourceBundle.getBundle(BUNDLE_NAME);

            return resourceBundle;
        }
    }
}