// org.opencms.util.StringBean -- source listing from the opencms-test artifact (available via Maven / Gradle / Ivy).
package org.opencms.util;
import java.io.Serializable;
import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.Translate;
import org.htmlparser.visitors.NodeVisitor;
/**
 * Node visitor that extracts the plain text content of an HTML page.
 *
 * While the nodes of a parsed page are visited, the text is collected in an
 * internal buffer with the following formatting rules:
 * <ul>
 * <li>text inside SCRIPT and STYLE tags is skipped,</li>
 * <li>text inside PRE tags is appended verbatim,</li>
 * <li>all other text is entity-decoded, non-breaking spaces are replaced by
 *     plain spaces and (optionally) whitespace is collapsed,</li>
 * <li>headings (H1 to H6) are prefixed with "* " and surrounded by line breaks,</li>
 * <li>the TITLE content is wrapped in "[ " and " ]",</li>
 * <li>optionally the link target is appended as " &lt;url&gt;" after a link.</li>
 * </ul>
 *
 * The collected text is available from {@link #getStrings()}.
 */
public class StringBean extends NodeVisitor implements Serializable {

    /** Initial capacity of the text buffer. */
    private static final int BUFFER_SIZE = 4096;

    /** The platform line separator, used for all line breaks in the output. */
    private static final String NEWLINE = System.getProperty("line.separator");

    /** The length of {@link #NEWLINE}. */
    private static final int NEWLINE_SIZE = NEWLINE.length();

    /** Collapse state: the last emitted character was whitespace (or nothing was emitted yet). */
    private static final int STATE_AFTER_WHITESPACE = 0;

    /** Collapse state: whitespace was read that has not been emitted yet. */
    private static final int STATE_IN_WHITESPACE = 1;

    /** Collapse state: the last character read was a non-whitespace character. */
    private static final int STATE_IN_WORD = 2;

    /** Serial version UID. */
    private static final long serialVersionUID = 1596190888769126925L;

    /** The buffer the text is stored in while traversing the HTML. */
    protected StringBuffer m_buffer;

    /** If <code>true</code>, sequences of whitespace characters are replaced with a single space character. */
    protected boolean m_collapse;

    /** Set to <code>true</code> while traversing a PRE tag. */
    protected boolean m_isPre;

    /** Set to <code>true</code> while traversing a SCRIPT tag. */
    protected boolean m_isScript;

    /** Set to <code>true</code> while traversing a STYLE tag. */
    protected boolean m_isStyle;

    /** If <code>true</code>, the link URLs are embedded in the text output. */
    protected boolean m_links;

    /** The cached result text, or <code>null</code> if it has not been calculated yet. */
    protected String m_strings;

    /**
     * Creates a new StringBean with default settings.
     *
     * The defaults are:
     * <ul>
     * <li><code>links</code> is <code>false</code>, so the text appears like a browser
     *     would display it, without the link targets,</li>
     * <li><code>collapse</code> is <code>true</code>, so the text appears compact
     *     like a browser would display it.</li>
     * </ul>
     */
    public StringBean() {

        super(true, true);
        m_strings = null;
        m_links = false;
        m_collapse = true;
        m_buffer = new StringBuffer(BUFFER_SIZE);
        m_isScript = false;
        m_isPre = false;
        m_isStyle = false;
    }

    /**
     * Returns the current 'collapse whitespace' state.
     *
     * If <code>true</code>, this emulates the way browsers collapse sequences of
     * white space into a single inter-word space, see the HTML specification
     * section 9.1 "White space": http://www.w3.org/TR/html4/struct/text.html#h-9.1.
     *
     * @return <code>true</code> if sequences of whitespace (space '\u0020',
     *      tab '\u0009', form feed '\u000C', zero-width space '\u200B',
     *      carriage-return '\r' and line feed '\n') are replaced with a single space
     */
    public boolean getCollapse() {

        return m_collapse;
    }

    /**
     * Returns the current 'include links' state.
     *
     * @return <code>true</code> if link targets are included in the extracted text,
     *      <code>false</code> otherwise
     */
    public boolean getLinks() {

        return m_links;
    }

    /**
     * Returns the text collected so far; this is the primary output of the bean.
     *
     * The result is cached on first access: once a non-<code>null</code> result has
     * been returned, text visited afterwards is not reflected until the bean is
     * reset with {@link #setStrings()}.
     *
     * @return the user visible text, or <code>null</code> if no text has been collected
     */
    public String getStrings() {

        if (null == m_strings) {
            if (0 == m_buffer.length()) {
                // nothing collected: reset the state, the result stays null
                setStrings();
            } else {
                updateStrings(m_buffer.toString());
            }
        }
        return m_strings;
    }

    /**
     * Sets the 'collapse whitespace' state.
     *
     * If the value changes, the text collected so far is discarded, so this
     * should be set before the page is visited.
     *
     * @param collapse if <code>true</code>, sequences of whitespace are reduced to a single space
     */
    public void setCollapse(boolean collapse) {

        if (m_collapse != collapse) {
            m_collapse = collapse;
            setStrings();
        }
    }

    /**
     * Sets the 'include links' state.
     *
     * If the value changes, the text collected so far is discarded, so this
     * should be set before the page is visited.
     *
     * @param links use <code>true</code> if link targets are to be included in the
     *      extracted text, <code>false</code> otherwise
     */
    public void setLinks(boolean links) {

        if (m_links != links) {
            m_links = links;
            setStrings();
        }
    }

    /**
     * Handles an end tag: optionally appends the link target, resets the
     * PRE / SCRIPT / STYLE flags and terminates headings and the title.
     *
     * Tag names are compared case-insensitively, so the result depends neither
     * on the case of the reported tag name nor on the default locale.
     *
     * @param tag the end tag to process
     */
    @Override
    public void visitEndTag(Tag tag) {

        Node parent = tag.getParent();
        if ((parent instanceof LinkTag) && getLinks()) {
            // append the link target as text between angle brackets
            m_buffer.append(" <");
            m_buffer.append(((LinkTag)parent).getLink());
            m_buffer.append(">");
        }

        String name = tag.getTagName();
        if ("PRE".equalsIgnoreCase(name)) {
            m_isPre = false;
        } else if ("SCRIPT".equalsIgnoreCase(name)) {
            m_isScript = false;
        } else if ("STYLE".equalsIgnoreCase(name)) {
            m_isStyle = false;
        }

        if (isHeadTag(name)) {
            // end the heading line, then force an additional line break after it
            carriageReturn();
            carriageReturn(true);
        }
        if (isTitleTag(name)) {
            m_buffer.append(" ]");
            // end the title line, then force an additional line break after it
            carriageReturn();
            carriageReturn(true);
        }
    }

    /**
     * Appends the text of a text node to the output.
     *
     * Text inside SCRIPT and STYLE tags is ignored. Text inside a PRE tag is
     * appended unchanged. All other text is entity-decoded, non-breaking spaces
     * are replaced by plain spaces, and whitespace is collapsed if
     * {@link #getCollapse()} is <code>true</code>.
     *
     * @param string the text node
     */
    @Override
    public void visitStringNode(Text string) {

        if (m_isScript || m_isStyle) {
            return;
        }
        String text = string.getText();
        if (m_isPre) {
            m_buffer.append(text);
            return;
        }
        text = Translate.decode(text);
        text = text.replace('\u00a0', ' ');
        if (getCollapse()) {
            collapse(m_buffer, text);
        } else {
            m_buffer.append(text);
        }
    }

    /**
     * Handles a start tag: sets the PRE / SCRIPT / STYLE flags, starts headings
     * and the title, and appends a line break if the tag breaks the text flow.
     *
     * Tag names are compared case-insensitively, consistent with
     * {@link #visitEndTag(Tag)}.
     *
     * @param tag the tag to examine
     */
    @Override
    public void visitTag(Tag tag) {

        String name = tag.getTagName();
        if ("PRE".equalsIgnoreCase(name)) {
            m_isPre = true;
        } else if ("SCRIPT".equalsIgnoreCase(name)) {
            m_isScript = true;
        } else if ("STYLE".equalsIgnoreCase(name)) {
            m_isStyle = true;
        }

        if (isHeadTag(name)) {
            // always start a heading with a line break, even if one is already present
            carriageReturn(true);
            m_buffer.append("* ");
        } else if (isTitleTag(name)) {
            m_buffer.append("[ ");
        } else if (tag.breaksFlow()) {
            carriageReturn();
        }
    }

    /**
     * Appends a line break to the buffer if there isn't one there already,
     * except if the buffer is empty.
     */
    protected void carriageReturn() {

        carriageReturn(false);
    }

    /**
     * Appends a line break to the buffer, except if the buffer is empty.
     *
     * @param check if <code>true</code>, the line break is appended even if the buffer
     *      already ends with one (producing an empty line); if <code>false</code>, it is
     *      only appended if the buffer does not already end with a line break
     */
    protected void carriageReturn(boolean check) {

        if (0 == m_buffer.length()) {
            // never start the output with a line break
            return;
        }
        if (check || !endsWithNewline(m_buffer)) {
            m_buffer.append(NEWLINE);
        }
    }

    /**
     * Appends the given text to the buffer, collapsing whitespace.
     *
     * A small finite state machine is used:
     * <ul>
     * <li>"after whitespace": whitespace was the last emitted character,</li>
     * <li>"in whitespace": whitespace was read but not emitted yet,</li>
     * <li>"in word": a non-whitespace character was read last.</li>
     * </ul>
     * A whitespace character moves "in word" to "in whitespace" and emits nothing.
     * A non-whitespace character is always emitted and moves to "in word"; if the
     * state was "in whitespace", a single space is emitted before it.
     *
     * The initial state is "after whitespace" if the buffer is empty or ends with
     * a space or a line break, otherwise it is "in whitespace". As a consequence, a
     * space is inserted between the existing buffer content and the new text in
     * the latter case.
     *
     * @param buffer the buffer to append to
     * @param string the string to append
     */
    protected void collapse(StringBuffer buffer, String string) {

        int chars = string.length();
        if (0 == chars) {
            return;
        }

        int length = buffer.length();
        boolean atBreak = (0 == length) || (buffer.charAt(length - 1) == ' ') || endsWithNewline(buffer);
        int state = atBreak ? STATE_AFTER_WHITESPACE : STATE_IN_WHITESPACE;

        for (int i = 0; i < chars; i++) {
            char character = string.charAt(i);
            if (isHtmlWhitespace(character)) {
                if (STATE_AFTER_WHITESPACE != state) {
                    state = STATE_IN_WHITESPACE;
                }
            } else {
                if (STATE_IN_WHITESPACE == state) {
                    buffer.append(' ');
                }
                state = STATE_IN_WORD;
                buffer.append(character);
            }
        }
    }

    /**
     * Resets the bean: discards the cached result and the text collected so far.
     */
    protected void setStrings() {

        m_strings = null;
        m_buffer = new StringBuffer(BUFFER_SIZE);
    }

    /**
     * Sets the cached result text.
     *
     * @param strings the new result text
     */
    protected void updateStrings(String strings) {

        m_strings = strings;
    }

    /**
     * Checks if the given buffer ends with the platform line separator.
     *
     * @param buffer the buffer to check
     *
     * @return <code>true</code> if the buffer ends with {@link #NEWLINE}
     */
    private static boolean endsWithNewline(StringBuffer buffer) {

        int length = buffer.length();
        // a buffer shorter than the separator cannot end with it
        return (NEWLINE_SIZE <= length) && buffer.substring(length - NEWLINE_SIZE, length).equals(NEWLINE);
    }

    /**
     * Checks if the given tag name is a heading tag (H1 to H6), ignoring case.
     *
     * @param name the tag name to check, may be <code>null</code>
     *
     * @return <code>true</code> if the name is a heading tag name
     */
    private static boolean isHeadTag(String name) {

        if ((name == null) || (name.length() != 2)) {
            return false;
        }
        char first = name.charAt(0);
        char level = name.charAt(1);
        return ((first == 'H') || (first == 'h')) && (level >= '1') && (level <= '6');
    }

    /**
     * Checks if the given character is whitespace that is subject to collapsing,
     * see HTML specification section 9.1 "White space":
     * http://www.w3.org/TR/html4/struct/text.html#h-9.1.
     *
     * @param character the character to check
     *
     * @return <code>true</code> for space, tab, form feed, zero-width space,
     *      carriage return and line feed
     */
    private static boolean isHtmlWhitespace(char character) {

        switch (character) {
            case '\u0020':
            case '\u0009':
            case '\u000C':
            case '\u200B':
            case '\r':
            case '\n':
                return true;
            default:
                return false;
        }
    }

    /**
     * Checks if the given tag name is the TITLE tag, ignoring case.
     *
     * @param name the tag name to check, may be <code>null</code>
     *
     * @return <code>true</code> if the name is the TITLE tag name
     */
    private static boolean isTitleTag(String name) {

        return "TITLE".equalsIgnoreCase(name);
    }
}