![JAR search and dependency download from the Maven repository](/logo.png)
com.google.gwt.thirdparty.streamhtmlparser.util.EntityResolver Maven / Gradle / Ivy
The newest version!
/*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.gwt.thirdparty.streamhtmlparser.util;
import com.google.gwt.thirdparty.guava.common.base.Preconditions;
import com.google.gwt.thirdparty.guava.common.collect.ImmutableMap;
import java.util.Map;
/**
 * Decodes (unescapes) HTML entities with the complication that these
 * are received one character at a time hence must be stored temporarily.
 * Also, we may receive some "junk" characters before the actual
 * entity which we will discard.
 *
 * <p>This class is designed to be 100% compatible with the corresponding
 * logic in the C-version of the
 * {@link com.google.gwt.thirdparty.streamhtmlparser.HtmlParser}, found
 * in <code>htmlparser.c</code>. There are however a few intentional
 * differences outlined below:
 * <ul>
 *   <li>We accept lower and upper-case hex NCRs, the C-version
 *       accepts only lower-case ones.
 *   <li>The output on some invalid inputs may be different. This is
 *       currently in the process of consolidation with Filipe.
 *   <li>The API is a bit different, I find this one better suited
 *       for Java. In particular, the C method <code>processChar</code>
 *       returns the output {@code String} whereas in Java, we return
 *       a status code and then provide the {@code String} in a separate
 *       method <code>getEntity</code>. It is cleaner as it avoids the
 *       need to return empty {@code String}s during incomplete processing.
 * </ul>
 *
 * <p>Valid HTML entities have one of the following three forms:
 * <ul>
 *   <li><code>&amp;#dd;</code> where dd is a number in decimal (base 10) form.
 *   <li><code>&amp;#x|Xyy;</code> where yy is a hex-number (base 16).
 *   <li><code>&amp;&lt;html-entity&gt;;</code> where
 *       <code>&lt;html-entity&gt;</code> is one of <code>lt</code>,
 *       <code>gt</code>, <code>amp</code>, <code>quot</code> or
 *       <code>apos</code>.
 * </ul>
 *
 * <p>A <code>reset</code> method is provided to facilitate object re-use.
 */
public class EntityResolver {

  /**
   * Status returned by the {@code processChar} method.
   *
   * <ul>
   *   <li>{@code NOT_STARTED} indicates we are still processing
   *       characters that come before the start of an entity.
   *       The caller may want to save the characters it provided us.
   *   <li>{@code IN_PROGRESS} indicates we are currently processing
   *       characters that are part of an entity.
   *   <li>{@code COMPLETED} indicates we have finished processing
   *       an entity. The caller can then invoke {@code getEntity}
   *       and then {@code reset} the object for future re-use.
   * </ul>
   */
  public enum Status {
    NOT_STARTED("Not Started"),
    IN_PROGRESS("In Progress"),
    COMPLETED("Completed");

    private final String message;

    private Status(String message) {
      this.message = message;
    }

    /**
     * Returns a brief description of the {@code Status} for
     * debugging purposes. The format of the returned {@code String}
     * is not fully specified nor guaranteed to remain the same.
     */
    @Override
    public String toString() {
      return message;
    }
  }

  /**
   * How many characters to store as we are processing an entity. Once we
   * reach that size, we know the entity is definitely invalid. The size
   * is higher than needed but keeping it as-is for compatibility with
   * the C-version.
   */
  private static final int MAX_ENTITY_SIZE = 10;

  /**
   * Map containing the recognized HTML entities and their decoded values.
   * Keys include the leading '&' because they are compared against the whole
   * buffer. The trailing ';' is not included in the key but it is accounted
   * for (it is the terminator that triggers the lookup).
   */
  private static final Map<String, String> HTML_ENTITIES_MAP =
      new ImmutableMap.Builder<String, String>()
          .put("&lt", "<")
          .put("&gt", ">")
          .put("&amp", "&")
          .put("&quot", "\"")
          .put("&apos", "'")
          .build();

  /** Storage for received characters until an HTML entity is complete. */
  private final StringBuilder sb;

  /**
   * Indicates the state we are in. see {@link EntityResolver.Status}.
   */
  private Status status;

  /** The decoded entity; empty until processing reaches {@code COMPLETED}. */
  private String entity;

  /**
   * Constructs an entity resolver that is initially empty and
   * with status {@code NOT_STARTED}, see {@link EntityResolver.Status}.
   */
  public EntityResolver() {
    sb = new StringBuilder();
    status = Status.NOT_STARTED;
    entity = "";
  }

  /**
   * Constructs an entity resolver that is an exact copy of
   * the one provided. In particular it has the same contents
   * and status.
   *
   * @param aEntityResolver the entity resolver to copy
   */
  public EntityResolver(EntityResolver aEntityResolver) {
    // The buffer is copied, not shared, so the two resolvers evolve independently.
    sb = new StringBuilder(aEntityResolver.sb);
    entity = aEntityResolver.entity;
    status = aEntityResolver.status;
  }

  /**
   * Returns the object to its original state for re-use, deleting any
   * stored characters that may be present.
   */
  public void reset() {
    status = Status.NOT_STARTED;
    sb.setLength(0);
    entity = "";
  }

  /**
   * Returns the status and the buffered characters of this
   * {@code EntityResolver} in a human readable form. The format of the
   * returned String is not specified and is subject to change.
   *
   * @return status and buffer contents of this object
   */
  @Override
  public String toString() {
    return String.format("Status: %s; Contents (%d): %s", status.toString(),
        sb.length(), sb.toString());
  }

  /**
   * Returns the decoded HTML Entity. Should only be called
   * after {@code processChar} returned status {@code COMPLETED}.
   *
   * @return the decoded HTML Entity or an empty {@code String} if
   *         we were called with any status other than {@code COMPLETED}
   */
  public String getEntity() {
    return entity;
  }

  /**
   * Processes a character from the input stream and decodes any html entities
   * from that processed input stream.
   *
   * <p>Characters received before a '&amp;' are discarded. Once a '&amp;' is
   * seen, characters are buffered until a ';' or an HTML space terminates the
   * entity, or until the buffer is full, at which point the status becomes
   * {@code COMPLETED} and the result is available through {@code getEntity}.
   * Characters received after completion are ignored until {@code reset}.
   *
   * @param input the {@code char} to process
   * @return the {@link Status} of this resolver after processing {@code input}
   */
  public Status processChar(char input) {
    // Developer error if the precondition fails.
    Preconditions.checkState(status != Status.NOT_STARTED || sb.length() == 0);
    if (status == Status.NOT_STARTED) {
      if (input == '&') {
        sb.append(input);
        status = Status.IN_PROGRESS;
      }
    } else if (status == Status.IN_PROGRESS) {
      if ((input == ';') || (HtmlUtils.isHtmlSpace(input))) {
        status = Status.COMPLETED;
        entity = convertEntity(input);
      } else {
        if (sb.length() < MAX_ENTITY_SIZE) {
          sb.append(input);
        } else {
          // Buffer is full: the entity cannot be valid, hand back what we saw.
          status = Status.COMPLETED;
          entity = unconvertedInput(input);
        }
      }
    } else {
      // Status.COMPLETED, ignore character, do nothing.
    }
    return status;
  }

  /**
   * Performs the decoding of a complete HTML entity held in the buffer.
   *
   * <p>Handles Numeric Character References (decimal {@code &#dd} and
   * hexadecimal {@code &#xhh} / {@code &#Xhh}) as well as the named entities
   * in {@code HTML_ENTITIES_MAP}.
   *
   * @param terminator the last character read, unused on successful
   *        conversions since it is the end delimiter of the entity
   * @return The decoded entity or the original input if we could not decode it.
   */
  private String convertEntity(char terminator) {
    // Developer error if the buffer was empty or does not start with '&'.
    Preconditions.checkArgument(sb.length() > 0);
    Preconditions.checkArgument(sb.charAt(0) == '&');

    if (sb.length() > 1) {
      if (sb.charAt(1) == '#') {
        if (sb.length() <= 2) {  // Error => return content as-is.
          return unconvertedInput(terminator);
        }
        try {
          if ((sb.charAt(2) == 'x') || (sb.charAt(2) == 'X')) {  // Hex NCR
            return new String(Character.toChars(
                Integer.parseInt(sb.substring(3), 16)));
          } else {  // Decimal NCR
            return new String(Character.toChars(
                Integer.parseInt(sb.substring(2))));
          }
        } catch (IllegalArgumentException e) {
          // Covers both NumberFormatException (not a number) and the
          // IllegalArgumentException thrown by Character.toChars when the
          // number is not a valid Unicode code point (negative or too large).
          return unconvertedInput(terminator);
        }
      }

      // See if it matches any of the few recognized entities.
      String decoded = HTML_ENTITIES_MAP.get(sb.toString());
      if (decoded != null) {
        return decoded;
      }
    }
    // Covers the case of a lonely '&' given or valid/invalid unknown entities.
    return unconvertedInput(terminator);
  }

  /**
   * Returns the buffered characters followed by {@code terminator}, i.e. the
   * input exactly as received, for use when no decoding could be performed.
   *
   * @param terminator the last character read
   * @return the buffer contents with {@code terminator} appended
   */
  private String unconvertedInput(char terminator) {
    return sb.toString() + terminator;
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy