// org.htmlunit.cyberneko.HTMLNamedEntitiesParser (Maven / Gradle / Ivy)
/*
* Copyright (c) 2002-2009 Andy Clark, Marc Guillemot
* Copyright (c) 2017-2024 Ronald Brill
* Copyright 2023 René Schwietzke
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.htmlunit.cyberneko;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Properties;
/**
* This is a very specialized class for recognizing HTML named entities with the ability
* to look them up in stages. It is stateless and hence memory friendly.
* Additionally, it is not generated code; rather, it sets itself up from a file on
* first use and stays fixed from then on. Technically, it is not a parser anymore,
* because it does not have a state that matches the HTML standard:
*
* 12.2.5.72 Character reference state
*
* Because it is stateless, it delegates the state handling to the user in the
* sense of how many characters one saw and when to stop doing things.
*
* @author René Schwietzke
* @author Ronald Brill
*/
public final class HTMLNamedEntitiesParser {
    // These are some benchmark results of a comparison old vs. new parser. "onlyCommon" is a test with just 7 out of
    // 2231 entities (most common such as lt gt and more). Random means, we are not feeding the parser
    // the test data in the same order all the time, but vary it.
    //
    // As you can see, the new parser is up to 20x faster for common entities and 8x faster when checking all.
    //
    // T14s Gen 1 AMD, 32 GB memory, newParser4 is this implementation here
    //
    // Benchmark                              (onlyCommon) (random) Mode Cnt       Score       Error Units
    // HtmlEntitiesParserBenchmark.newParser4         true     true avgt   3     135.647 ±    13.500 ns/op
    // HtmlEntitiesParserBenchmark.newParser4         true    false avgt   3     132.972 ±     4.807 ns/op
    // HtmlEntitiesParserBenchmark.newParser4        false     true avgt   3  240162.769 ±  3538.438 ns/op
    // HtmlEntitiesParserBenchmark.newParser4        false    false avgt   3  206904.535 ± 53584.038 ns/op
    // HtmlEntitiesParserBenchmark.oldParser          true     true avgt   3    3320.223 ±   178.501 ns/op
    // HtmlEntitiesParserBenchmark.oldParser          true    false avgt   3    3097.086 ±    48.238 ns/op
    // HtmlEntitiesParserBenchmark.oldParser         false     true avgt   3 1584678.257 ± 65965.438 ns/op
    // HtmlEntitiesParserBenchmark.oldParser         false    false avgt   3 1604853.180 ± 73638.435 ns/op

    /*
     * Our single instance of the parser. It has no mutable state after construction,
     * hence sharing it across threads is safe.
     */
    private static final HTMLNamedEntitiesParser INSTANCE = new HTMLNamedEntitiesParser();

    /*
     * Our starting point of the pseudo tree of entities. The root level is a little special: because of
     * its size, it employs a different lookup on the characters (calculation rather than comparison),
     * see RootState#optimize().
     */
    private final RootState rootLevel_ = new RootState();

    /**
     * Constructor. It builds the parser state from an entity defining properties file. This file has been taken
     * from https://html.spec.whatwg.org/multipage/named-characters.html (JSON version) and converted
     * appropriately.
     *
     * @throws RuntimeException if the entity data cannot be read, keeping the underlying cause
     */
    private HTMLNamedEntitiesParser() {
        // read the entities defined in the data taken from the spec
        try (InputStream stream = HTMLNamedEntitiesParser.class.getResourceAsStream("html_entities.properties")) {
            // fail early with a clear message instead of an obscure NPE from Properties.load(null)
            if (stream == null) {
                throw new IllegalStateException(
                        "Resource 'html_entities.properties' not found next to " + HTMLNamedEntitiesParser.class.getName());
            }
            final Properties props = new Properties();
            props.load(stream);

            props.forEach((k, v) -> {
                final String key = (String) k;
                final String value = (String) v;

                // we might have an empty line in it
                if (key.trim().isEmpty()) {
                    return;
                }
                rootLevel_.add(key, value);
            });

            // make the root more efficient, the rest stays simple
            rootLevel_.optimize();
        }
        catch (final IOException e) {
            // we are doomed and hence can break the entire setup due to some incorrect classpath
            // or build; keep the cause attached so the broken setup can actually be diagnosed
            throw new RuntimeException("Unable to initialize the HTML entities from file", e);
        }
    }

    /**
     * Returns the singleton. The singleton is stateless and can safely be used in a multi-threaded
     * context.
     *
     * @return the singleton instance of the parser, can never be null
     */
    public static HTMLNamedEntitiesParser get() {
        return INSTANCE;
    }

    /**
     * Utility method, mostly for testing, that allows us to look up an entity from a string
     * instead of from single characters.
     *
     * @param entityName the entity to look up
     * @return a state that resembles the result, will never be null
     */
    public State lookup(final String entityName) {
        State lastResult = rootLevel_;
        State lastMatchingResult = null;

        for (int i = 0; i < entityName.length(); i++) {
            final State result = lastResult.lookup(entityName.charAt(i));

            if (result.endNode_) {
                // we found the last matching possible entity in the pseudo tree
                // we can finish here, there is nothing beyond that point
                return result;
            }
            if (result == lastResult) {
                // nothing changed, more characters have not done anything
                // in case we have seen something that was a match before, return
                // to that state
                return lastMatchingResult == null ? lastResult : lastMatchingResult;
            }
            if (result.isMatch_) {
                // in case this is a match but not an endnode, we keep that state
                // for later, in case any further chars take us into the wrong direction
                // standard dictates to stop when we don't have a match and return
                // to the last known match, if any
                lastMatchingResult = result;
            }
            lastResult = result;
        }
        return lastMatchingResult == null ? lastResult : lastMatchingResult;
    }

    /**
     * Pseudo parses an entity character by character. We assume that we get
     * presented with the chars after the starting ampersand. This parser does
     * not support unicode entities, hence this has to be handled differently.
     *
     * @param character the next character, should not be the ampersand ever
     * @param state the last known state or null in case we start to parse
     *
     * @return the current state, which might be a valid final result, see {@link State}
     */
    public State lookup(final int character, final State state) {
        return state != null ? state.lookup(character) : rootLevel_.lookup(character);
    }

    /**
     * Our "level" in the treeish structure that keeps its static state and the next level
     * underneath.
     */
    public static class State {
        // what is the current depth aka amount of characters seen
        private final int depth_;

        // The characters at this level, kept sorted so binary search works on insert.
        // The state at the same position holds the matching result.
        int[] characters_ = new int[0];

        // The matching states at this level.
        // we intentionally have not built a unified data structure
        // between characters and state, keep it simple!
        State[] nextState_ = new State[0];

        // our current fragment or full entity, so for the entity "copy;"
        // you will have c, co, cop, copy, and copy; on each state level
        public final String entityOrFragment_;

        // what shall we resolve to? if we don't resolve, this is null!!!
        public String resolvedValue_;

        // the length of the entity fragment
        public final int length_;

        // tells us if this is ending with a semicolon
        public final boolean endsWithSemicolon_;

        // does this entity fragment match a resolved value?
        public boolean isMatch_;

        // is this the end of the look up level structure? if so, this is the end
        // and hence it shall be a match
        public boolean endNode_;

        /**
         * Create the empty state
         */
        protected State() {
            entityOrFragment_ = "";
            length_ = 0;
            depth_ = 0;
            endsWithSemicolon_ = false;
            isMatch_ = false;
            resolvedValue_ = null;
            endNode_ = false;
        }

        /**
         * Create us a new state that describes itself nicely
         *
         * @param depth the amount of characters of the entity this state covers
         * @param entityFragment the full entity this state belongs to
         * @param resolvedValue the value the full entity resolves to
         */
        protected State(final int depth, final String entityFragment, final String resolvedValue) {
            if (depth == entityFragment.length()) {
                // we are at the end
                entityOrFragment_ = entityFragment;
                length_ = entityFragment.length();
                depth_ = entityFragment.length();
                endsWithSemicolon_ = entityFragment.endsWith(";");
                isMatch_ = true;
                resolvedValue_ = resolvedValue;
                endNode_ = entityFragment.endsWith(";");
            }
            else {
                // intermediate state
                final String currentFragment = entityFragment.substring(0, depth);

                entityOrFragment_ = currentFragment;
                length_ = currentFragment.length();
                depth_ = depth;
                endsWithSemicolon_ = false;
                isMatch_ = false;
                resolvedValue_ = null;
                endNode_ = false;
            }
        }

        /**
         * We have a special in between state because some entities exist as correct
         * entity with a semicolon at the end and as legacy version without. We want
         * to look up both correctly, hence when we build the data set, we have to
         * unmark an existing one as final one and insert one more.
         *
         * @param entity the entity to look up
         * @param resolvedValue the value it will resolve to
         */
        protected void updateNonSemicolonEntity(final String entity, final String resolvedValue) {
            if (entity.endsWith(";")) {
                // nothing to do, perfect entity
                return;
            }
            // our entity is legacy (no ;) and so we have to see if we know the ; version already
            if (entity.length() == depth_) {
                // safety check, just for the initial programming and later updates then,
                // not for daily life
                if (!entity.equals(entityOrFragment_)) {
                    throw new RuntimeException("Illegal state reached");
                }
                // declare this an intermediate match
                endNode_ = false;
                isMatch_ = true;
                resolvedValue_ = resolvedValue;
            }
        }

        /**
         * Add a new entity to the pseudo-tree
         *
         * @param entity the entity to look for later
         * @param resolvedValue the value it resolves to
         */
        protected void add(final String entity, final String resolvedValue) {
            // ok, any characters left?
            if (depth_ >= entity.length()) {
                // no reason to go any further
                return;
            }

            // get me my character
            final char c = entity.charAt(depth_);

            // do I already know it?
            final int pos = Arrays.binarySearch(characters_, c);
            if (pos < 0) {
                // we don't know it, make the size bigger and get us the new pos
                nextState_ = Arrays.copyOf(nextState_, nextState_.length + 1);
                characters_ = Arrays.copyOf(characters_, characters_.length + 1);
                // binarySearch returns -(insertionPoint) - 1 when not found
                final int newPos = -(pos + 1);

                // move stuff first, unless we insert at the end, then no move is needed
                if (newPos != characters_.length - 1) {
                    System.arraycopy(characters_, newPos, characters_, newPos + 1, characters_.length - newPos - 1);
                    System.arraycopy(nextState_, newPos, nextState_, newPos + 1, nextState_.length - newPos - 1);
                }

                final State newLevel = new State(depth_ + 1, entity, resolvedValue);
                characters_[newPos] = c;
                nextState_[newPos] = newLevel;

                // update next level
                newLevel.add(entity, resolvedValue);
            }
            else {
                // ok, if this one is without a ; and we have the full entity, we
                // have a mismatch between one with and one without ;
                // change the level
                nextState_[pos].updateNonSemicolonEntity(entity, resolvedValue);
                nextState_[pos].add(entity, resolvedValue);
            }
        }

        /**
         * Lookup the state by iterating over the chars at this state, should not be that
         * many and due to the small size of the array, should be cache only
         *
         * @param character the char to look up
         * @return the next state or the same in case the character was not found
         */
        protected State lookup(final int character) {
            // because we have sorted arrays, we can be more efficient here
            final int length = characters_.length;
            for (int i = 0; i < length; i++) {
                final int c = characters_[i];

                // are we still under, simply continue
                if (c < character) {
                    continue;
                }
                if (c == character) {
                    // we are at position
                    return nextState_[i];
                }
                // ok, too far and have not found it, abort with current state
                return this;
            }
            // nothing found, maybe array was empty
            return this;
        }
    }

    /**
     * This is our initial state and has a special optimization applied. We
     * don't iterate, we jump by character code to the position.
     */
    protected static class RootState extends State {
        // the smallest character determines this
        private int offset_ = 0;

        @Override
        public State lookup(final int character) {
            // fastpath, just calculate the pos
            final int pos = character - offset_;

            // in case we don't have a matching char, return
            // this state; if we end up in a hole with null,
            // we do the same
            if (pos >= 0 && pos < nextState_.length) {
                final State s = nextState_[pos];
                return s != null ? s : this;
            }
            return this;
        }

        /*
         * Optimizes the layout after creation. This is only applied to the root state
         * because it is a wider range of characters. It does not make sense for the substates,
         * because we would get arrays with large holes and that makes the cache go bust.
         */
        protected void optimize() {
            // are we final already?
            if (offset_ > 0) {
                // that is just for later to tell us that we don't understand our
                // own code anymore and called that incorrectly
                throw new RuntimeException("Optimize was called twice");
            }

            // ok, smallest char is the start
            offset_ = characters_[0];

            // get us a new level array covering the smallest char in [0] and the largest in the last pos,
            // we might have holes, but not too many, hence this is faster than iterating or a binary search
            final State[] newNextLevel = new State[characters_[characters_.length - 1] - offset_ + 1];

            // arrange entries according to character code
            for (int i = 0; i < characters_.length; i++) {
                final int c = characters_[i];
                final State level = nextState_[i];

                newNextLevel[c - offset_] = level;
            }

            // take it live
            nextState_ = newNextLevel;

            // free memory, because we no longer need that; doesn't save a ton
            // but it might also help to discover programming mistakes
            characters_ = null;
        }
    }
}