// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
package org.hl7.fhir.utilities.xhtml;
/*
Copyright (c) 2011+, HL7, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of HL7 nor the names of its contributors may be used to
endorse or promote products derived from this software without specific
prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.hl7.fhir.exceptions.FHIRException;
import org.hl7.fhir.exceptions.FHIRFormatError;
import org.hl7.fhir.utilities.StringPair;
import org.hl7.fhir.utilities.TextFile;
import org.hl7.fhir.utilities.Utilities;
import org.hl7.fhir.utilities.i18n.I18nConstants;
import org.hl7.fhir.utilities.xhtml.XhtmlNode.Location;
import org.w3c.dom.Attr;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
public class XhtmlParser {
/** XHTML namespace URI; the {@code parseHtmlNode} methods put it on roots that carry no namespace declaration. */
public static final String XHTML_NS = "http://www.w3.org/1999/xhtml";
/** Sentinel ({@code (char) -1}, i.e. U+FFFF) returned by the character readers once the input is exhausted. */
private static final char END_OF_CHARS = (char) -1;
/** Debug switch; not referenced in the visible part of this file. */
private static final boolean DEBUG = false;
/**
 * The namespace declarations in scope for one element while element names are being
 * normalized: the default namespace, the prefix that was folded into the default
 * namespace (if any), and every other prefix-to-namespace binding.
 */
public class NamespaceNormalizationMap {
  private String defaultNamespace;
  private String originalNamespacePrefix;
  private Map<String, String> map = new HashMap<>();

  /**
   * Creates a map that starts out as a copy of the enclosing element's map.
   *
   * @param namespaceMap the parent's map, or null to start empty
   */
  public NamespaceNormalizationMap(NamespaceNormalizationMap namespaceMap) {
    if (namespaceMap != null) {
      map.putAll(namespaceMap.map);
      this.defaultNamespace = namespaceMap.defaultNamespace;
      this.originalNamespacePrefix = namespaceMap.originalNamespacePrefix;
    }
  }

  public void setDefaultNamespace(String defaultNamespace) {
    this.defaultNamespace = defaultNamespace;
  }

  /**
   * Keeps track of the original namespace prefix this element had before it was normalized.
   *
   * This way, child elements using that prefix will be able to recognize that they
   * should use the default namespace. For example, given
   *
   * <pre>{@code
   * <namespaceA:parentElement xmlns:namespaceA="urn:example:namespaceA">
   *   <namespaceA:childElement/>
   * </namespaceA:parentElement>
   * }</pre>
   *
   * parentElement's namespaceA would become the default namespace.
   *
   * When normalizing childElement, originalNamespacePrefix would be namespaceA,
   * so we would know that childElement should use the default namespace:
   *
   * <pre>{@code
   * <parentElement xmlns="urn:example:namespaceA">
   *   <childElement/>
   * </parentElement>
   * }</pre>
   *
   * @param originalNamespacePrefix the prefix that now stands for the default namespace
   */
  public void setOriginalNamespacePrefix(String originalNamespacePrefix) {
    this.originalNamespacePrefix = originalNamespacePrefix;
  }

  public void putNamespacePrefix(String prefix, String namespace) {
    map.put(prefix, namespace);
  }

  public String getDefaultNamespace() {
    return defaultNamespace;
  }

  public boolean hasDefaultNamespace() {
    return defaultNamespace != null;
  }

  /**
   * Resolves a prefix to its namespace.
   *
   * @param prefix the prefix to look up
   * @return the default namespace when the prefix is the recorded original prefix, the bound
   *         namespace when the prefix is known, and {@code http://error/undefined-namespace} otherwise
   */
  public String getNamespaceForPrefix(String prefix) {
    if (originalNamespacePrefix != null && originalNamespacePrefix.equals(prefix)) {
      return defaultNamespace;
    }
    return map.containsKey(prefix) ? map.get(prefix) : "http://error/undefined-namespace";
  }

  /**
   * @return a live view of the prefix-to-namespace bindings (the default namespace is not included)
   */
  public Set<Map.Entry<String, String>> getPrefixNamespaceEntrySet() {
    return map.entrySet();
  }
}
/**
 * A tag name as written in the source, split at its first colon into an optional
 * prefix (exposed as "namespace") and the remaining name.
 */
public class ElementName {
  private String namespace;
  private String name;

  /**
   * @param src the raw tag name, e.g. {@code div} or {@code h:div}
   */
  public ElementName(String src) {
    boolean prefixed = src.contains(":");
    int colon = prefixed ? src.indexOf(":") : -1;
    this.namespace = prefixed ? src.substring(0, colon) : null;
    this.name = prefixed ? src.substring(colon + 1) : src;
  }

  /** @return the part after the first colon, or the whole name when there is no colon */
  public String getName() {
    return name;
  }

  /** @return true when the source name contained a colon */
  public boolean hasNamespace() {
    return namespace != null;
  }

  /** @return the part before the first colon, or null when there is no colon */
  public String getNamespace() {
    return namespace;
  }

  @Override
  public String toString() {
    StringBuilder text = new StringBuilder();
    text.append(namespace).append("::").append(name);
    return text.toString();
  }
}
/** Element names that are accepted regardless of the security policy. */
private Set<String> elements = new HashSet<>();
/** Attribute names that are always accepted, either bare ({@code title}) or element-qualified ({@code a.href}). */
private Set<String> attributes = new HashSet<>();
/** Replacement text for known entities, keyed by the full reference ({@code &name;}); presumably filled by defineEntities() - confirm. */
private Map<String, String> definedEntities = new HashMap<>();
/** Entities declared in a DOCTYPE internal subset of the input, keyed by entity name. */
private Map<String, String> declaredEntities = new HashMap<>();
public XhtmlParser() {
super();
policy = ParserSecurityPolicy.Accept; // for general parsing
// set up sets
elements.add("p");
elements.add("br");
elements.add("div");
elements.add("h1");
elements.add("h2");
elements.add("h3");
elements.add("h4");
elements.add("h5");
elements.add("h6");
elements.add("a");
elements.add("span");
elements.add("b");
elements.add("em");
elements.add("i");
elements.add("strong");
elements.add("small");
elements.add("big");
elements.add("tt");
elements.add("small");
elements.add("dfn");
elements.add("q");
elements.add("var");
elements.add("abbr");
elements.add("acronym");
elements.add("cite");
elements.add("blockquote");
elements.add("hr");
elements.add("address");
elements.add("bdo");
elements.add("kbd");
elements.add("q");
elements.add("sub");
elements.add("sup");
elements.add("ul");
elements.add("ol");
elements.add("li");
elements.add("dl");
elements.add("dt");
elements.add("dd");
elements.add("pre");
elements.add("table");
elements.add("caption");
elements.add("colgroup");
elements.add("col");
elements.add("thead");
elements.add("tr");
elements.add("tfoot");
elements.add("tbody");
elements.add("th");
elements.add("td");
elements.add("code");
elements.add("samp");
elements.add("img");
elements.add("map");
elements.add("area");
attributes.add("title");
attributes.add("style");
attributes.add("class");
attributes.add("id");
attributes.add("lang");
attributes.add("xml:lang");
attributes.add("dir");
attributes.add("accesskey");
attributes.add("tabindex");
// tables:
attributes.add("span");
attributes.add("width");
attributes.add("align");
attributes.add("valign");
attributes.add("char");
attributes.add("charoff");
attributes.add("abbr");
attributes.add("axis");
attributes.add("headers");
attributes.add("scope");
attributes.add("rowspan");
attributes.add("colspan");
attributes.add("a.href");
attributes.add("a.name");
attributes.add("img.src");
attributes.add("img.border");
attributes.add("div.xmlns");
attributes.add("blockquote.cite");
attributes.add("q.cite");
attributes.add("a.charset");
attributes.add("a.type");
attributes.add("a.name");
attributes.add("a.href");
attributes.add("a.hreflang");
attributes.add("a.rel");
attributes.add("a.rev");
attributes.add("a.shape");
attributes.add("a.coords");
attributes.add("img.src");
attributes.add("img.alt");
attributes.add("img.longdesc");
attributes.add("img.height");
attributes.add("img.width");
attributes.add("img.usemap");
attributes.add("img.ismap");
attributes.add("map.name");
attributes.add("area.shape");
attributes.add("area.coords");
attributes.add("area.href");
attributes.add("area.nohref");
attributes.add("area.alt");
attributes.add("table.summary");
attributes.add("table.width");
attributes.add("table.border");
attributes.add("table.frame");
attributes.add("table.rules");
attributes.add("table.cellspacing");
attributes.add("table.cellpadding");
defineEntities();
}
/**
 * What to do with an element or attribute whose name is not on the parser's allow-lists.
 */
public enum ParserSecurityPolicy {
  /** Keep the element or attribute anyway. */
  Accept,

  /** Leave the element or attribute out of the result without reporting it. */
  Drop,

  /** Leave the element or attribute out and record a validation issue for it. */
  Reject
}
/** How names outside the allow-lists are treated; the constructor sets this to {@link ParserSecurityPolicy#Accept}. */
private ParserSecurityPolicy policy;
/** When set, text nodes are trimmed before being added, and text that trims to nothing is not added. */
private boolean trimWhitespace;
/** When set (the default), mismatched end tags and a premature end of input raise {@link FHIRFormatError}. */
private boolean mustBeWellFormed = true;
/** Problems recorded while parsing, each a (message identifier, parameter) pair. */
private List<StringPair> validationIssues = new ArrayList<>();

/** @return whether text nodes are trimmed before being added */
public boolean isTrimWhitespace() {
  return trimWhitespace;
}

/** @param trimWhitespace true to trim text nodes before adding them */
public void setTrimWhitespace(boolean trimWhitespace) {
  this.trimWhitespace = trimWhitespace;
}

/** @return whether malformed input is reported with an exception */
public boolean isMustBeWellFormed() {
  return mustBeWellFormed;
}

/**
 * @param mustBeWellFormed false to tolerate mismatched end tags and truncated input
 * @return this parser, for chaining
 */
public XhtmlParser setMustBeWellFormed(boolean mustBeWellFormed) {
  this.mustBeWellFormed = mustBeWellFormed;
  return this;
}

/** @return whether named entities other than the five XML built-ins are reported as issues */
public boolean isXmlMode() {
  return xmlMode;
}

/**
 * @param xmlMode true to record a validation issue for named entities other than
 *                quot, amp, apos, lt and gt
 * @return this parser, for chaining
 */
public XhtmlParser setXmlMode(boolean xmlMode) {
  this.xmlMode = xmlMode;
  return this;
}

/** @return the policy applied to names outside the allow-lists */
public ParserSecurityPolicy getPolicy() {
  return policy;
}

/** @param policy the policy to apply to names outside the allow-lists */
public void setPolicy(ParserSecurityPolicy policy) {
  this.policy = policy;
}

/** @return the live list of issues recorded so far; it is not cleared between parses in the visible code */
public List<StringPair> getValidationIssues() {
  return validationIssues;
}
/**
 * Converts a DOM element to an XHTML node with no inherited default namespace.
 *
 * @param node the DOM element to convert
 * @return the converted node, carrying an {@code xmlns} attribute
 * @throws FHIRFormatError if the element contains an unsupported kind of DOM node
 */
public XhtmlNode parseHtmlNode(Element node) throws FHIRFormatError {
  return parseHtmlNode(node, null);
}

/**
 * Converts a DOM element to an XHTML node.
 *
 * @param node the DOM element to convert
 * @param defaultNS the namespace already in effect around the element, or null
 * @return the converted node; when it ends up without a namespace declaration,
 *         {@link #XHTML_NS} is set as its {@code xmlns}
 * @throws FHIRFormatError if the element contains an unsupported kind of DOM node
 */
public XhtmlNode parseHtmlNode(Element node, String defaultNS) throws FHIRFormatError {
  XhtmlNode root = parseNode(node, defaultNS);
  boolean declaresNamespace = root.getNsDecl() != null;
  if (!declaresNamespace) {
    root.getAttributes().put("xmlns", XHTML_NS);
  }
  return root;
}
/**
 * Recursively copies a DOM element into an XHTML node: attributes that pass
 * attributeIsOk (other than those whose local name starts with "xmlns"), text,
 * comments, and child elements that pass elementIsOk.
 *
 * @param node the DOM element to copy
 * @param defaultNS the namespace in effect for the parent, or null
 * @return the new XHTML node
 * @throws FHIRFormatError if a child is neither text, comment nor element
 */
private XhtmlNode parseNode(Element node, String defaultNS) throws FHIRFormatError {
  XhtmlNode res = new XhtmlNode(NodeType.Element);
  res.setName(node.getLocalName());
  defaultNS = checkNS(res, node, defaultNS);

  for (int i = 0; i < node.getAttributes().getLength(); i++) {
    Attr attr = (Attr) node.getAttributes().item(i);
    boolean allowed = attributeIsOk(res.getName(), attr.getName(), attr.getValue());
    if (allowed && !attr.getLocalName().startsWith("xmlns")) {
      res.getAttributes().put(attr.getName(), attr.getValue());
    }
  }

  for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
    switch (child.getNodeType()) {
      case Node.TEXT_NODE:
        res.addText(child.getTextContent());
        break;
      case Node.COMMENT_NODE:
        res.addComment(child.getTextContent());
        break;
      case Node.ELEMENT_NODE:
        // a refused element is left out together with everything below it
        if (elementIsOk(child.getLocalName())) {
          res.getChildNodes().add(parseNode((Element) child, defaultNS));
        }
        break;
      default:
        throw new FHIRFormatError("Unhandled XHTML feature: "+Integer.toString(child.getNodeType())+descLoc());
    }
  }
  return res;
}
/**
 * Works out the namespace in effect for a DOM element and declares it on the XHTML
 * node when it differs from the inherited one.
 *
 * @param res the XHTML node that receives an {@code xmlns} attribute when needed
 * @param node the DOM element being converted
 * @param defaultNS the namespace inherited from the parent, or null
 * @return null when the element has no namespace, otherwise the element's namespace
 */
private String checkNS(XhtmlNode res, Element node, String defaultNS) {
  String ns = node.getNamespaceURI();
  if (ns == null) {
    return null;
  }
  if (ns.equals(defaultNS)) {
    return defaultNS;
  }
  res.getAttributes().put("xmlns", ns);
  return ns;
}
/**
 * Reads one element from a pull parser into an XHTML node.
 *
 * @param xpp the pull parser; presumably positioned on the element's start tag - confirm against callers
 * @return the parsed node; when it has no namespace declaration, {@link #XHTML_NS} is set as its {@code xmlns}
 * @throws XmlPullParserException if the pull parser reports malformed XML
 * @throws IOException if the underlying input fails
 * @throws FHIRFormatError if an unsupported parser event is encountered
 */
public XhtmlNode parseHtmlNode(XmlPullParser xpp) throws XmlPullParserException, IOException, FHIRFormatError {
  XhtmlNode root = parseNode(xpp);
  boolean declaresNamespace = root.getNsDecl() != null;
  if (!declaresNamespace) {
    root.getAttributes().put("xmlns", XHTML_NS);
  }
  return root;
}
/**
 * Recursively reads the current element from a pull parser into an XHTML node and
 * leaves the parser on the event that follows the element's end tag.
 *
 * Child elements refused by elementIsOk are skipped together with their content, as the
 * DOM-based variant does. Previously a refused child was never consumed, so the loop
 * saw the same START_TAG forever.
 *
 * @param xpp the pull parser; presumably positioned on the element's start tag - confirm against callers
 * @return the new XHTML node
 * @throws XmlPullParserException if the pull parser reports malformed XML
 * @throws IOException if the underlying input fails
 * @throws FHIRFormatError if an event other than text, comment, start tag or end tag is met
 */
private XhtmlNode parseNode(XmlPullParser xpp) throws XmlPullParserException, IOException, FHIRFormatError {
  XhtmlNode res = new XhtmlNode(NodeType.Element);
  res.setName(xpp.getName());
  for (int i = 0; i < xpp.getAttributeCount(); i++) {
    // only the xml: prefix is kept on attribute names (e.g. xml:lang)
    String an = "xml".equals(xpp.getAttributePrefix(i)) ? "xml:"+xpp.getAttributeName(i) : xpp.getAttributeName(i);
    String av = xpp.getAttributeValue(i);
    if (attributeIsOk(xpp.getName(), an, av)) {
      res.getAttributes().put(an, av);
    }
  }
  int eventType = xpp.next();
  while (eventType != XmlPullParser.END_TAG) {
    if (eventType == XmlPullParser.TEXT) {
      res.addText(xpp.getText());
      xpp.next();
    } else if (eventType == XmlPullParser.COMMENT) {
      res.addComment(xpp.getText());
      xpp.next();
    } else if (eventType == XmlPullParser.START_TAG) {
      if (elementIsOk(xpp.getName())) {
        res.getChildNodes().add(parseNode(xpp));
      } else {
        skipRefusedElement(xpp);
      }
    } else {
      throw new FHIRFormatError("Unhandled XHTML feature: "+Integer.toString(eventType)+descLoc());
    }
    eventType = xpp.getEventType();
  }
  xpp.next();
  return res;
}

/**
 * Consumes the element the parser is positioned on, including all nested content, and
 * leaves the parser on the event after its end tag (the same position parseNode leaves it in).
 */
private void skipRefusedElement(XmlPullParser xpp) throws XmlPullParserException, IOException, FHIRFormatError {
  int depth = 1;
  while (depth > 0) {
    int eventType = xpp.next();
    if (eventType == XmlPullParser.START_TAG) {
      depth++;
    } else if (eventType == XmlPullParser.END_TAG) {
      depth--;
    } else if (eventType == XmlPullParser.END_DOCUMENT) {
      // never spin on a document that ends inside the skipped element
      throw new FHIRFormatError("Unhandled XHTML feature: "+Integer.toString(eventType)+descLoc());
    }
  }
  xpp.next();
}
/**
 * Decides whether an attribute may be kept.
 *
 * @param elem the element name
 * @param attr the attribute name
 * @param value the attribute value
 * @return true when the attribute is on the allow-list (bare or as {@code elem.attr}) or the
 *         policy is Accept; false under Drop, and false after recording an issue under Reject
 * @throws FHIRFormatError declared for callers; not thrown by the visible code
 */
private boolean attributeIsOk(String elem, String attr, String value) throws FHIRFormatError {
  String qualified = elem+"."+attr;
  if (attributes.contains(attr) || attributes.contains(qualified)) {
    return true;
  }
  switch (policy) {
    case Accept: return true;
    case Drop: return false;
    case Reject: return error(I18nConstants.XHTML_XHTML_ATTRIBUTE_ILLEGAL, qualified);
  }
  // NOTE(review): every policy value returns from the switch above, so the image-reference
  // check below is never reached. Confirm whether it was meant to run for allowed attributes.
  boolean localOrHttp = value.startsWith("#") || value.startsWith("http:") || value.startsWith("https:");
  if (qualified.equals("img.src") && !localOrHttp) {
    switch (policy) {
      case Accept: return true;
      case Drop: return false;
      case Reject: return error(I18nConstants.XHTML_XHTML_Image_Reference_Illegal, value);
    }
  }
  return false;
}
/**
 * Records a validation issue.
 *
 * @param msg the message identifier
 * @param param the value the message refers to
 * @return always false, so callers can write {@code return error(...)} to refuse an item
 */
private boolean error(String msg, String param) {
validationIssues.add(new StringPair(msg, param));
return false;
}
/**
 * Decides whether an element may be kept.
 *
 * @param name the element name
 * @return true when the name is on the allow-list or the policy is Accept; false under
 *         Drop, and false after recording an issue under Reject
 * @throws FHIRFormatError declared for callers; not thrown by the visible code
 */
private boolean elementIsOk(String name) throws FHIRFormatError {
  boolean accepted = elements.contains(name);
  if (!accepted) {
    switch (policy) {
      case Accept:
        accepted = true;
        break;
      case Drop:
        break;
      case Reject:
        accepted = error(I18nConstants.XHTML_XHTML_ELEMENT_ILLEGAL, name);
        break;
    }
  }
  return accepted;
}
/**
 * @return the current read position formatted as a suffix for error messages
 */
private String descLoc() {
  StringBuilder where = new StringBuilder(" at line ");
  where.append(line).append(" column ").append(col);
  return where.toString();
}
/** Source of characters for the text parser; set by the public parse methods. */
private Reader rdr;
/** Characters that were peeked or pushed back and will be returned before the reader is consulted again. */
private String cache = "";
/** Ancestor that element parsing is unwinding to after a mismatched end tag in lenient mode; null when not unwinding. */
private XhtmlNode unwindPoint;
/** Most recently added text, quoted in the "unexpected content" error message. */
private String lastText = "";
/** Current line number, starting at 1. */
private int line = 1;
/** Current column on the current line. */
private int col = 0;
/** Character remembered by readChar to recognize the LF of a CR LF pair. */
private char lastChar;
/** Position recorded by the previous markLocation call. */
private Location lastLoc;
/** When set, named entities other than quot, amp, apos, lt and gt are recorded as validation issues. */
private boolean xmlMode;
/**
 * Parses XHTML held in a string.
 *
 * @param source the markup to parse
 * @param entryName the name the root element must have (compared after lower-casing the
 *                  root name), or null to accept any root
 * @return the parsed document
 * @throws FHIRFormatError if the markup cannot be parsed or the root name does not match
 * @throws IOException if reading fails
 */
public XhtmlDocument parse(String source, String entryName) throws FHIRFormatError, IOException {
rdr = new StringReader(source);
return parse(entryName);
}
/**
 * Parses XHTML read from a stream, decoding it as UTF-8. The stream is not closed here.
 *
 * @param input the stream to read
 * @param entryName the name the root element must have (compared after lower-casing the
 *                  root name), or null to accept any root
 * @return the parsed document
 * @throws FHIRFormatError if the markup cannot be parsed or the root name does not match
 * @throws IOException if reading fails
 */
public XhtmlDocument parse(InputStream input, String entryName) throws FHIRFormatError, IOException {
rdr = new InputStreamReader(input, StandardCharsets.UTF_8);
return parse(entryName);
}
/**
 * Parses a document from the current reader: leading white space and comments, then
 * the root element and everything inside it.
 *
 * The reader state (pushback cache, position, unwind target) is reset first, so a parser
 * that is reused does not carry a left-over character or stale line numbers from the
 * previous input into this one.
 *
 * @param entryName the name the root element must have, or null to accept any root
 * @return the parsed document
 * @throws FHIRFormatError if the input does not start with a tag, the root name does not
 *                         match, or the content is malformed
 * @throws IOException if reading fails
 */
private XhtmlDocument parse(String entryName) throws FHIRFormatError, IOException
{
  cache = "";
  lastText = "";
  line = 1;
  col = 0;
  lastChar = 0;
  lastLoc = null;
  unwindPoint = null;

  XhtmlDocument result = new XhtmlDocument();
  skipWhiteSpaceAndComments(result);
  if (peekChar() != '<')
    throw new FHIRFormatError("Unable to Parse HTML - does not start with tag. Found "+peekChar()+descLoc());
  readChar();
  markLocation();
  // Locale.ROOT keeps the lower-casing independent of the JVM's default locale
  ElementName n = new ElementName(readName().toLowerCase(Locale.ROOT));
  if ((entryName != null) && !n.getName().equals(entryName))
    throw new FHIRFormatError("Unable to Parse HTML - starts with '"+n+"' not '"+entryName+"'"+descLoc());
  XhtmlNode root = result.addTag(n.getName());
  root.setLocation(markLocation());
  parseAttributes(root);
  markLocation();
  NamespaceNormalizationMap nsm = normalizeNamespaces(n, root, null, true);
  if (readChar() == '/') {
    // self-closing root: <name ... />
    if (peekChar() != '>')
      throw new FHIRFormatError("unexpected non-end of element "+n+" "+descLoc());
    readChar();
    root.setEmptyExpanded(false);
  } else {
    List<XhtmlNode> p = new ArrayList<>();
    parseElementInner(root, p, nsm, true);
    root.setEmptyExpanded(true);
  }
  return result;
}
/**
 * Records the current line and column as the latest mark.
 *
 * @return the mark that was recorded by the previous call (null on the first call)
 */
private Location markLocation() {
  Location previousMark = lastLoc;
  lastLoc = new Location(line, col);
  return previousMark;
}
/**
 * Rewrites the namespace declarations of a freshly parsed element.
 *
 * All {@code xmlns} / {@code xmlns:prefix} attributes are moved from the node into a
 * namespace map derived from the parent's. If the element name carries a prefix whose
 * namespace is not already the default, that namespace becomes the node's default and the
 * prefix is remembered so descendants using it resolve to the default namespace. Finally the
 * declarations that the parent has not already made are written back onto the node.
 *
 * @param elementName the element's name as written, possibly prefixed
 * @param node the node whose attributes are rewritten
 * @param parentNamespaceMap the parent's map; may be null for the root
 * @param nodeIsRoot true for the document root, which always states its declarations
 * @return the namespace map in effect for the node's children
 */
private NamespaceNormalizationMap normalizeNamespaces(ElementName elementName, XhtmlNode node, NamespaceNormalizationMap parentNamespaceMap, boolean nodeIsRoot) {
  // what we do here is strip out any stated namespace attributes, putting them in the namespace map
  // then we figure out what the namespace of this element is, and state it explicitly if it's not the default
  NamespaceNormalizationMap nodeNamespaceMap = new NamespaceNormalizationMap(parentNamespaceMap);
  // collected first and removed afterwards, so the attribute map is not modified while iterating
  List<String> namespaceAttributes = new ArrayList<>();
  for (String an : node.getAttributes().keySet()) {
    if (an.equals("xmlns")) {
      nodeNamespaceMap.setDefaultNamespace(node.getAttribute(an));
      namespaceAttributes.add(an);
    }
    if (an.startsWith("xmlns:")) {
      nodeNamespaceMap.putNamespacePrefix(an.substring(6), node.getAttribute(an));
      namespaceAttributes.add(an);
    }
  }
  for (String s : namespaceAttributes)
    node.getAttributes().remove(s);

  if (elementName.hasNamespace()) {
    String elementNamespace = nodeNamespaceMap.getNamespaceForPrefix(elementName.getNamespace());
    if (!elementNamespace.equals(nodeNamespaceMap.getDefaultNamespace())) {
      node.getAttributes().put("xmlns", elementNamespace);
      nodeNamespaceMap.setDefaultNamespace(elementNamespace);
      nodeNamespaceMap.setOriginalNamespacePrefix(elementName.getNamespace());
      nodeNamespaceMap.map.remove(elementName.getNamespace());
    }
  }

  // Add namespaces back if not defined in parentNamespaceMap (we haven't seen it before, so we need to define it here)
  if (shouldAddXmlnsNamespaceAttribute(parentNamespaceMap, nodeIsRoot, nodeNamespaceMap)) {
    node.getAttributes().put("xmlns", nodeNamespaceMap.getDefaultNamespace());
  }
  for (Map.Entry<String, String> entry : nodeNamespaceMap.getPrefixNamespaceEntrySet()) {
    if (shouldAddXmlnsNamespacePrefixAttribute(parentNamespaceMap, nodeIsRoot, entry.getKey())) {
      node.getAttributes().put("xmlns:" + entry.getKey(), entry.getValue());
    }
  }
  return nodeNamespaceMap;
}
/**
 * @param parentNamespaceMap the parent's map; only consulted (and then required to be non-null) for non-root nodes
 * @param nodeIsRoot true for the document root
 * @param attributeKey the namespace prefix in question
 * @return true when the node has to state the {@code xmlns:prefix} declaration itself:
 *         always on the root, otherwise only when the parent does not know the prefix
 */
private static boolean shouldAddXmlnsNamespacePrefixAttribute(NamespaceNormalizationMap parentNamespaceMap, boolean nodeIsRoot, String attributeKey) {
  return nodeIsRoot || !parentNamespaceMap.map.containsKey(attributeKey);
}
/**
 * @param parentNamespaceMap the parent's map, or null
 * @param nodeIsRoot true for the document root
 * @param nodeNamespaceMap the node's own map
 * @return true when the node has a default namespace and has to state it itself: always on
 *         the root, otherwise when there is no parent map or the parent's default differs
 */
private static boolean shouldAddXmlnsNamespaceAttribute(NamespaceNormalizationMap parentNamespaceMap, boolean nodeIsRoot, NamespaceNormalizationMap nodeNamespaceMap) {
  if (!nodeNamespaceMap.hasDefaultNamespace()) {
    return false;
  }
  if (nodeIsRoot || parentNamespaceMap == null) {
    return true;
  }
  String inherited = parentNamespaceMap.getDefaultNamespace();
  return !nodeNamespaceMap.getDefaultNamespace().equals(inherited);
}
/**
 * Flushes buffered text into a text node. Nothing is added, and the buffer is left
 * untouched, when the text (after optional trimming) is empty.
 *
 * @param node the element that receives the text node
 * @param s the buffer of pending text; emptied when a text node is added
 */
private void addTextNode(XhtmlNode node, StringBuilder s)
{
  String pending = s.toString();
  if (isTrimWhitespace()) {
    pending = pending.trim();
  }
  if (pending.isEmpty()) {
    return;
  }
  lastText = pending;
  node.addText(pending).setLocation(markLocation());
  s.setLength(0);
}
/**
 * Parses the content of an element up to and including its end tag: text, entity
 * references, comments / declarations, processing instructions and child elements.
 *
 * In lenient mode (mustBeWellFormed == false) an end tag that does not match this element
 * is matched against the open ancestors. If one of them has that name, the nearest such
 * ancestor becomes the unwind point: the children of every element between here and that
 * ancestor are moved up to their parents, and all the nested calls return until the
 * ancestor itself has been closed. An end tag that matches nothing is ignored.
 * Previously the ancestor lookup compared a name with an ElementName object, which is
 * never equal, so no ancestor was ever found.
 *
 * @param node the element whose content is being read
 * @param parents the open ancestors of node, outermost first
 * @param nsm the namespace map in effect for node's children
 * @param escaping not used by this method
 * @throws FHIRFormatError on a mismatched end tag in strict mode, or on unexpected content after '<'
 * @throws IOException if reading fails
 */
private void parseElementInner(XhtmlNode node, List<XhtmlNode> parents, NamespaceNormalizationMap nsm, boolean escaping) throws FHIRFormatError, IOException
{
  StringBuilder s = new StringBuilder();
  while (peekChar() != END_OF_CHARS && !parents.contains(unwindPoint) && !(node == unwindPoint))
  {
    if (peekChar() == '<')
    {
      addTextNode(node, s);
      readChar();
      if (peekChar() == '!') {
        // DOCTYPE declarations are not refused here; per the original note that check moved to the validator
        String sc = readToCommentEnd();
        node.addComment(sc).setLocation(markLocation());
      } else if (peekChar() == '?')
        node.addComment(readToTagEnd()).setLocation(markLocation());
      else if (peekChar() == '/') {
        readChar();
        ElementName n = new ElementName(readToTagEnd());
        if (node.getName().equals(n.getName()))
          return;
        if (mustBeWellFormed) {
          throw new FHIRFormatError("Malformed XHTML: Found \""+n.getName()+">\" expecting \""+node.getName()+">\""+descLoc());
        }
        // an end tag closes the nearest open element of that name
        XhtmlNode target = null;
        for (int i = parents.size() - 1; i >= 0; i--)
        {
          if (parents.get(i).getName().equals(n.getName())) {
            target = parents.get(i);
            break;
          }
        }
        if (target != null)
        {
          unwindPoint = target;
          // hoist children one level at a time until the level just below the unwind point
          for (int i = parents.size(); i > 0; i--)
          {
            if (i < parents.size() && parents.get(i) == unwindPoint)
              return;
            if (i == parents.size())
            {
              parents.get(i - 1).getChildNodes().addAll(node.getChildNodes());
              node.getChildNodes().clear();
            }
            else
            {
              parents.get(i - 1).getChildNodes().addAll(parents.get(i).getChildNodes());
              parents.get(i).getChildNodes().clear();
            }
          }
        }
      }
      else if (Character.isLetterOrDigit(peekChar()))
      {
        parseElement(node, parents, nsm);
      }
      else
        throw new FHIRFormatError("Unable to Parse HTML - node '" + node.getName() + "' has unexpected content '"+peekChar()+"' (last text = '"+lastText+"'"+descLoc());
    }
    else if (peekChar() == '&') // escaping &&
    {
      parseLiteral(s);
    }
    else
      s.append(readChar());
  }
  addTextNode(node, s);
}
/**
 * Parses one child element (the opening '<' has already been consumed): name,
 * attributes, namespace normalization and, unless the tag is self-closing, its content.
 *
 * @param parent the element that receives the new child
 * @param parents the open ancestors of parent, outermost first; not modified
 * @param namespaceMap the namespace map in effect for parent's children
 * @throws IOException if reading fails
 * @throws FHIRFormatError if the tag or its content is malformed
 */
private void parseElement(XhtmlNode parent, List<XhtmlNode> parents, NamespaceNormalizationMap namespaceMap) throws IOException, FHIRFormatError
{
  markLocation();
  ElementName name = new ElementName(readName());
  XhtmlNode node = parent.addTag(name.getName());
  node.setLocation(markLocation());
  // the child's ancestor chain is the parent's chain plus the parent itself
  List<XhtmlNode> newParents = new ArrayList<>(parents);
  newParents.add(parent);
  parseAttributes(node);
  markLocation();
  namespaceMap = normalizeNamespaces(name, node, namespaceMap, false);
  if (readChar() == '/') {
    // self-closing tag: <name ... />
    if (peekChar() != '>')
      throw new FHIRFormatError("unexpected non-end of element "+name+" "+descLoc());
    readChar();
    node.setEmptyExpanded(false);
  } else {
    node.setEmptyExpanded(true);
    parseElementInner(node, newParents, namespaceMap, "script".equals(name.getName()));
  }
}
/**
 * Reads the attributes of a start tag into the node, stopping in front of the '>' or '/'
 * that ends the tag (or at the end of input). An attribute without a value is stored with
 * a null value; values may be quoted with either quote character or left unquoted.
 *
 * @param node the element that receives the attributes
 * @throws FHIRFormatError if an attribute name is empty or is followed by something other
 *                         than '=', another name, '>' or '/'
 * @throws IOException if reading fails
 */
private void parseAttributes(XhtmlNode node) throws FHIRFormatError, IOException
{
  while (Character.isWhitespace(peekChar())) {
    readChar();
  }
  for (char next = peekChar(); next != '>' && next != '/' && next != END_OF_CHARS; next = peekChar())
  {
    String attrName = readName();
    if (attrName.length() == 0)
    {
      throw new FHIRFormatError("Unable to read attribute on <"+node.getName()+">"+descLoc());
    }
    while (Character.isWhitespace(peekChar())) {
      readChar();
    }

    char afterName = peekChar();
    if (afterName == '=')
    {
      readChar();
      while (Character.isWhitespace(peekChar())) {
        readChar();
      }
      char opener = peekChar();
      boolean quoted = opener == '"' || opener == '\'';
      // a quoted value ends at the matching quote; an unquoted one gets the end-of-input sentinel
      String attrValue = parseAttributeValue(quoted ? readChar() : END_OF_CHARS);
      node.getAttributes().put(attrName, attrValue);
    }
    else if (isNameChar(afterName) || afterName == '>' || afterName == '/')
    {
      // valueless attribute
      node.getAttributes().put(attrName, null);
    }
    else
    {
      throw new FHIRFormatError("Unable to read attribute '"+attrName+"' value on <"+node.getName()+">"+descLoc());
    }

    while (Character.isWhitespace(peekChar())) {
      readChar();
    }
  }
}
/**
 * Reads an attribute value, resolving entity references, and consumes the terminator
 * when the value ends on it.
 *
 * NOTE(review): '>' ends the value even inside quotes, and an unquoted value is not
 * ended by white space - confirm whether both are intended.
 *
 * @param term the closing quote character, or END_OF_CHARS for an unquoted value
 * @return the value read
 * @throws IOException if reading fails
 * @throws FHIRFormatError if an entity reference is malformed
 */
private String parseAttributeValue(char term) throws IOException, FHIRFormatError
{
  StringBuilder value = new StringBuilder();
  boolean unquoted = term == END_OF_CHARS;
  while (true)
  {
    char next = peekChar();
    if (next == END_OF_CHARS || next == '>' || next == term) {
      break;
    }
    if (unquoted && next == '/') {
      break;
    }
    if (next == '&') {
      parseLiteral(value);
    } else {
      value.append(readChar());
    }
  }
  if (peekChar() == term) {
    readChar();
  }
  return value.toString();
}
private void skipWhiteSpaceAndComments(XhtmlNode focus) throws IOException, FHIRFormatError {
while (Character.isWhitespace(peekChar()) || (peekChar() == 0xfeff))
readChar();
if (peekChar() == '<')
{
char ch = readChar();
if (peekChar() == '!') {
readChar();
if (peekChar() == '-') {
readChar();
if (peekChar() == '-') {
readChar();
if (peekChar() == ' ')
readChar();
focus.addComment(readToCommentEnd());
} else
throw new FHIRFormatError("unrecognised element type 0)
return cache.charAt(0);
else if (!rdr.ready())
return END_OF_CHARS;
else
{
int i = rdr.read();
if (i == -1) {
cache = "";
return END_OF_CHARS;
}
char c = (char) i;
cache = Character.toString(c);
return c;
}
}
/**
 * Consumes and returns the next character, taking it from the pushback cache when one
 * is waiting and from the reader otherwise, and keeps the line / column counters current.
 *
 * CR, LF and CR LF each count as a single line break. The previously read character is
 * now remembered for every character, not only for line breaks, so an LF is skipped
 * only when the character immediately before it was a CR.
 *
 * @return the character read, or END_OF_CHARS when the reader has nothing more to offer
 * @throws IOException if the underlying reader fails
 */
private char readChar() throws IOException
{
  char c;
  if (cache.length() > 0)
  {
    c = cache.charAt(0);
    cache = cache.length() == 1 ? "" : cache.substring(1);
  }
  else if (!rdr.ready())
    c = END_OF_CHARS;
  else
    c = (char)rdr.read();
  if (c == '\r' || c == '\n') {
    // the LF of a CR LF pair does not start another line
    if (c == '\r' || lastChar != '\r') {
      line++;
      col = 0;
    }
  }
  lastChar = c;
  col++;
  return c;
}
/**
 * Reads up to the next '>', consumes it, and then calls skipWhiteSpace().
 *
 * @return the text in front of the '>'
 * @throws IOException if reading fails
 * @throws FHIRFormatError if the input ends first and the parser requires well-formed input
 */
private String readToTagEnd() throws IOException, FHIRFormatError
{
  StringBuilder text = new StringBuilder();
  for (char next = peekChar(); next != '>' && next != END_OF_CHARS; next = peekChar()) {
    text.append(readChar());
  }
  if (peekChar() == END_OF_CHARS) {
    if (mustBeWellFormed) {
      throw new FHIRFormatError("Unexpected termination of html source"+descLoc());
    }
  } else {
    readChar();
    skipWhiteSpace();
  }
  return text.toString();
}
/**
 * Reads up to the next '>' and consumes it.
 *
 * When the input ends first, strict mode raises an error and lenient mode returns what
 * was read so far (previously lenient mode looped forever at the end of input).
 *
 * @return the text in front of the '>'
 * @throws IOException if reading fails
 * @throws FHIRFormatError if the input ends first and the parser requires well-formed input
 */
private String readToDocTypeEnd() throws IOException, FHIRFormatError
{
  StringBuilder s = new StringBuilder();
  while (true) {
    char c = peekChar();
    if (c == '>') {
      readChar();
      break;
    }
    if (c == END_OF_CHARS) {
      if (mustBeWellFormed)
        throw new FHIRFormatError("Unexpected termination of html source"+descLoc());
      break;
    }
    s.append(readChar());
  }
  return s.toString();
}
/**
 * Reads the remainder of a {@code <!...>} construct, starting at the '!' (or just after it).
 *
 * A construct opening with "--" is a comment and ends at "-->". Anything else is "simple"
 * and ends at the first '>', except that a DOCTYPE with an internal subset ('[' after
 * "DOCTYPE ") ends at "]>" and has its entity declarations registered.
 * The closing '>' is consumed, followed by skipWhiteSpace().
 *
 * When the input ends first, strict mode raises an error and lenient mode returns what
 * was read so far (previously lenient mode looped forever at the end of input).
 *
 * @return the text of the construct without the leading "!" / "!--" and the closing "-->" / ">"
 * @throws IOException if reading fails
 * @throws FHIRFormatError if the input ends first and the parser requires well-formed input
 */
private String readToCommentEnd() throws IOException, FHIRFormatError
{
  if (peekChar() == '!')
    readChar();
  StringBuilder s = new StringBuilder();

  boolean simple = true;
  if (peekChar() == '-') {
    readChar();
    simple = peekChar() != '-';
    if (simple)
      s.append('-');
    else
      readChar();
  }

  boolean doctypeEntities = false;
  boolean done = false;
  while (!done) {
    char c = peekChar();
    if (c == '-') {
      readChar();
      if (peekChar() == '-') {
        readChar();
        if (peekChar() == '>') {
          done = true;
        } else {
          // "--" not followed by '>': keep one dash and re-examine the other
          pushChar('-');
          s.append("-");
        }
      } else
        s.append('-');
    } else if (doctypeEntities && c == ']') {
      s.append(readChar());
      if (peekChar() == '>') {
        done = true;
      }
    } else if (simple && peekChar() == '>' && !doctypeEntities) {
      done = true;
    } else if (c == '[' && s.toString().startsWith("DOCTYPE ")) {
      doctypeEntities = true;
      s.append(readChar());
    } else if (c != END_OF_CHARS) {
      s.append(readChar());
    } else if (mustBeWellFormed) {
      throw new FHIRFormatError("Unexpected termination of html source"+descLoc());
    } else {
      // lenient mode: the input ended inside the construct, so stop with what we have
      done = true;
    }
  }
  if (peekChar() != END_OF_CHARS)
  {
    readChar();
    skipWhiteSpace();
  }
  if (doctypeEntities) {
    parseDoctypeEntities(s.toString());
  }
  return s.toString();
}
private void parseDoctypeEntities(String s) {
while (s.contains("");
String ed = s.substring(0, e+1);
s = s.substring(e+1);
ed = ed.substring(8).trim();
e = ed.indexOf(" ");
String n = ed.substring(0, e).trim();
ed = ed.substring(e).trim();
e = ed.indexOf(" "); // SYSTEM
ed = ed.substring(e).trim();
String v = ed.substring(0, ed.length()-1);
declaredEntities.put(n, v);
}
}
/**
 * @param ch the character to test
 * @return true for letters, digits and the punctuation allowed in names: '_', '-', ':' and '.'
 */
private boolean isNameChar(char ch)
{
  if (Character.isLetterOrDigit(ch)) {
    return true;
  }
  switch (ch) {
    case '_':
    case '-':
    case ':':
    case '.':
      return true;
    default:
      return false;
  }
}
/**
 * Consumes the longest run of name characters at the current position.
 *
 * @return the name read; empty when the next character is not a name character
 * @throws IOException if reading fails
 */
private String readName() throws IOException
{
  StringBuilder name = new StringBuilder();
  for (char next = peekChar(); isNameChar(next); next = peekChar()) {
    name.append(readChar());
  }
  return name.toString();
}
/**
 * Reads up to the given character and consumes it.
 *
 * Reading also stops at the end of input. Previously only a NUL character was treated
 * as the end, so input without the terminator made the loop append the end-of-input
 * sentinel forever.
 *
 * @param ch the terminating character
 * @return the text in front of the terminator (or in front of the end of input)
 * @throws IOException if reading fails
 */
private String readUntil(char ch) throws IOException
{
  StringBuilder s = new StringBuilder();
  while (peekChar() != 0 && peekChar() != END_OF_CHARS && peekChar() != ch)
    s.append(readChar());
  readChar();
  return s.toString();
}

/**
 * Reads up to the first character that occurs in the given set and consumes it.
 *
 * Reading also stops at the end of input. Previously only a NUL character was treated
 * as the end, so input without any terminator made the loop append the end-of-input
 * sentinel forever.
 *
 * @param sc the set of terminating characters
 * @return the text in front of the terminator (or in front of the end of input)
 * @throws IOException if reading fails
 */
private String readUntil(String sc) throws IOException
{
  StringBuilder s = new StringBuilder();
  while (peekChar() != 0 && peekChar() != END_OF_CHARS && sc.indexOf(peekChar()) == -1)
    s.append(readChar());
  readChar();
  return s.toString();
}
/**
 * Reads one character/entity reference from the input and appends its expansion to {@code s}.
 *
 * <p>One character is consumed first (presumably the leading {@code &} - confirm against the
 * callers), then the reference body is read up to the first of {@code ; & ' " > <}; that
 * terminating character is consumed as well. The body is resolved in this order:
 * <ol>
 *   <li>numeric references ({@code #NNN} decimal, {@code #xHHH} hexadecimal);</li>
 *   <li>entities recorded in {@code declaredEntities};</li>
 *   <li>entities in {@code definedEntities};</li>
 *   <li>the built-in fallback table ({@code nbsp} plus {@link #fallbackEntityCodePoint(String)}).</li>
 * </ol>
 * In XML mode any named entity other than {@code quot}, {@code amp}, {@code apos}, {@code lt}
 * and {@code gt} is additionally reported through {@code error(...)} before it is resolved.
 * An unknown name is appended verbatim as {@code &name} when {@code mustBeWellFormed} is
 * false (it is assumed to be an accidentally unescaped ampersand), and rejected otherwise.
 *
 * @param s the text buffer that receives the expansion
 * @throws IOException if reading from the underlying source fails
 * @throws FHIRFormatError if the reference body is empty, if a numeric reference is not a
 *         valid Unicode code point, or if the name is unknown and {@code mustBeWellFormed} is set
 */
private void parseLiteral(StringBuilder s) throws IOException, FHIRFormatError {
  readChar();
  String c = readUntil(";&'\"><");
  if (c.isEmpty()) {
    throw new FHIRFormatError("Invalid literal declaration following text: " + s);
  }
  if (c.charAt(0) == '#') {
    appendNumericReference(s, c);
    return;
  }
  if (declaredEntities.containsKey(c)) {
    s.append(declaredEntities.get(c));
    return;
  }

  if (xmlMode && !Utilities.existsInList(c, "quot", "amp", "apos", "lt", "gt")) {
    error(I18nConstants.XHTML_XHTML_Entity_Illegal, "&"+c+";");
  }

  if (definedEntities.containsKey("&"+c+";")) {
    s.append(definedEntities.get("&"+c+";"));
    return;
  }
  if (c.equals("nbsp")) {
    s.append(XhtmlNode.NBSP);
    return;
  }

  int codePoint = fallbackEntityCodePoint(c);
  if (codePoint >= 0) {
    s.append((char) codePoint);
  } else if (!mustBeWellFormed) {
    // we guess that this is an accidentally unescaped &
    s.append("&"+c);
  } else {
    throw new FHIRFormatError("unable to parse character reference '" + c + "'' (last text = '" + lastText + "'" + descLoc());
  }
}

/**
 * Appends the character denoted by a numeric character reference.
 *
 * <p>A body that is neither a decimal number nor {@code x} followed by a hexadecimal number
 * (including the bare {@code #}) appends nothing, matching how malformed numeric references
 * have always been treated here.
 *
 * @param s the text buffer that receives the character
 * @param c the reference body, starting with {@code #}
 * @throws FHIRFormatError if the number parses but is not a valid Unicode code point
 */
private void appendNumericReference(StringBuilder s, String c) throws FHIRFormatError {
  int codePoint;
  if (isInteger(c.substring(1), 10)) {
    codePoint = Integer.parseInt(c.substring(1));
  } else if (c.length() > 1 && c.charAt(1) == 'x' && isInteger(c.substring(2), 16)) {
    // length guard: a bare "#" has no character at index 1
    codePoint = Integer.parseInt(c.substring(2), 16);
  } else {
    return;
  }
  if (!Character.isValidCodePoint(codePoint)) {
    // negative values and values above U+10FFFF would otherwise surface as IllegalArgumentException
    throw new FHIRFormatError("Invalid character reference '&" + c + ";': not a valid Unicode code point" + descLoc());
  }
  s.appendCodePoint(codePoint);
}

/**
 * Built-in table of named entities, used only when a name is not found in
 * {@code definedEntities}.
 *
 * <p>This table predates the generated {@code definedEntities} map and is probably
 * redundant with it, but the two have not been cross checked (GG 8-Nov 2021).
 *
 * @param name the entity name without the surrounding {@code &} and {@code ;}
 * @return the code point for the entity, or {@code -1} if the name is not in the table
 */
private static int fallbackEntityCodePoint(String name) {
  switch (name) {
    case "apos": return '\'';
    case "quot": return '"';
    case "amp": return '&';
    case "gt": return '>';
    case "lt": return '<';
    case "lsquo": return 8216; // left single quotation mark, U+2018 ISOnum
    case "rsquo": return 8217; // right single quotation mark, U+2019 ISOnum
    case "copy": return 169;
    case "reg": return 174;
    case "sect": return 0xA7;
    case "fnof": return 402; // latin small f with hook = function = florin, U+0192 ISOtech
    case "Alpha": return 913; // greek capital letter alpha, U+0391
    case "Beta": return 914; // greek capital letter beta, U+0392
    case "Gamma": return 915; // greek capital letter gamma, U+0393 ISOgrk3
    case "Delta": return 916; // greek capital letter delta, U+0394 ISOgrk3
    case "Epsilon": return 917; // greek capital letter epsilon, U+0395
    case "Zeta": return 918; // greek capital letter zeta, U+0396
    case "Eta": return 919; // greek capital letter eta, U+0397
    case "Theta": return 920; // greek capital letter theta, U+0398 ISOgrk3
    case "Iota": return 921; // greek capital letter iota, U+0399
    case "Kappa": return 922; // greek capital letter kappa, U+039A
    case "Lambda": return 923; // greek capital letter lambda, U+039B ISOgrk3
    case "Mu": return 924; // greek capital letter mu, U+039C
    case "Nu": return 925; // greek capital letter nu, U+039D
    case "Xi": return 926; // greek capital letter xi, U+039E ISOgrk3
    case "Omicron": return 927; // greek capital letter omicron, U+039F
    case "Pi": return 928; // greek capital letter pi, U+03A0 ISOgrk3
    case "Rho": return 929; // greek capital letter rho, U+03A1
    case "Sigma": return 931; // greek capital letter sigma, U+03A3 ISOgrk3
    case "Tau": return 932; // greek capital letter tau, U+03A4
    case "Upsilon": return 933; // greek capital letter upsilon, U+03A5 ISOgrk3
    case "Phi": return 934; // greek capital letter phi, U+03A6 ISOgrk3
    case "Chi": return 935; // greek capital letter chi, U+03A7
    case "Psi": return 936; // greek capital letter psi, U+03A8 ISOgrk3
    case "Omega": return 937; // greek capital letter omega, U+03A9 ISOgrk3
    case "alpha": return 945; // greek small letter alpha, U+03B1 ISOgrk3
    case "beta": return 946; // greek small letter beta, U+03B2 ISOgrk3
    case "gamma": return 947; // greek small letter gamma, U+03B3 ISOgrk3
    case "delta": return 948; // greek small letter delta, U+03B4 ISOgrk3
    case "epsilon": return 949; // greek small letter epsilon, U+03B5 ISOgrk3
    case "zeta": return 950; // greek small letter zeta, U+03B6 ISOgrk3
    case "eta": return 951; // greek small letter eta, U+03B7 ISOgrk3
    case "theta": return 952; // greek small letter theta, U+03B8 ISOgrk3
    case "iota": return 953; // greek small letter iota, U+03B9 ISOgrk3
    case "kappa": return 954; // greek small letter kappa, U+03BA ISOgrk3
    case "lambda": return 955; // greek small letter lambda, U+03BB ISOgrk3
    case "mu": return 956; // greek small letter mu, U+03BC ISOgrk3
    case "nu": return 957; // greek small letter nu, U+03BD ISOgrk3
    case "xi": return 958; // greek small letter xi, U+03BE ISOgrk3
    case "omicron": return 959; // greek small letter omicron, U+03BF NEW
    case "pi": return 960; // greek small letter pi, U+03C0 ISOgrk3
    case "rho": return 961; // greek small letter rho, U+03C1 ISOgrk3
    case "sigmaf": return 962; // greek small letter final sigma, U+03C2 ISOgrk3
    case "sigma": return 963; // greek small letter sigma, U+03C3 ISOgrk3
    case "tau": return 964; // greek small letter tau, U+03C4 ISOgrk3
    case "upsilon": return 965; // greek small letter upsilon, U+03C5 ISOgrk3
    case "phi": return 966; // greek small letter phi, U+03C6 ISOgrk3
    case "chi": return 967; // greek small letter chi, U+03C7 ISOgrk3
    case "psi": return 968; // greek small letter psi, U+03C8 ISOgrk3
    case "omega": return 969; // greek small letter omega, U+03C9 ISOgrk3
    case "thetasym": return 977; // greek small letter theta symbol, U+03D1 NEW
    case "upsih": return 978; // greek upsilon with hook symbol, U+03D2 NEW
    case "piv": return 982; // greek pi symbol, U+03D6 ISOgrk3
    case "bull": return 8226; // bullet = black small circle, U+2022 ISOpub
    case "hellip": return 8230; // horizontal ellipsis = three dot leader, U+2026 ISOpub
    case "prime": return 8242; // prime = minutes = feet, U+2032 ISOtech
    case "Prime": return 8243; // double prime = seconds = inches, U+2033 ISOtech
    case "oline": return 8254; // overline = spacing overscore, U+203E NEW
    case "frasl": return 8260; // fraction slash, U+2044 NEW
    case "weierp": return 8472; // script capital P = power set = Weierstrass p, U+2118 ISOamso
    case "image": return 8465; // blackletter capital I = imaginary part, U+2111 ISOamso
    case "real": return 8476; // blackletter capital R = real part symbol, U+211C ISOamso
    case "trade": return 8482; // trade mark sign, U+2122 ISOnum
    case "alefsym": return 8501; // alef symbol = first transfinite cardinal, U+2135 NEW
    case "larr": return 8592; // leftwards arrow, U+2190 ISOnum
    case "uarr": return 8593; // upwards arrow, U+2191 ISOnum
    case "rarr": return 8594; // rightwards arrow, U+2192 ISOnum
    case "darr": return 8595; // downwards arrow, U+2193 ISOnum
    case "harr": return 8596; // left right arrow, U+2194 ISOamsa
    case "crarr": return 8629; // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
    case "lArr": return 8656; // leftwards double arrow, U+21D0 ISOtech
    case "uArr": return 8657; // upwards double arrow, U+21D1 ISOamsa
    case "rArr": return 8658; // rightwards double arrow, U+21D2 ISOtech
    case "dArr": return 8659; // downwards double arrow, U+21D3 ISOamsa
    case "hArr": return 8660; // left right double arrow, U+21D4 ISOamsa
    case "forall": return 8704; // for all, U+2200 ISOtech
    case "part": return 8706; // partial differential, U+2202 ISOtech
    case "exist": return 8707; // there exists, U+2203 ISOtech
    case "empty": return 8709; // empty set = null set = diameter, U+2205 ISOamso
    case "nabla": return 8711; // nabla = backward difference, U+2207 ISOtech
    case "isin": return 8712; // element of, U+2208 ISOtech
    case "notin": return 8713; // not an element of, U+2209 ISOtech
    case "ni": return 8715; // contains as member, U+220B ISOtech
    case "prod": return 8719; // n-ary product = product sign, U+220F ISOamsb
    case "sum": return 8721; // n-ary summation, U+2211 ISOamsb
    case "minus": return 8722; // minus sign, U+2212 ISOtech
    case "lowast": return 8727; // asterisk operator, U+2217 ISOtech
    case "radic": return 8730; // square root = radical sign, U+221A ISOtech
    case "prop": return 8733; // proportional to, U+221D ISOtech
    case "infin": return 8734; // infinity, U+221E ISOtech
    case "ang": return 8736; // angle, U+2220 ISOamso
    case "and": return 8743; // logical and = wedge, U+2227 ISOtech
    case "or": return 8744; // logical or = vee, U+2228 ISOtech
    case "cap": return 8745; // intersection = cap, U+2229 ISOtech
    case "cup": return 8746; // union = cup, U+222A ISOtech
    case "int": return 8747; // integral, U+222B ISOtech
    case "there4": return 8756; // therefore, U+2234 ISOtech
    case "sim": return 8764; // tilde operator = varies with = similar to, U+223C ISOtech
    case "cong": return 8773; // approximately equal to, U+2245 ISOtech
    case "asymp": return 8776; // almost equal to = asymptotic to, U+2248 ISOamsr
    case "ne": return 8800; // not equal to, U+2260 ISOtech
    case "equiv": return 8801; // identical to, U+2261 ISOtech
    case "le": return 8804; // less-than or equal to, U+2264 ISOtech
    case "ge": return 8805; // greater-than or equal to, U+2265 ISOtech
    case "sub": return 8834; // subset of, U+2282 ISOtech
    case "sup": return 8835; // superset of, U+2283 ISOtech
    case "nsub": return 8836; // not a subset of, U+2284 ISOamsn
    case "sube": return 8838; // subset of or equal to, U+2286 ISOtech
    case "supe": return 8839; // superset of or equal to, U+2287 ISOtech
    case "oplus": return 8853; // circled plus = direct sum, U+2295 ISOamsb
    case "otimes": return 8855; // circled times = vector product, U+2297 ISOamsb
    case "perp": return 8869; // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
    case "sdot": return 8901; // dot operator, U+22C5 ISOamsb
    case "lceil": return 8968; // left ceiling = apl upstile, U+2308 ISOamsc
    case "rceil": return 8969; // right ceiling, U+2309 ISOamsc
    case "lfloor": return 8970; // left floor = apl downstile, U+230A ISOamsc
    case "rfloor": return 8971; // right floor, U+230B ISOamsc
    case "lang": return 9001; // left-pointing angle bracket = bra, U+2329 ISOtech
    case "rang": return 9002; // right-pointing angle bracket = ket, U+232A ISOtech
    case "loz": return 9674; // lozenge, U+25CA ISOpub
    case "spades": return 9824; // black spade suit, U+2660 ISOpub
    case "clubs": return 9827; // black club suit = shamrock, U+2663 ISOpub
    case "hearts": return 9829; // black heart suit = valentine, U+2665 ISOpub
    case "diams": return 9830; // black diamond suit, U+2666 ISOpub
    case "ndash": return 8211;
    case "mdash": return 8212;
    case "ldquo": return 8220; // left double quotation mark, U+201C (was 8221, the right quote)
    case "rdquo": return 8221; // right double quotation mark, U+201D (was the double literal 201D, i.e. U+00C9)
    case "frac14": return 188;
    default: return -1;
  }
}
/**
 * Tells whether a string can be parsed as an {@code int} in the given radix.
 *
 * <p>The decision is delegated to {@link Integer#parseInt(String, int)}, so a leading
 * {@code +} or {@code -} sign is accepted and values outside the {@code int} range are not.
 *
 * @param s the candidate text; {@code null} and the empty string are not integers
 * @param base the radix to parse in
 * @return {@code true} if {@code Integer.parseInt(s, base)} succeeds, {@code false} otherwise
 */
private boolean isInteger(String s, int base) {
  try {
    Integer.parseInt(s, base);
    return true;
  } catch (NumberFormatException ignored) {
    // parseInt reports every rejection (null, empty, bad digit, bad radix, overflow) this way
    return false;
  }
}
/**
 * Parses the given source as a fragment and returns the children of the parsed node.
 *
 * <p>The source text is concatenated between two string literals, passed to
 * {@code parseFragment}, and the result of {@code getChildNodes()} on the returned node
 * is handed back to the caller.
 *
 * <p>NOTE(review): in this copy of the file both wrapper literals are split across lines,
 * which is not valid Java, so their original content is not visible here. Given the local
 * name {@code div} they presumably held an enclosing element's open and close tags -
 * confirm against the upstream source. The raw {@code List} return type may likewise have
 * lost its type argument.
 *
 * @param source the fragment text to parse
 * @return the child nodes of the node produced by {@code parseFragment}
 * @throws IOException declared by this method; presumably raised by {@code parseFragment} on read failure
 * @throws FHIRException declared by this method; presumably raised by {@code parseFragment} on parse failure
 */
public List parseMDFragment(String source) throws IOException, FHIRException {
XhtmlNode div = parseFragment( "
"+source+"
");
return div.getChildNodes();
}
public List parseMDFragmentStripParas(String source) throws IOException, FHIRException {
XhtmlNode div = parseFragment( "