![JAR search and dependency download from the Maven repository](/logo.png)
edu.harvard.hul.ois.jhove.module.html.HtmlDocDesc Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of html-hul Show documentation
Show all versions of html-hul Show documentation
HTML module developed by Harvard University Library
The newest version!
/**********************************************************************
* Jhove - JSTOR/Harvard Object Validation Environment
* Copyright 2004 by JSTOR and the President and Fellows of Harvard College
**********************************************************************/
package edu.harvard.hul.ois.jhove.module.html;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import edu.harvard.hul.ois.jhove.ErrorMessage;
import edu.harvard.hul.ois.jhove.RepInfo;
import edu.harvard.hul.ois.jhove.module.HtmlModule;
import edu.harvard.hul.ois.jhove.module.utf8.Utf8BlockMarker;
import edu.harvard.hul.ois.jhove.module.xml.HtmlMetadata;
/**
* This is an abstract class for processing an HTML document that has
* been parsed into a List of HtmlElements. It defines common behavior
* for all supported versions of HTML except XHTML. Subclasses
* modify this base as needed.
*
* @author Gary McGath
*
*/
public abstract class HtmlDocDesc {
/** Metadata for this document. */
private HtmlMetadata metadata;
/**
* Generic list of supported tags. For efficiency, this is
* generated only once. Subclasses will need to get a copy
* of this list and make additions or deletions as necessary.
* They must not modify any of the existing
* members of the list.
*/
protected static HashMap commonTags;
/**
* List of supported tags for this version of HTML. The subclass
* is responsible for generating this, typically using commonTags
* as a starting point.
*/
protected Map supportedElements;
/** A representation of the HTML element. */
protected HtmlTagDesc htmlElement;
/** A representation of the HEAD element. */
protected HtmlTagDesc headElement;
/** A representation of the BODY element. */
protected HtmlTagDesc bodyElement;
/** A representation of the FRAMESET element. */
protected HtmlTagDesc framesetElement;
private HtmlStack elementStack;
/** Header tags, which are invariant for all HTML versions. */
protected static String[] headings = { "h1", "h2", "h3", "h4", "h5", "h6" };
/**
* Validates the document and puts interesting properties into the
* RepInfo.
*
* @param elements
* The element list constructed by the parser
* @param info
* The RepInfo object which will be populated
* with properties
*/
public boolean validate(List elements, RepInfo info) {
// As we get to each open tag, we
// check it against the corresponding HtmlTagDesc. If there isn't one,
// we
// mark the document as invalid but continue anyway; we create a
// temporary
// HtmlTagDesc object for the tag that we find, with the closing tag
// indicated
// as optional.
// For each open tag, we push the HtmlTagDesc object onto the stack. We
// check
// if it's in the allowed content of the enclosing element. If not, we
// report it
// as an error but continue with it anyway.
//
// We special-case HTML, HEAD and BODY, which can be implied.
// If a tag is found which requires the content model for one of
// these, and it isn't on the stack, we just push it.
metadata = new HtmlMetadata();
elementStack = new HtmlStack();
elementStack.setHeadElement(headElement);
elementStack.setBodyElement(bodyElement);
elementStack.setFramesetElement(framesetElement);
Iterator iter = elements.iterator();
while (iter.hasNext()) {
JHElement elem = (JHElement) iter.next();
if (elem instanceof JHDoctype) {
// Doctype requires no further processing; grammar
// will have already caught it if it's not at the top
continue;
} else if (elem instanceof JHOpenTag) {
doOpenTag((JHOpenTag) elem, info);
} else if (elem instanceof JHCloseTag) {
doCloseTag((JHCloseTag) elem, info);
} else if (elem instanceof JHErrorElement) {
doErrorElement((JHErrorElement) elem, info);
} else if (elem instanceof JHPCData) {
doPCData((JHPCData) elem, info, metadata);
}
}
// It's a requirement that there be at least a TITLE,
// and thus an implicit or explicit HEAD element.
if (!elementStack.isHeadSeen()) {
info.setMessage(new ErrorMessage(
MessageConstants.HTML_HUL_11));
info.setValid(false);
}
return true;
}
/** Returns the metadata for this document. */
public HtmlMetadata getMetadata() {
return metadata;
}
/**
* Initialization called by subclass constructors after supportedElements
* has been assigned.
*/
protected void init() {
htmlElement = (HtmlTagDesc) supportedElements.get("html");
headElement = (HtmlTagDesc) supportedElements.get("head");
bodyElement = (HtmlTagDesc) supportedElements.get("body");
}
/* Break out open tag code */
private void doOpenTag(JHOpenTag tag, RepInfo info) {
String name = tag.getName().toLowerCase();
boolean unknownTag = false;
ErrorMessage msg = tag.getErrorMessage();
if (msg != null) {
info.setMessage(msg);
info.setWellFormed(false);
// But keep going anyway!
}
/*
* If it's anything but an HTML tag, and the stack is empty,
* push an "HTML" element.
*/
if (elementStack.isEmpty() && !"html".equals(name)) {
JHOpenTag fakeTag = new JHOpenTag("html");
fakeTag.setElement(htmlElement);
elementStack.push(fakeTag);
}
HtmlTagDesc tagDesc = (HtmlTagDesc) supportedElements.get(name);
if (tagDesc == null) {
unknownTag = true;
}
// Check the context only if it's a known tag;
// otherwise we'll issue a redundant error message.
if (!unknownTag && !checkElementContext(tag, info)) {
String toptag = null;
if (!elementStack.isEmpty()) {
JHOpenTag top = elementStack.top();
toptag = top.getName();
}
info.setMessage(new ErrorMessage(
MessageConstants.HTML_HUL_4,
"Name = " + name + ", "
+ (toptag != null ? "Container = " + toptag + ", "
: "")
+ "Line = " + tag.getLine() + ", Column = "
+ tag.getColumn()));
info.setValid(false);
}
if (unknownTag) {
info.setMessage(
new ErrorMessage(MessageConstants.HTML_HUL_6,
"Name = " + name + ", Line = " + tag.getLine()
+ ", Column = " + tag.getColumn()));
info.setValid(false);
// Make a temporary tag descriptor
tagDesc = new HtmlTempTagDesc(name);
}
if (!unknownTag && info.getWellFormed() == RepInfo.TRUE) {
/* Check if the attributes are valid */
List atts = tag.getAttributes();
Iterator iter = atts.iterator();
// Create a list to accumulate all attribute names.
List attNames = new ArrayList(atts.size());
while (iter.hasNext()) {
JHAttribute att = (JHAttribute) iter.next();
String attName = att.getName();
attNames.add(attName);
String attVal = att.getValue();
HtmlAttributeDesc attDesc = tagDesc.namedAttDesc(attName);
if (attDesc == null) {
info.setMessage(new ErrorMessage(
MessageConstants.HTML_HUL_7,
"Name = " + name + ", Attribute = " + attName
+ ", Line = " + att.getLine()
+ ", Column = " + att.getColumn()));
info.setValid(false);
} else {
/* Check if value is legit */
if (!attDesc.valueOK(attName, attVal)) {
info.setMessage(new ErrorMessage(
MessageConstants.HTML_HUL_8,
"Element = " + name + ", Attribute = " + attName
+ ", Value = " + attVal + ", Line = "
+ att.getLine() + ", Column = "
+ att.getColumn()));
info.setValid(false);
}
}
// Extract entities from attribute value
if (attVal != null) {
Iterator entIter = tag.getEntities(attVal).iterator();
Utf8BlockMarker utf8BM = metadata.getUtf8BlockMarker();
while (entIter.hasNext()) {
String ent = (String) entIter.next();
metadata.addEntity(ent);
// If it's a numerical entity, note which UTF8 block
// it's in
try {
if (ent.charAt(1) == '#') {
int entval = Integer.parseInt(
ent.substring(2, ent.length() - 1));
utf8BM.markBlock(entval);
}
} catch (Exception e) {
// Any exception means it's the wrong kind of entity
}
}
}
}
// Check if all required attributes were found.
List missingAtts = tagDesc.missingRequiredAttributes(attNames);
if (!missingAtts.isEmpty()) {
info.setValid(false);
Iterator miter = missingAtts.iterator();
while (miter.hasNext()) {
String matt = (String) miter.next();
info.setMessage(new ErrorMessage(
MessageConstants.HTML_HUL_9,
"Tag = " + name + ", Attribute = " + matt
+ ", Line = " + tag.getLine()
+ ", Column = " + tag.getColumn()));
}
}
}
tag.processElement(metadata);
// If the content is empty, then a closing tag isn't permitted
// (SGML handbook 7.3), so we don't push the open tag.
// But if it's a temporary tag descriptor, we don't know
// anything about it, so all guesses are wild. Push it anyway.
if (tagDesc.isTemp() || !tagDesc.isContentEmpty()) {
tag.setElement(tagDesc);
elementStack.push(tag);
}
}
private void doCloseTag(JHCloseTag tag, RepInfo info) {
String name = tag.getName();
// Dig down into the stack till we find an element which
// matches this. If there's none, report the document
// as not well formed. Also allow for the special case
// of an empty body. (An empty head is illegal.)
int idx = elementStack.search(name);
if (idx == -1) {
info.setMessage(new ErrorMessage(
MessageConstants.HTML_HUL_10,
"Name = " + name + ", Line = " + tag.getLine()
+ ", Column = " + tag.getColumn()));
info.setValid(false);
} else {
// Pop the stack down to the level of the matching tag.
elementStack.popTo(idx);
}
}
private static void doErrorElement(JHErrorElement elem, RepInfo info) {
elem.reportError(info);
}
private void doPCData(JHPCData elem, RepInfo info, HtmlMetadata metadata) {
// Pop any elements that have optional close tags and do not
// allow PCDATA.
if (elementStack.isEmpty()) {
// PCData before any content. This generates an implicit
// html and body if they haven't already been seen.
// It also means the document isn't valid, since the title
// should precede any PCData.
info.setMessage(new ErrorMessage(
MessageConstants.HTML_HUL_12));
info.setValid(false);
return;
}
HtmlTagDesc top = elementStack.top().getElement();
if (top.isTemp() || top.allowsPCData()) {
// We assume that PCData is allowed with unknown tags.
elem.processPCData(elementStack, metadata);
return;
}
// If we can pop elements with optional closing tags till we find
// one that allows PCData, we should do that. But popping the
// stack empty, as could happen if we're in a HEAD element, is
// wrong. So we always allow two elements to remain on the stack.
while (!top.isCloseTagRequired()) {
if (elementStack.size() <= 2) {
break;
}
elementStack.popp();
top = elementStack.top().getElement();
if (top.allowsPCData()) {
elem.processPCData(elementStack, metadata);
return;
}
}
info.setMessage(new ErrorMessage(
MessageConstants.HTML_HUL_13,
"Line = " + elem.getLine() + ", Column = " + elem.getColumn()));
info.setValid(false);
}
/*
* Returns true if the element is permissible at this point.
* This may pop elements off the stack and push implied tags.
*/
private boolean checkElementContext(JHOpenTag elem, RepInfo info) {
/*
* We are guaranteed there's something on the stack
* unless the tag is "html", but Paranoia Is A Virtue
*/
String name = elem.getName();
if (elementStack.isEmpty()) {
return "html".equals(name);
}
if (elementStack.excludesTag(name)) {
return false;
}
JHOpenTag top = elementStack.top();
for (;;) {
if (top.canGetMore() && top.allowsTag(name, this)) {
top.countComponent();
return true;
}
if (!top.canAdvance()) {
/* Can't advance, can't stay put. */
break;
}
top.advanceIndex();
}
/* Kludgy special-case code for optional tags */
HtmlTagDesc topElem = top.getElement();
if (topElem == htmlElement) {
if (!elementStack.isHeadSeen()
&& headElement.allowsTag(name, this)) {
JHOpenTag fakeTag = new JHOpenTag("head");
fakeTag.setElement(headElement);
elementStack.push(fakeTag);
return true;
}
if (!elementStack.isBodySeen() && bodyElement != null
&& bodyElement.allowsTag(name, this)) {
JHOpenTag fakeTag = new JHOpenTag("body");
fakeTag.setElement(bodyElement);
elementStack.push(fakeTag);
return true;
}
return false;
} else if (topElem == headElement) {
if ("body".equals(name) || "frameset".equals(name)) {
// Pop implied head end tag. Is this too much
// special-casing?
elementStack.popp();
elementStack.push(elem);
return true;
} else if (!elementStack.isBodySeen() && bodyElement != null
&& bodyElement.allowsTag(name, this)) {
// Similar to above case except that the head is
// implicitly terminated.
elementStack.popp();
JHOpenTag fakeTag = new JHOpenTag("body");
fakeTag.setElement(bodyElement);
elementStack.push(fakeTag);
return true;
} else {
return false;
}
}
// Pop elements till we find a valid context. If
// the enclosing element doesn't have an optional close
// tag, report an error but pop it anyway. But first
// check if there even is a context to which we can pop things.
boolean complained = false;
boolean searchStack = false;
if (elementStack.size() > 2) {
Iterator iter = elementStack.iterator();
// Discard html element
iter.next();
while (iter.hasNext()) {
JHOpenTag otag = (JHOpenTag) iter.next();
if (otag.allowsTag(name, this)) {
searchStack = true;
break;
}
}
}
if (searchStack) {
// We've established we can pop down to something.
while (elementStack.size() > 2) {
if (!complained) {
top = elementStack.top();
topElem = top.getElement();
if (topElem.isCloseTagRequired()) {
info.setValid(false);
info.setMessage(new ErrorMessage(
MessageConstants.HTML_HUL_5,
"Name = " + name + ", " + "Container = "
+ top.getName() + ", " + "Line = "
+ elem.getLine() + ", Column = "
+ elem.getColumn()));
}
}
elementStack.popp();
top = elementStack.top();
// topElem = top.getElement ();
if (top.allowsTag(name, this)) {
return true;
}
if (elementStack.isEmpty()) {
break;
}
}
}
return false;
}
/** Adds all the Strings in an array to the end of a List. */
protected static void addStringsToList(String[] names, List lst) {
for (int i = 0; i < names.length; i++) {
lst.add(names[i]);
}
}
/**
* Adds an attribute to a List, with unrestricted values and
* type IMPLIED.
*/
protected static void addSimpleAttribute(List atts, String name) {
atts.add(new HtmlAttributeDesc(name));
}
/**
* Adds an attribute to a List, with unrestricted values and
* type REQUIRED.
*/
protected static void addRequiredAttribute(List atts, String name) {
atts.add(new HtmlAttributeDesc(name, null, HtmlAttributeDesc.REQUIRED));
}
/**
* Adds an attribute to a List, with the only permitted value being
* the name of the attribute. This kind of attribute is normally
* represented in HTML without an explicit value; in fact, some (most?)
* readers won't permit an explicit value.
*/
protected static void addSelfAttribute(List atts, String name) {
atts.add(new HtmlAttributeDesc(name, new String[] { name },
HtmlAttributeDesc.IMPLIED));
}
/** Removes excluded strings from a List. */
protected static void removeStringsFromList(List lst, String[] strs) {
for (int i = 0; i < strs.length; i++) {
lst.remove(strs[i]);
}
}
/** Pushes an element onto the element stack. */
protected void pushElementStack(JHOpenTag tag) {
elementStack.push(tag);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy