// edu.harvard.hul.ois.jhove.module.html.HtmlDocDesc Maven / Gradle / Ivy
/**********************************************************************
* Jhove - JSTOR/Harvard Object Validation Environment
* Copyright 2004 by JSTOR and the President and Fellows of Harvard College
**********************************************************************/
package edu.harvard.hul.ois.jhove.module.html;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import edu.harvard.hul.ois.jhove.ErrorMessage;
import edu.harvard.hul.ois.jhove.RepInfo;
import edu.harvard.hul.ois.jhove.module.Utf8BlockMarker;
/**
* This is an abstract class for processing an HTML document that has
* been parsed into a List of HtmlElements. It defines common behavior
* for all supported versions of HTML except XHTML. Subclasses
* modify this base as needed.
*
* @author Gary McGath
*
*/
public abstract class HtmlDocDesc {
/** Metadata for this document. A fresh instance is created at the start
* of every validate() call; the field is null until validate() has run. */
private HtmlMetadata metadata;
/** Generic list of supported tags. For efficiency, this is
* generated only once. Subclasses will need to get a copy
* of this list and make additions or deletions as necessary.
* They must not modify any of the existing
* members of the list. */
protected static HashMap commonTags;
/** List of supported tags for this version of HTML. The subclass
* is responsible for generating this, typically using commonTags
* as a starting point. Keys are looked up by lower-case tag name
* (see init() and doOpenTag()). */
protected Map supportedElements;
/** A representation of the HTML element. Assigned by init(). */
protected HtmlTagDesc htmlElement;
/** A representation of the HEAD element. Assigned by init(). */
protected HtmlTagDesc headElement;
/** A representation of the BODY element. Assigned by init(); may be
* null, which checkElementContext() explicitly allows for. */
protected HtmlTagDesc bodyElement;
/** A representation of the FRAMESET element. Not assigned anywhere in
* this class; it stays null unless a subclass sets it. */
protected HtmlTagDesc framesetElement;
/** Stack of the elements that are currently open. Re-created at the
* start of every validate() call; null until validate() has run. */
private HtmlStack elementStack;
/** Header tags, which are invariant for all HTML versions. */
protected static String[] headings =
{ "h1", "h2", "h3", "h4", "h5", "h6" };
/** Constructor. Performs no initialization itself; subclass constructors
* are expected to assign supportedElements and then call init(). */
public HtmlDocDesc ()
{
}
/**
 * Walks the parsed element list, checking every element against the
 * tag descriptions of this HTML version and collecting document
 * properties into the metadata object.
 *
 * Each open tag is matched against its HtmlTagDesc and pushed onto the
 * element stack; unknown or misplaced tags are reported through
 * <code>info</code> but processing continues. HTML, HEAD and BODY may be
 * implied, in which case they are pushed on demand.
 *
 * @param elements The element list constructed by the parser
 * @param info     The RepInfo object which will be populated
 *                 with messages and validity status
 * @return always <code>true</code>; problems are reported only
 *         through <code>info</code>
 */
public boolean validate (List elements, RepInfo info) {
    // Every run starts from a clean slate.
    metadata = new HtmlMetadata ();
    elementStack = new HtmlStack ();
    elementStack.setHeadElement (headElement);
    elementStack.setBodyElement (bodyElement);
    elementStack.setFramesetElement (framesetElement);

    for (Iterator cursor = elements.iterator (); cursor.hasNext (); ) {
        JHElement current = (JHElement) cursor.next ();
        if (current instanceof JHDoctype) {
            // Nothing to do: the grammar has already rejected a
            // doctype that is not at the top of the document.
            continue;
        }
        if (current instanceof JHOpenTag) {
            doOpenTag ((JHOpenTag) current, info);
            continue;
        }
        if (current instanceof JHCloseTag) {
            doCloseTag ((JHCloseTag) current, info);
            continue;
        }
        if (current instanceof JHErrorElement) {
            doErrorElement ((JHErrorElement) current, info);
            continue;
        }
        if (current instanceof JHPCData) {
            doPCData ((JHPCData) current, info, metadata);
        }
        // Any other kind of element is silently skipped.
    }

    // A TITLE is mandatory, so a HEAD (explicit or implied) must
    // have been encountered somewhere in the document.
    boolean headMissing = !elementStack.isHeadSeen ();
    if (headMissing) {
        info.setMessage (new ErrorMessage
                (MessageConstants.ERR_HEAD_ELE_MISS));
        info.setValid (false);
    }
    return true;
}
/** Returns the metadata for this document.
*
* @return the metadata object created by the most recent call to
*         validate(), or null if validate() has not been called yet
*/
public HtmlMetadata getMetadata ()
{
return metadata;
}
/**
 * Caches the descriptors of the structural HTML, HEAD and BODY elements.
 * Subclass constructors call this once supportedElements has been
 * assigned. A descriptor that is absent from supportedElements is left
 * as null.
 */
protected void init ()
{
    final Map known = supportedElements;
    htmlElement = (HtmlTagDesc) known.get ("html");
    headElement = (HtmlTagDesc) known.get ("head");
    bodyElement = (HtmlTagDesc) known.get ("body");
}
/**
 * Processes one open tag.
 *
 * Steps, in order: report any parser error attached to the tag; push an
 * implied HTML element if the stack is empty and the tag is not "html";
 * look the tag up in supportedElements; for a known tag check its
 * context and (if the document is still well-formed) its attributes,
 * for an unknown tag report it and substitute a temporary descriptor;
 * let the tag contribute to the metadata; finally push the tag unless
 * its content model is empty.
 *
 * @param tag   the open tag delivered by the parser
 * @param info  receives error messages and validity/well-formedness flags
 */
private void doOpenTag (JHOpenTag tag, RepInfo info)
{
    // Fold case with a fixed locale: with the default locale a Turkish
    // environment would turn "TITLE" into "t\u0131tle" and every tag
    // containing an 'I' would be reported as unknown.
    String name = tag.getName ().toLowerCase (java.util.Locale.ENGLISH);
    String where = "Line = " + tag.getLine () +
            ", Column = " + tag.getColumn ();

    String msg = tag.getErrorMessage ();
    if (msg != null) {
        info.setMessage (new ErrorMessage
                (msg, "Name = " + name + ", " + where));
        info.setWellFormed (false);
        // But keep going anyway!
    }

    /* If it's anything but an HTML tag, and the stack is empty,
     * push an "HTML" element. */
    if (elementStack.isEmpty () && !"html".equals (name)) {
        JHOpenTag fakeTag = new JHOpenTag ("html");
        fakeTag.setElement (htmlElement);
        elementStack.push (fakeTag);
    }

    HtmlTagDesc tagDesc = (HtmlTagDesc) supportedElements.get (name);
    if (tagDesc == null) {
        // Unknown tag: the context is not checked, since that would
        // only produce a second, redundant message.
        info.setMessage (new ErrorMessage
                (MessageConstants.ERR_HTML_UNKNOWN_TAG,
                 "Name = " + name + ", " + where));
        info.setValid (false);
        // Make a temporary tag descriptor
        tagDesc = new HtmlTempTagDesc (name);
    }
    else {
        if (!checkElementContext (tag, info)) {
            String toptag = null;
            if (!elementStack.isEmpty ()) {
                toptag = elementStack.top ().getName ();
            }
            info.setMessage (new ErrorMessage
                    (MessageConstants.ERR_HTML_ILLEGAL_TAG,
                     "Name = " + name + ", " +
                     (toptag != null ? "Container = " + toptag + ", " : "") +
                     where));
            info.setValid (false);
        }
        if (info.getWellFormed () == RepInfo.TRUE) {
            checkAttributes (tag, tagDesc, name, where, info);
        }
    }

    tag.processElement (metadata);

    // If the content is empty, then a closing tag isn't permitted
    // (SGML handbook 7.3), so we don't push the open tag.
    // But if it's a temporary tag descriptor, we don't know
    // anything about it, so all guesses are wild. Push it anyway.
    if (tagDesc.isTemp () || !tagDesc.isContentEmpty ()) {
        tag.setElement (tagDesc);
        elementStack.push (tag);
    }
}

/* Validates the attributes of a known tag: every attribute must be
 * defined for the tag and carry a permitted value, and every required
 * attribute must be present. Entities found in attribute values are
 * recorded in the metadata along the way. */
private void checkAttributes (JHOpenTag tag, HtmlTagDesc tagDesc,
        String name, String where, RepInfo info)
{
    List atts = tag.getAttributes ();
    // Accumulates all attribute names for the required-attribute check.
    List attNames = new ArrayList (atts.size ());
    Iterator iter = atts.iterator ();
    while (iter.hasNext ()) {
        JHAttribute att = (JHAttribute) iter.next ();
        String attName = att.getName ();
        attNames.add (attName);
        String attVal = att.getValue ();
        HtmlAttributeDesc attDesc = tagDesc.namedAttDesc (attName);
        if (attDesc == null) {
            info.setMessage (new ErrorMessage
                    (MessageConstants.ERR_HTML_UNDEFINED_ATTRIBUTE,
                     "Name = " + name + ", Attribute = " +
                     attName + ", Line = " + att.getLine () +
                     ", Column = " + att.getColumn ()));
            info.setValid (false);
        }
        else if (!attDesc.valueOK (attName, attVal)) {
            info.setMessage (new ErrorMessage
                    (MessageConstants.ERR_HTML_BAD_VALUE_IN_ATTRIBUTE,
                     "Element = " + name + ", Attribute = " +
                     attName + ", Value = " + attVal +
                     ", Line = " + att.getLine () +
                     ", Column = " + att.getColumn ()));
            info.setValid (false);
        }
        if (attVal != null) {
            recordEntities (tag, attVal);
        }
    }

    // Check if all required attributes were found.
    List missingAtts = tagDesc.missingRequiredAttributes (attNames);
    if (!missingAtts.isEmpty ()) {
        info.setValid (false);
        Iterator miter = missingAtts.iterator ();
        while (miter.hasNext ()) {
            String matt = (String) miter.next ();
            info.setMessage (new ErrorMessage
                    (MessageConstants.ERR_HTML_MISSING_ATTRIBUTE,
                     "Tag = " + name + ", Attribute = " + matt +
                     ", " + where));
        }
    }
}

/* Adds every entity found in an attribute value to the metadata and,
 * for numeric character references, marks the UTF-8 block containing
 * the referenced code point. */
private void recordEntities (JHOpenTag tag, String attVal)
{
    Iterator entIter = tag.getEntities (attVal).iterator ();
    Utf8BlockMarker utf8BM = metadata.getUtf8BlockMarker ();
    while (entIter.hasNext ()) {
        String ent = (String) entIter.next ();
        metadata.addEntity (ent);
        try {
            if (ent.charAt (1) == '#') {
                // Strip the leading "&#" and the trailing terminator.
                String digits = ent.substring (2, ent.length () - 1);
                int entval;
                if (digits.startsWith ("x") || digits.startsWith ("X")) {
                    // Hexadecimal form, e.g. "&#x41;". Previously this
                    // always failed to parse and was silently dropped.
                    entval = Integer.parseInt (digits.substring (1), 16);
                }
                else {
                    entval = Integer.parseInt (digits);
                }
                utf8BM.markBlock (entval);
            }
        }
        catch (Exception e) {
            // Deliberately best-effort: any exception means this is
            // not a well-formed numeric reference, so there is no
            // block to mark.
        }
    }
}
/**
 * Handles a close tag by locating the matching open element on the
 * stack. When a match exists the stack is popped down to it; when none
 * exists an error message is recorded and the document is flagged as
 * not valid.
 *
 * @param tag   the close tag delivered by the parser
 * @param info  receives the error message and validity flag
 */
private void doCloseTag (JHCloseTag tag, RepInfo info)
{
    final String name = tag.getName ();
    final int depth = elementStack.search (name);
    if (depth != -1) {
        // Found the opener: discard everything down to its level.
        elementStack.popTo (depth);
        return;
    }
    // No open element matches this close tag.
    String detail = "Name = " + name + ", Line = " + tag.getLine () +
            ", Column = " + tag.getColumn ();
    info.setMessage (new ErrorMessage
            (MessageConstants.ERR_HTML_CLOSED_TAG_NO_OPEN, detail));
    info.setValid (false);
}
/* Handles an error element produced by the parser by delegating to the
* element's own reportError, passing it the RepInfo for this document. */
private void doErrorElement (JHErrorElement elem, RepInfo info)
{
elem.reportError (info);
}
/**
 * Handles a run of character data.
 *
 * With an empty stack the data precedes all markup; this is reported
 * with the "head element missing" message and the data is dropped.
 * Otherwise the data is accepted if the innermost element is a
 * temporary (unknown) one or allows PCDATA. Failing that, elements
 * whose close tag is optional are popped, never shrinking the stack
 * below two entries, until one that allows PCDATA is found. If none is
 * found the data is reported as illegal.
 *
 * @param elem      the character data element
 * @param info      receives error messages and the validity flag
 * @param metadata  metadata object handed on to the PCDATA element
 */
private void doPCData (JHPCData elem, RepInfo info, HtmlMetadata metadata)
{
    if (elementStack.isEmpty ()) {
        // Text before any element: the title should have come first.
        info.setMessage (new ErrorMessage
                (MessageConstants.ERR_HEAD_ELE_MISS));
        info.setValid (false);
        return;
    }

    HtmlTagDesc current = elementStack.top ().getElement ();
    // Unknown tags are assumed to allow PCDATA.
    boolean accepted = current.isTemp () || current.allowsPCData ();

    // Implicitly close elements with optional end tags. Two entries
    // always stay on the stack so that, for instance, text inside HEAD
    // cannot pop the stack empty.
    while (!accepted
            && !current.isCloseTagRequired ()
            && elementStack.size () > 2) {
        elementStack.popp ();
        current = elementStack.top ().getElement ();
        accepted = current.allowsPCData ();
    }

    if (accepted) {
        elem.processPCData (elementStack, metadata);
        return;
    }

    String position = "Line = " + elem.getLine () +
            ", Column = " + elem.getColumn ();
    info.setMessage (new ErrorMessage
            (MessageConstants.ERR_HTML_BAD_PC_DATA, position));
    info.setValid (false);
}
/**
 * Returns true if the element is permissible at this point.
 * This may pop elements off the stack and push implied tags.
 *
 * The check proceeds in stages:
 * 1. an empty stack accepts only "html";
 * 2. a tag excluded by anything on the stack is rejected;
 * 3. the content model of the top element is advanced until it either
 *    accepts the tag or cannot advance any further;
 * 4. if the top element is HTML or HEAD, an implied HEAD or BODY is
 *    pushed (or the HEAD implicitly closed) when that makes the tag
 *    legal;
 * 5. otherwise, if some enclosing element would accept the tag, the
 *    stack is popped down to it, with a single complaint if an element
 *    whose close tag is required has to be closed implicitly.
 *
 * @param elem  the open tag being checked
 * @param info  receives the message and validity flag raised in stage 5
 * @return true if the tag is acceptable in the (possibly adjusted)
 *         context, false otherwise
 */
private boolean checkElementContext (JHOpenTag elem, RepInfo info)
{
    /* We are guaranteed there's something on the stack
     * unless the tag is "html", but Paranoia Is A Virtue */
    String name = elem.getName ();
    if (elementStack.isEmpty ()) {
        if ("html".equals (name)) {
            return true;
        }
        // This shouldn't happen
        return false;
    }
    if (elementStack.excludesTag (name)) {
        return false;
    }

    JHOpenTag top = elementStack.top ();
    for (;;) {
        if (top.canGetMore () && top.allowsTag (name, this)) {
            top.countComponent ();
            return true;
        }
        if (!top.canAdvance ()) {
            /* Can't advance, can't stay put. */
            break;
        }
        top.advanceIndex ();
    }

    /* Kludgy special-case code for optional tags */
    HtmlTagDesc topElem = top.getElement ();
    if (topElem == htmlElement) {
        if (!elementStack.isHeadSeen () && headElement.allowsTag (name, this)) {
            JHOpenTag fakeTag = new JHOpenTag ("head");
            fakeTag.setElement (headElement);
            elementStack.push (fakeTag);
            return true;
        }
        if (!elementStack.isBodySeen () &&
                bodyElement != null &&
                bodyElement.allowsTag (name, this)) {
            JHOpenTag fakeTag = new JHOpenTag ("body");
            fakeTag.setElement (bodyElement);
            elementStack.push (fakeTag);
            return true;
        }
        return false;
    }
    else if (topElem == headElement) {
        if ("body".equals (name) || "frameset".equals (name)) {
            // Pop implied head end tag. Is this too much
            // special-casing?
            // NOTE(review): doOpenTag also pushes the tag after a
            // successful check, so it may end up on the stack twice --
            // confirm that HtmlStack tolerates this.
            elementStack.popp ();
            elementStack.push (elem);
            return true;
        }
        else if (!elementStack.isBodySeen () &&
                bodyElement != null &&
                bodyElement.allowsTag (name, this)) {
            // Similar to above case except that the head is
            // implicitly terminated.
            elementStack.popp ();
            JHOpenTag fakeTag = new JHOpenTag ("body");
            fakeTag.setElement (bodyElement);
            elementStack.push (fakeTag);
            return true;
        }
        else {
            return false;
        }
    }

    // Pop elements till we find a valid context. If
    // the enclosing element doesn't have an optional close
    // tag, report an error but pop it anyway. But first
    // check if there even is a context to which we can pop things.
    boolean complained = false;
    boolean searchStack = false;
    if (elementStack.size () > 2) {
        Iterator iter = elementStack.iterator ();
        // Discard html element
        iter.next ();
        while (iter.hasNext ()) {
            JHOpenTag otag = (JHOpenTag) iter.next ();
            if (otag.allowsTag (name, this)) {
                searchStack = true;
                break;
            }
        }
    }
    if (searchStack) {
        // We've established we can pop down to something.
        while (elementStack.size () > 2) {
            if (!complained) {
                top = elementStack.top ();
                topElem = top.getElement ();
                if (topElem.isCloseTagRequired ()) {
                    info.setValid (false);
                    info.setMessage (new ErrorMessage
                            (MessageConstants.ERR_HTML_ILLEGAL_TAG,
                             "Name = " + name + ", " +
                             "Container = " + top.getName () + ", " +
                             "Line = " + elem.getLine () + ", Column = " +
                             elem.getColumn ()));
                    // One complaint per offending tag is enough. The
                    // flag used to be tested but never set, so the
                    // message was repeated for every popped element
                    // that requires a close tag.
                    complained = true;
                }
            }
            elementStack.popp ();
            top = elementStack.top ();
            if (top.allowsTag (name, this)) {
                return true;
            }
            if (elementStack.isEmpty ()) {
                break;
            }
        }
    }
    return false;
}
/**
 * Appends every element of a String array, in array order, to the end
 * of a List.
 *
 * @param names  the strings to append
 * @param lst    the list that receives them
 */
protected static void addStringsToList (String[] names, List lst)
{
    final int count = names.length;
    int idx = 0;
    while (idx < count) {
        String entry = names[idx];
        lst.add (entry);
        ++idx;
    }
}
/**
 * Appends to a List an attribute descriptor built from the name alone,
 * i.e. with unrestricted values and type IMPLIED.
 *
 * @param atts  the attribute list being built
 * @param name  the attribute name
 */
protected static void addSimpleAttribute (List atts, String name)
{
    HtmlAttributeDesc simple = new HtmlAttributeDesc (name);
    atts.add (simple);
}
/**
 * Appends to a List an attribute descriptor with unrestricted values
 * (no permitted-value array) and type REQUIRED.
 *
 * @param atts  the attribute list being built
 * @param name  the attribute name
 */
protected static void addRequiredAttribute (List atts, String name)
{
    HtmlAttributeDesc required =
            new HtmlAttributeDesc (name, null, HtmlAttributeDesc.REQUIRED);
    atts.add (required);
}
/**
 * Appends to a List an IMPLIED attribute whose only permitted value is
 * its own name. Such attributes are normally written in HTML without an
 * explicit value; in fact, some (most?) readers won't permit one.
 *
 * @param atts  the attribute list being built
 * @param name  the attribute name, which is also its sole legal value
 */
protected static void addSelfAttribute (List atts, String name)
{
    String[] onlyValue = new String[] { name };
    HtmlAttributeDesc selfNamed =
            new HtmlAttributeDesc (name, onlyValue, HtmlAttributeDesc.IMPLIED);
    atts.add (selfNamed);
}
/**
 * Removes excluded strings from a List. For each array entry, in array
 * order, the first equal element of the list (if any) is removed;
 * entries not present in the list are ignored.
 *
 * @param lst   the list to prune
 * @param strs  the strings to remove
 */
protected static void removeStringsFromList (List lst, String [] strs)
{
    final int count = strs.length;
    int idx = 0;
    while (idx < count) {
        String unwanted = strs[idx];
        lst.remove (unwanted);
        ++idx;
    }
}
/** Pushes an element onto the element stack.
* The stack is created inside validate(), so this must not be called
* before validation has started.
*
* @param tag  the open tag to push
*/
protected void pushElementStack (JHOpenTag tag)
{
elementStack.push (tag);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy