/* edu.harvard.hul.ois.jhove.module.html.ParseHtml.jj -- Maven / Gradle / Ivy */
/**********************************************************************
* Jhove - JSTOR/Harvard Object Validation Environment
* Copyright 2004 by JSTOR and the President and Fellows of Harvard College
**********************************************************************/
/* javacc grammar for parsing HTML into a List that higher-level
code can handle.
*/
/* JavaCC generator options for this grammar (semantics per the JavaCC
documentation). */
options {
/* Token matching ignores letter case. */
IGNORE_CASE = true;
/* The generated parser reads from a caller-supplied CharStream
implementation rather than a generated one. */
USER_CHAR_STREAM = true;
/* Generate instance (non-static) parser members. */
STATIC = false;
}
PARSER_BEGIN(ParseHtml)
package edu.harvard.hul.ois.jhove.module.html;
import java.util.*;
/* Parser class for the HTML grammar. Only the members written by hand
are declared here; the rest of the class is generated by JavaCC from
the productions below. */
public class ParseHtml
{
/* List shared by the productions. It has no initializer here; the
HtmlDoc production assigns a new list to it, and the productions pass
it to the J* element constructors. */
private List elements;
/* Returns the current value of the elements field (the same list that
HtmlDoc creates and returns). */
public List getElements ()
{
return elements;
}
}
PARSER_END(ParseHtml)
/* Lexical productions start here.

NOTE(review): in this copy of the grammar the angle-bracketed parts of
the lexical specification (token names, regular expressions and any
lexical-state prefixes in front of TOKEN / MORE) appear to have been
lost, so most rules below show only their state transition
(": STATE"). The states that are still visible as transition targets
in this section are IN_DOCTYPE, IN_DOCTYPE2, IN_TAG, IN_PCDATA,
IN_ATTVALUE and DEFAULT. Restore the token definitions from the
upstream grammar before regenerating the parser; the productions
further down refer to token kinds such as LABRACKET and PCDATA that
must be defined here. */
TOKEN:
{
: IN_DOCTYPE |
: IN_TAG
}
MORE:
{
: IN_PCDATA
}
TOKEN:
{
: DEFAULT
}
TOKEN:
{
|
: IN_ATTVALUE
}
TOKEN:
{
", "=", ",", "\""])+ > : IN_TAG |
: IN_TAG |
: IN_TAG
}
TOKEN:
{
|
|
}
TOKEN:
{
"> : DEFAULT
}
TOKEN:
{
: IN_DOCTYPE2
}
TOKEN:
{
"])+ > |
}
/* Characters matched here are discarded by the lexer and never reach
the parser: space, tab, line feed, carriage return and form feed.
NOTE(review): no lexical-state prefix is visible on this rule, so which
states it applies to cannot be confirmed from this copy. */
SKIP : /* white space */
{
" "
| "\t"
| "\n"
| "\r"
| "\f"
}
/* Lexing of comments, using the IN_COMMENT and ENDING_COMMENT states.
The result is delivered as SPECIAL_TOKENs, i.e. it is attached to
regular tokens instead of being handed to the productions.

NOTE(review): the lexical-state prefixes of the rules below and the
pattern of the first rule are not visible in this copy; the
descriptions of which state each rule belongs to are inferred from the
transitions and should be confirmed against the upstream grammar. */

/* Start of a comment (pattern not visible): keep the matched text and
switch to IN_COMMENT. */
MORE :
{
: IN_COMMENT
}
/* "--" ends the accumulated special token and moves to
ENDING_COMMENT. */
SPECIAL_TOKEN :
{
<"--" > : ENDING_COMMENT
}
/* Any single character is accumulated into the token being built. */
MORE :
{
< ~[] >
}
/* Presumably active in ENDING_COMMENT: ">" returns the lexer to
DEFAULT; any other character sends it back to IN_COMMENT. */
SPECIAL_TOKEN :
{
<">" > : DEFAULT |
< ~[">"] > : IN_COMMENT
}
/* Top-level production: parses a whole document.

Assigns a new LinkedList to the parser's elements field, applies the
Element production zero or more times with that list as argument, and
returns the list. The value returned by each Element call is not used
here; presumably the element objects add themselves to the list they
are constructed with -- confirm in the JH* element classes. */
List HtmlDoc () :
{
elements = new LinkedList ();
}
{
/* Production block -- looks a little like Java but isn't */
(Element(elements))*
{ return elements; }
}
/* Parses one item of the document and returns it.

The alternatives are tried in the order written: Doctype, OpenTag,
CloseTag, PCData, XMLDecl. All of them except PCData are guarded by a
two-token syntactic lookahead. The ProcessingInst alternative is
commented out.

The parameter "elements" hides the parser field of the same name inside
this production; it is only used when building the error element.

If a ParseException is thrown while matching, tokens are read until a
resynchronisation point is found and a JHErrorElement describing the
tokens that were passed over is returned instead. */
JHElement Element (List elements) :
{
JHElement elem;
}
{
try {
LOOKAHEAD(2)
elem = Doctype () { return elem; } |
LOOKAHEAD(2)
elem = OpenTag() { return elem; } |
LOOKAHEAD(2)
elem = CloseTag() { return elem; } |
elem = PCData() { return elem; } |
LOOKAHEAD(2)
elem = XMLDecl() { return elem; } /* |
elem = ProcessingInst() */
}
catch (ParseException e) {
/* Error recovery: collect a description of every token read until a
   stopping token is reached. */
StringBuilder errText = new StringBuilder();
for (;;) {
/* Force the lexer back to the DEFAULT state before each read, whatever
   state the failed match left it in. */
token_source.SwitchTo(DEFAULT);
Token tok = getNextToken ();
/* Stop at the next LABRACKET or PCDATA token.
   NOTE(review): the token has already been consumed by getNextToken()
   and is not included in the error text -- confirm that dropping it is
   intended. */
if (tok.kind == LABRACKET || tok.kind == PCDATA) {
break;
}
/* NOTE(review): successive entries are appended without any separator
   between them. */
errText.append("Text = \"").append(tok.image).append("\", Line = ")
.append(tok.beginLine).append(", Column = ").append(tok.beginColumn);
/****** Added GDM 14-Jun-05 to avoid infinite loop ********/
/* A token with an empty image (presumably end of input) also ends the
   loop, after it has been appended above. */
if ("".equals (tok.image)) {
break;
}
/******* End Added GDM 14-Jun-05 to avoid infinite loop ********/
}
return new JHErrorElement(elements, MessageConstants.ERR_HTML_PARSING_ERROR, errText.toString(), true);
}
/* Fallback return after the try/catch; every alternative above already
   returns from its own action. */
{ return elem; }
}
/* Parses an opening tag: a name, zero or more attributes and a tag
closer, and returns a JHOpenTag built from the name's image and
position and the collected attributes.

When TagCloser returns "/", the tag is still returned, but it is
constructed with MessageConstants.WRN_INCORRECT_AUTO_CLOSED_TAG as an
extra argument.

NOTE(review): any token matched before Name() (presumably the opening
bracket) is not visible in this copy of the grammar. The local
"complete" is declared but never used in the visible code. */
JHOpenTag OpenTag () :
{
List attrs = new LinkedList ();
Token name;
String slasher;
boolean complete;
}
{
name = Name () (Attribute(attrs))* slasher = TagCloser ()
{ if ("/".equals (slasher)) {
/* This is a special hack so that a tag closed with "/>" will keep
the whole thing from falling apart, yet will generate an error */
return new JHOpenTag (elements, name.image, attrs,
name.beginLine, name.beginColumn,
MessageConstants.WRN_INCORRECT_AUTO_CLOSED_TAG);
}
else {
return new JHOpenTag (elements, name.image, attrs,
name.beginLine, name.beginColumn);
}
}
}
/* Parses an XML declaration and returns a new JHXmlDecl.

Zero or more attributes are parsed into the local list "attrs", but
that list is not passed to the JHXmlDecl constructor, so the attribute
values are discarded.

NOTE(review): the delimiter tokens around the attribute list are not
visible in this copy of the grammar. */
JHXmlDecl XMLDecl () :
{
List attrs = new LinkedList ();
}
{
(Attribute(attrs))*
{ return new JHXmlDecl (elements); }
}
/* Parses a closing tag and returns a JHCloseTag carrying the tag
name's image and its begin line and column.

NOTE(review): the tokens expected before and after Name() are not
visible in this copy of the grammar. */
JHCloseTag CloseTag () :
{
Token name;
}
{
name = Name ()
{ return new JHCloseTag (elements, name.image,
name.beginLine, name.beginColumn); }
}
/* Returns a JHPCData built from the image and begin position of the
token that is next in the stream when this production is entered
(obtained with getToken(1), which looks at the token without consuming
it).

NOTE(review): the token reference that actually consumes the text
(presumably the PCDATA token) is not visible in this copy of the
grammar. */
JHPCData PCData () :
{
Token tok = getToken(1);
}
{
{ return new JHPCData (elements, tok.image, tok.beginLine, tok.beginColumn); }
}
/* Parses a DOCTYPE declaration: collects the images of zero or more
DoctypeItem matches into a list and returns a JHDoctype constructed
from the parser's elements list and that list.

NOTE(review): the opening and closing tokens of the declaration are
not visible in this copy of the grammar. */
JHDoctype Doctype () :
{
List doctypeElements = new LinkedList ();
}
{
(DoctypeItem (doctypeElements))*
{return new JHDoctype (elements, doctypeElements); }
}
/* This is a last-resort production which consumes a token
and returns an element that will be flagged as an error.

It is written as a JAVACODE production, i.e. its body is plain Java.
It reads exactly one token with getNextToken() and uses that token's
image as the text of the returned JHErrorElement.
NOTE(review): no production in the visible grammar calls it. */
JAVACODE JHErrorElement ConsumeError ()
{
Token tok = getNextToken();
return new JHErrorElement (elements, MessageConstants.ERR_HTML_PARSING_ERROR, tok.image, true);
}
/* Matches one item inside a DOCTYPE declaration and appends the image
of the token that was next on entry (looked up with getToken(1)) to
dtElements. Both alternatives perform the same action.

NOTE(review): the token kind matched by each alternative is not
visible in this copy of the grammar. */
void DoctypeItem (List dtElements) :
{
Token tok = getToken(1);
}
{
{ dtElements.add (tok.image); } |
{ dtElements.add (tok.image); }
}
/* Returns the token that is next in the stream when this production is
entered (looked up with getToken(1)), so that callers can read its
image and position.

NOTE(review): the token reference that consumes the name is not
visible in this copy of the grammar. */
Token Name () :
{
Token tok = getToken(1);
}
{
{ return tok; }
}
/* Returns the image of the token that is next in the stream when this
production is entered (looked up with getToken(1)). Every action in
the body returns that same image.

NOTE(review): the token kinds matched by the alternatives, and possibly
a choice separator between the last two actions, are not visible in
this copy of the grammar. */
String AttrVal () :
{
Token tok = getToken(1);
}
{
{ return tok.image; } |
{ return tok.image; }
{ return tok.image; }
}
/* Parses one attribute and appends a JHAttribute to attrs.

Four forms are tried in order, each behind a two-token lookahead:
  1. namespace, name and value
  2. namespace and name, no value (null is passed as the value)
  3. name and value (null is passed as the namespace)
  4. name only (null is passed for both namespace and value)
The JHAttribute constructor receives, in this order: the name image,
the namespace image, the value, and the begin line and column of the
name token.

NOTE(review): the separator tokens between namespace, name and value
are not visible in this copy of the grammar. */
void Attribute (List attrs) :
{
JHAttribute attval;
Token name;
Token namespace;
String val;
}
{
LOOKAHEAD(2)
namespace = Name() name = Name() val = AttrVal()
{ attval = new JHAttribute (name.image, namespace.image, val,
name.beginLine, name.beginColumn);
attrs.add(attval); } |
LOOKAHEAD(2)
namespace = Name() name = Name()
{ attval = new JHAttribute (name.image, namespace.image,
null,
name.beginLine, name.beginColumn);
attrs.add(attval); } |
LOOKAHEAD(2)
name = Name() val = AttrVal()
{ attval = new JHAttribute (name.image, null, val,
name.beginLine, name.beginColumn);
attrs.add(attval); } |
LOOKAHEAD(2)
name = Name()
{ attval = new JHAttribute (name.image, null, null,
name.beginLine, name.beginColumn);
attrs.add(attval); }
}
/* Matches the end of a tag and returns the image of the token that was
next in the stream on entry (looked up with getToken(1)). OpenTag
compares the returned string with "/" to detect a tag closed in the
self-closing style.

NOTE(review): the tokens matched by the two alternatives are not
visible in this copy of the grammar. */
String TagCloser () :
{
Token tok = getToken (1);
}
{
( ) { return tok.image; } |
{ return tok.image; }
}
/* © 2015 - 2025 Weber Informatics LLC | Privacy Policy */