All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.harvard.hul.ois.jhove.module.html.ParseHtml.jj Maven / Gradle / Ivy

/**********************************************************************
 * Jhove - JSTOR/Harvard Object Validation Environment
 * Copyright 2004 by JSTOR and the President and Fellows of Harvard College
 **********************************************************************/
 
/* javacc grammar for parsing HTML into a List that higher-level
   code can handle.
*/

options { 
    IGNORE_CASE = true; 
    USER_CHAR_STREAM = true;
    STATIC = false; 
} 

PARSER_BEGIN(ParseHtml)


package edu.harvard.hul.ois.jhove.module.html;

import java.util.*;

public class ParseHtml
{
    /** Element list exposed through {@link #getElements()}; {@code null} until assigned. */
    private List elements;

    /**
     * Returns the parser's current element list.
     *
     * @return the element list, or {@code null} if none has been assigned yet
     */
    public List getElements ()
    {
        return this.elements;
    }
}

PARSER_END(ParseHtml)


/* Lexical productions start here.   */
/* Two token definitions; after a match the lexer switches to the IN_DOCTYPE
   and IN_TAG lexical states respectively.
   NOTE(review): the token names and match patterns (and any lexical-state
   list before TOKEN) are not visible in this copy -- text between angle
   brackets appears to have been stripped.  Presumably the second token is
   LABRACKET, which Element() refers to; confirm against the original grammar. */
TOKEN:
{
     : IN_DOCTYPE |
     : IN_TAG 
}


/* A MORE rule: the matched text is kept as the prefix of the next token and
   the lexer switches to the IN_PCDATA state.
   NOTE(review): the match pattern and any lexical-state list are not visible
   in this copy (angle-bracketed text appears stripped); restore from the
   original grammar. */
MORE:
{
     : IN_PCDATA
}


/* One token definition; after a match the lexer returns to the DEFAULT state.
   NOTE(review): the token name, pattern and lexical-state list are not
   visible in this copy.  Presumably this is the PCDATA token referenced in
   Element(), active in IN_PCDATA -- confirm against the original grammar. */
TOKEN:
{
     : DEFAULT
}



/* Two alternative token definitions; the second switches the lexer to the
   IN_ATTVALUE state, the first shows no state change.
   NOTE(review): token names, patterns and the lexical-state list are not
   visible in this copy (angle-bracketed text appears stripped); restore from
   the original grammar. */
TOKEN:
{
     |
     : IN_ATTVALUE 
}


/* Three alternative token definitions; each one switches the lexer back to
   the IN_TAG state.  The surviving fragment of the first pattern is the tail
   of a one-or-more character-list expression.
   NOTE(review): token names, the rest of the patterns and the lexical-state
   list are not visible in this copy.  Presumably these are the attribute
   value forms matched while in IN_ATTVALUE -- confirm against the original
   grammar. */
TOKEN:
{
    ", "=", ",", "\""])+ > : IN_TAG |
     : IN_TAG |
     : IN_TAG 
}



/* Three alternative token definitions with no lexical-state change visible.
   NOTE(review): the token names, patterns and lexical-state list are entirely
   missing from this copy (angle-bracketed text appears stripped); restore
   them from the original grammar. */
TOKEN:
{
     |
     |
    
}


/* One token definition; after a match the lexer returns to the DEFAULT state.
   NOTE(review): the token name, most of the pattern and the lexical-state
   list are not visible in this copy (angle-bracketed text appears stripped);
   restore them from the original grammar. */
TOKEN:
{
    "> : DEFAULT
}

 
/* One token definition; after a match the lexer switches to the IN_DOCTYPE2
   state.
   NOTE(review): the token name, pattern and lexical-state list are not
   visible in this copy (angle-bracketed text appears stripped); restore them
   from the original grammar. */
TOKEN:
{
     : IN_DOCTYPE2 
}


/* Two alternative token definitions with no lexical-state change visible.
   The surviving fragment of the first pattern is the tail of a one-or-more
   character-list expression.
   NOTE(review): token names, the rest of the patterns and the lexical-state
   list are not visible in this copy.  Presumably these are the two item
   kinds consumed by DoctypeItem() -- confirm against the original grammar. */
TOKEN:
{
    "])+ > |
    
}


/* Characters the lexer discards instead of turning into tokens: space, tab,
   newline, carriage return and form feed.
   NOTE(review): no lexical-state list is visible before SKIP; other
   definitions in this copy have lost theirs, so confirm against the original
   grammar which states this rule applies to. */
SKIP :  /* white space */
{
  " "
| "\t"
| "\n"
| "\r"
| "\f"
}

/* Comment lexing: the four rules below use the IN_COMMENT and ENDING_COMMENT
   lexical states.  Text matched by MORE rules is carried into the next
   completed token; SPECIAL_TOKEN matches are not handed to the parser as
   regular tokens.
   NOTE(review): the lexical-state lists that say where each rule is active
   are not visible in this copy (angle-bracketed text appears stripped);
   the state named in each comment below is a guess to be confirmed. */

/* Comment opener: keeps the matched text and enters IN_COMMENT.
   NOTE(review): the match pattern itself is missing from this copy. */
MORE :
{
   : IN_COMMENT 
}


/* Two hyphens complete a special token and move to ENDING_COMMENT
   (presumably active in IN_COMMENT). */
SPECIAL_TOKEN :
{
  <"--" > : ENDING_COMMENT
}


/* Any single character is accumulated with no state change
   (presumably active in IN_COMMENT). */
MORE :
{
  < ~[] >
}



/* Presumably active in ENDING_COMMENT: a greater-than sign completes the
   special token and returns to DEFAULT; any other single character completes
   it and goes back to IN_COMMENT. */
SPECIAL_TOKEN :
{
  <">" > : DEFAULT |
  < ~[">"] > : IN_COMMENT
}



/* Top-level production: parses a document into a list.
   Replaces the parser's `elements` field with a fresh LinkedList, matches
   zero or more Element productions (each is handed that list) and returns
   the list.  The value returned by Element() is not stored here, so nothing
   in this production itself adds to the list -- presumably the JH* element
   constructors register themselves in the list they receive; verify.
   NOTE(review): other productions in this copy have lost their terminal
   references; confirm whether a terminal followed the repetition originally. */
List HtmlDoc () :
{
    elements = new LinkedList ();
}
{
/* Production block -- looks a little like Java but isn't */
    (Element(elements))* 
    { return elements; }
}

/* Parses one document element.
   Tries, in order, Doctype, OpenTag, CloseTag, PCData and XMLDecl (all but
   PCData guarded by a 2-token lookahead) and returns whichever matches.
   If a ParseException is thrown inside the try, the catch block performs
   error recovery: it repeatedly forces the lexer back to the DEFAULT state
   and reads the next token, collecting each skipped token's text and
   position, until it reads a LABRACKET or PCDATA token or a token with an
   empty image.  The collected text is returned in a JHErrorElement.
   The `elements` parameter has the same name as the parser field; within
   this production it is what gets passed to the JHErrorElement constructor. */
JHElement Element (List elements) :
{
    JHElement elem;
}
{
    try {
        LOOKAHEAD(2)
        elem = Doctype () { return elem; } |
        LOOKAHEAD(2)
        elem = OpenTag() { return elem; } |
        LOOKAHEAD(2)
        elem = CloseTag() { return elem; }  |
        elem = PCData() { return elem; }  |
        LOOKAHEAD(2)
        elem = XMLDecl() { return elem; } /* |
        elem = ProcessingInst() */ 
    }
    catch (ParseException e) {
        /* Accumulates a description of every token skipped during recovery;
           entries are appended back to back with no separator. */
        StringBuilder errText = new StringBuilder();
        for (;;) {
            /* Reset the lexical state before every read, since the failed
               alternative may have left the lexer in another state. */
            token_source.SwitchTo(DEFAULT);
            Token tok = getNextToken ();
            /* Stop at the next tag opener or text run.  That token has
               already been consumed by getNextToken() and is not recorded
               in errText. */
            if (tok.kind == LABRACKET || tok.kind == PCDATA) {
                break;
            }
            errText.append("Text = \"").append(tok.image).append("\", Line = ")
                .append(tok.beginLine).append(", Column = ").append(tok.beginColumn);
            /****** Added GDM 14-Jun-05 to avoid infinite loop ********/
            /* An empty image (presumably the end-of-input token) ends the
               loop; note it has already been appended above. */
            if ("".equals (tok.image)) {
                break;
            }
            /******* End Added GDM 14-Jun-05 to avoid infinite loop ********/
        }
        return new JHErrorElement(elements, MessageConstants.ERR_HTML_PARSING_ERROR, errText.toString(), true); 
    }
    /* Every alternative above and the catch block return on their own. */
    { return elem; } 
}


/* Parses an opening tag: an element name, zero or more attributes and the
   tag closer.  Returns a JHOpenTag built from the parser's `elements` list,
   the name token's image, the collected attributes and the name's
   line/column.  When TagCloser() returns "/", the tag is still returned but
   is constructed with MessageConstants.WRN_INCORRECT_AUTO_CLOSED_TAG.
   NOTE(review): the terminal that opens the tag is not visible in this copy
   (angle-bracketed token references appear stripped); restore it from the
   original grammar.
   NOTE(review): the local `complete` is declared but never used here. */
JHOpenTag OpenTag () :
{
    List attrs = new LinkedList ();
    Token name;
    String slasher;
    boolean complete;
}
{
     name = Name () (Attribute(attrs))* slasher = TagCloser ()
    { if ("/".equals (slasher)) { 
         /* This is a special hack so that a tag closed with "/>" will keep
            the whole thing from falling apart, yet will generate an error */
         return new JHOpenTag (elements, name.image, attrs, 
           name.beginLine, name.beginColumn,
           MessageConstants.WRN_INCORRECT_AUTO_CLOSED_TAG);
     }
     else {
         return new JHOpenTag (elements, name.image, attrs, 
            name.beginLine, name.beginColumn); 
     }
    }
}

/* Parses an XML declaration: zero or more attributes, then returns a new
   JHXmlDecl built from the parser's `elements` list.  The attributes are
   collected into the local `attrs` list but that list is not passed on, so
   as far as this production shows they are discarded.
   NOTE(review): the terminals that delimit the declaration are not visible
   in this copy (angle-bracketed token references appear stripped); restore
   them from the original grammar. */
JHXmlDecl XMLDecl () :
{
    List attrs = new LinkedList ();
}
{
       (Attribute(attrs))* 
    {  return new JHXmlDecl (elements); }
}

/* Parses a closing tag and returns a JHCloseTag built from the parser's
   `elements` list, the name token's image and the name's line/column.
   NOTE(review): only the Name() call survives in this copy; the terminals
   around it (angle-bracketed token references) appear stripped -- restore
   them from the original grammar. */
JHCloseTag CloseTag () :
{
    Token name;
}
{
      name = Name () 
    { return new JHCloseTag (elements, name.image,
              name.beginLine, name.beginColumn); }
}

/* Parses a run of character data.  The upcoming token is captured with
   getToken(1) (a peek that does not consume it) before the production
   matches, and a JHPCData is returned from the parser's `elements` list and
   that token's image, line and column.
   NOTE(review): the terminal this production matches is not visible in this
   copy (presumably the PCDATA token referenced in Element(); confirm against
   the original grammar). */
JHPCData PCData () :
{
    Token tok = getToken(1);
}
{
     { return new JHPCData (elements, tok.image, tok.beginLine, tok.beginColumn); }
}

/* Parses a DOCTYPE declaration: gathers zero or more items into a fresh
   list via DoctypeItem() and returns a JHDoctype built from the parser's
   `elements` list and that item list.
   NOTE(review): the terminals that open and close the declaration are not
   visible in this copy (angle-bracketed token references appear stripped);
   restore them from the original grammar. */
JHDoctype Doctype () :
{
    List doctypeElements = new LinkedList ();
}
{
      (DoctypeItem (doctypeElements))* 
    {return new JHDoctype (elements, doctypeElements); }
}


/* Fallback production of last resort: swallows exactly one token and
   wraps its text in an element that will be flagged as an error. */
JAVACODE JHErrorElement ConsumeError ()
{
    final String offendingText = getNextToken().image;
    return new JHErrorElement (
        elements,
        MessageConstants.ERR_HTML_PARSING_ERROR,
        offendingText,
        true);
}

/* Matches one item inside a DOCTYPE declaration and appends its text to
   dtElements.  The upcoming token is captured with getToken(1) (a peek that
   does not consume it) before matching; both alternatives add that token's
   image to the list.
   NOTE(review): the terminal matched by each alternative is not visible in
   this copy (angle-bracketed token references appear stripped); restore
   them from the original grammar. */
void DoctypeItem (List dtElements) :
{
    Token tok = getToken(1);
}
{
     { dtElements.add (tok.image); } |
     { dtElements.add (tok.image); }
}


/* Matches a name and returns its Token, so callers can read the image and
   the line/column.  The token is captured with getToken(1) (a peek that
   does not consume it) before the production matches.
   NOTE(review): the terminal this production matches is not visible in this
   copy (angle-bracketed token references appear stripped); restore it from
   the original grammar. */
Token Name () :
{
    Token tok = getToken(1);
}
{
     { return tok; }
}

/* Matches an attribute value and returns its text.  The upcoming token is
   captured with getToken(1) (a peek that does not consume it) before the
   production matches; every visible action returns that token's image.
   NOTE(review): the terminals for the alternatives are not visible in this
   copy, and the last two actions appear back to back without a separator --
   presumably a stripped terminal and alternative bar belong between them.
   Restore from the original grammar. */
String AttrVal () :
{
    Token tok = getToken(1);
}
{
     { return tok.image; } |
     { return tok.image; }
     { return tok.image; }
}

/* Parses one attribute and appends a JHAttribute to attrs.  Four
   alternatives, each guarded by a 2-token lookahead, are tried in order:
     1. namespace, name and value;
     2. namespace and name (value passed as null);
     3. name and value (namespace passed as null);
     4. name only (namespace and value both passed as null).
   The JHAttribute is constructed from the name image, the namespace image,
   the value, and the name token's line and column.
   NOTE(review): the terminals that separate namespace from name and name
   from value are not visible in this copy (angle-bracketed token references
   appear stripped); restore them from the original grammar. */
void Attribute (List attrs) :
{
    JHAttribute attval;
    Token name;
    Token namespace;
    String val;
}
{
    LOOKAHEAD(2)
    namespace = Name()  name = Name()  val = AttrVal()
    { attval = new JHAttribute (name.image, namespace.image, val,
            name.beginLine, name.beginColumn);
      attrs.add(attval); } |
     LOOKAHEAD(2)
    namespace = Name()  name = Name()
    { attval = new JHAttribute (name.image, namespace.image, 
            null,
            name.beginLine, name.beginColumn);
      attrs.add(attval); } |
    LOOKAHEAD(2)
    name = Name()  val = AttrVal()
    { attval = new JHAttribute (name.image, null, val,
            name.beginLine, name.beginColumn); 
      attrs.add(attval); } |
     LOOKAHEAD(2)
    name = Name()
    { attval = new JHAttribute (name.image, null, null,
            name.beginLine, name.beginColumn);
      attrs.add(attval); }
}

/* Matches the end of a tag and returns the image of the token that was next
   in the input when the production started (captured with getToken(1), a
   peek that does not consume it).  OpenTag() compares the result with "/".
   NOTE(review): the terminals of both alternatives, including the contents
   of the parenthesised group in the first one, are not visible in this copy
   (angle-bracketed token references appear stripped); restore them from the
   original grammar. */
String TagCloser () :
{
    Token tok = getToken (1);
}
{
    (   ) { return tok.image; } | 
      { return tok.image; }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy