All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.htmlparser.lexer.Lexer Maven / Gradle / Ivy

// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v $
// $Author: derrickoswald $
// $Date: 2005/05/15 11:49:04 $
// $Revision: 1.39 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.lexer;

import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.NodeFactory;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.ParserException;

import com.frameworkset.util.RegexUtil;

/**
 * This class parses the HTML stream into nodes.
 * There are three major types of nodes (lexemes):
 * 
 * <ul>
 * <li>Remark</li>
 * <li>Text</li>
 * <li>Tag</li>
 * </ul>
* Each time nextNode() is called, another node is returned until * the stream is exhausted, and null is returned. */ public class Lexer implements Serializable, NodeFactory { protected JspTagAware jspTagAware; // public static final String jsp_custom_tag_pattern = "<[A-Za-z][\\w]*:[^/]+[/]?>"; public static final String jsp_custom_tag_pattern = "<[A-Za-z][\\w]*:.+[/]?>"; /** * The page lexemes are retrieved from. */ protected Page mPage; /** * The current position on the page. */ protected Cursor mCursor; /** * The factory for new nodes. */ protected NodeFactory mFactory; /** * Line number to trigger on. * This is tested on each nextNode() call, as a debugging aid. * Alter this value and set a breakpoint on the guarded statement. * Remember, these line numbers are zero based, while most editors are * one based. * @see #nextNode */ protected static int mDebugLineTrigger = -1; /** * Creates a new instance of a Lexer. */ public Lexer () { this (new Page ("")); } /** * Creates a new instance of a Lexer. * @param page The page with HTML text. */ public Lexer (Page page) { setPage (page); setCursor (new Cursor (page, 0)); setNodeFactory (this); } /** * Creates a new instance of a Lexer. * @param text The text to parse. */ public Lexer (String text) { this (new Page (text)); } /** * Creates a new instance of a Lexer. * @param connection The url to parse. * @exception ParserException If an error occurs opening the connection. */ public Lexer (URLConnection connection) throws ParserException { this (new Page (connection)); } /** * Reset the lexer to start parsing from the beginning again. * The underlying components are reset such that the next call to * nextNode() will return the first lexeme on the page. */ public void reset () { getPage ().reset (); setCursor (new Cursor (getPage (), 0)); } /** * Get the page this lexer is working on. * @return The page that nodes are being read from. 
*/ public Page getPage () { return (mPage); } /** * Set the page this lexer is working on. * @param page The page that nodes will be read from. */ public void setPage (Page page) { if (null == page) throw new IllegalArgumentException ("page cannot be null"); // todo: sanity checks mPage = page; } /** * Get the current scanning position. * @return The lexer's cursor position. */ public Cursor getCursor () { return (mCursor); } /** * Set the current scanning position. * @param cursor The lexer's new cursor position. */ public void setCursor (Cursor cursor) { if (null == cursor) throw new IllegalArgumentException ("cursor cannot be null"); // todo: sanity checks mCursor = cursor; } /** * Get the current node factory. * @return The lexer's node factory. */ public NodeFactory getNodeFactory () { return (mFactory); } /** * Set the current node factory. * @param factory The node factory to be used by the lexer. */ public void setNodeFactory (NodeFactory factory) { if (null == factory) throw new IllegalArgumentException ("node factory cannot be null"); mFactory = factory; } /** * Get the current cursor position. * @return The current character offset into the source. */ public int getPosition () { return (getCursor ().getPosition ()); } /** * Set the current cursor position. * @param position The new character offset into the source. */ public void setPosition (int position) { // todo: sanity checks getCursor ().setPosition (position); } /** * Get the current line number. * @return The line number the lexer's working on. */ public int getCurrentLineNumber () { return (getPage ().row (getCursor ())); } /** * Get the current line. * @return The string the lexer's working on. */ public String getCurrentLine () { return (getPage ().getLine (getCursor ())); } /** * Get the next node from the source. * @return A Remark, Text or Tag, or null if no * more lexemes are present. * @exception ParserException If there is a problem with the * underlying page. 
*/ public Node nextNode () throws ParserException { return nextNode (false); } /** * Get the next node from the source. * @param quotesmart If true, strings ignore quoted contents. * @return A Remark, Text or Tag, or null if no * more lexemes are present. * @exception ParserException If there is a problem with the * underlying page. */ public Node nextNode (boolean quotesmart) throws ParserException { int start; char ch; Node ret; // debugging suppport if (-1 != mDebugLineTrigger) { Page page = getPage (); int lineno = page.row (mCursor); if (mDebugLineTrigger < lineno) mDebugLineTrigger = lineno + 1; // trigger on next line too } start = mCursor.getPosition (); ch = mPage.getCharacter (mCursor); switch (ch) { case Page.EOF: ret = null; break; case '<': ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) ret = makeString (start, mCursor.getPosition ()); else if ('%' == ch) { mCursor.retreat (); ret = parseJsp (start); } else if ('/' == ch || '%' == ch || Character.isLetter (ch)) { mCursor.retreat (); ret = parseTag (start); } else if ('!' == ch) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) ret = makeString (start, mCursor.getPosition ()); else { if ('>' == ch) // handle ret = makeRemark (start, mCursor.getPosition ()); else { mCursor.retreat (); // remark/tag need this char if ('-' == ch) ret = parseRemark (start, quotesmart); else { mCursor.retreat (); // tag needs prior one too ret = parseTag (start); } } } } else ret = parseString (start, quotesmart); break; case '[': ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) { ret = makeString (start, mCursor.getPosition ()); //ret.setResource(true); } else if ('/' == ch ||Character.isLetter (ch)) { mCursor.retreat (); ret = parseTag (start); ret.setResource(true); } else if ('!' == ch) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) { ret = makeString (start, mCursor.getPosition ()); //ret.setResource(true); } else { if (']' == ch) // handle [!] 
{ ret = makeRemark (start, mCursor.getPosition ()); ret.setResource(true); } else { mCursor.retreat (); // remark/tag need this char if ('-' == ch) { ret = parseRemark (start, quotesmart); ret.setResource(true); } else { mCursor.retreat (); // tag needs prior one too ret = parseTag (start); ret.setResource(true); } } } } else ret = parseString (start, quotesmart); break; default: mCursor.retreat (); // string needs to see leading foreslash ret = parseString (start, quotesmart); break; } return (ret); } /** * Advance the cursor through a JIS escape sequence. * @param cursor A cursor positioned within the escape sequence. * @exception ParserException If a problem occurs reading from the source. */ protected void scanJIS (Cursor cursor) throws ParserException { boolean done; char ch; int state; done = false; state = 0; while (!done) { ch = mPage.getCharacter (cursor); if (Page.EOF == ch) done = true; else switch (state) { case 0: if (0x1b == ch) // escape state = 1; break; case 1: if ('(' == ch) state = 2; else state = 0; break; case 2: if ('J' == ch) done = true; else state = 0; break; default: throw new IllegalStateException ("state " + state); } } } /** * Parse a string node. * Scan characters until "</", "<%", "<!" or < followed by a * letter is encountered, or the input stream is exhausted, in which * case null is returned. * @param start The position at which to start scanning. * @param quotesmart If true, strings ignore quoted contents. * @return The parsed node. * @exception ParserException If a problem occurs reading from the source. 
*/ protected Node parseString (int start, boolean quotesmart) throws ParserException { boolean done; char ch; char quote; done = false; quote = 0; while (!done) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if (0x1b == ch) // escape { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('$' == ch) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('B' == ch) scanJIS (mCursor); else { mCursor.retreat (); mCursor.retreat (); } } else mCursor.retreat (); } else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch))) quote = ch; // enter quoted state // patch from Gernot Fricke to handle escaped closing quote else if (quotesmart && (0 != quote) && ('\\' == ch)) { ch = mPage.getCharacter (mCursor); // try to consume escape if ((Page.EOF != ch) && ('\\' != ch) // escaped backslash && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) mCursor.retreat(); // unconsume char if char not an escape } else if (quotesmart && (ch == quote)) quote = 0; // exit quoted state else if (quotesmart && (0 == quote) && (ch == '/')) { // handle multiline and double slash comments (with a quote) // in script like: // I can't handle single quotations. ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('/' == ch) { do ch = mPage.getCharacter (mCursor); while ((Page.EOF != ch) && ('\n' != ch)); } else if ('*' == ch) { do { do ch = mPage.getCharacter (mCursor); while ((Page.EOF != ch) && ('*' != ch)); ch = mPage.getCharacter (mCursor); if (ch == '*') mCursor.retreat (); } while ((Page.EOF != ch) && ('/' != ch)); } else mCursor.retreat (); } else if ((0 == quote) ) { if('<' == ch) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; // the order of these tests might be optimized for speed: else if ('/' == ch || Character.isLetter (ch) || '!' 
== ch || '%' == ch) { done = true; mCursor.retreat (); mCursor.retreat (); } else { // it's not a tag, so keep going, but check for quotes mCursor.retreat (); } } else if('[' == ch) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; // the order of these tests might be optimized for speed: else if ('/' == ch || Character.isLetter (ch) || '!' == ch ) { done = true; mCursor.retreat (); mCursor.retreat (); } else { // it's not a tag, so keep going, but check for quotes mCursor.retreat (); } } } } return (makeString (start, mCursor.getPosition ())); } /** * Create a string node based on the current cursor and the one provided. * @param start The starting point of the node. * @param end The ending point of the node. * @exception ParserException If the nodefactory creation of the text * node fails. * @return The new Text node. */ protected Node makeString (int start, int end) throws ParserException { int length; Node ret; length = end - start; if (0 != length) // got some characters ret = getNodeFactory ().createStringNode ( this.getPage (), start, end); else ret = null; return (ret); } /** * Generate a whitespace 'attribute', * @param attributes The list so far. * @param bookmarks The array of positions. */ private void whitespace (List attributes, int[] bookmarks) { if (bookmarks[1] > bookmarks[0]) attributes.add (new PageAttribute ( mPage, -1, -1, bookmarks[0], bookmarks[1], (char)0)); } /** * Generate a standalone attribute -- font. * @param attributes The list so far. * @param bookmarks The array of positions. */ private void standalone (List attributes, int[] bookmarks) { attributes.add (new PageAttribute ( mPage, bookmarks[1], bookmarks[2], -1, -1, (char)0)); } /** * Generate an empty attribute -- color=. * @param attributes The list so far. * @param bookmarks The array of positions. 
*/ private void empty (List attributes, int[] bookmarks) { attributes.add (new PageAttribute ( mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, -1, (char)0)); } /** * Generate an unquoted attribute -- size=1. * @param attributes The list so far. * @param bookmarks The array of positions. */ private void naked (List attributes, int[] bookmarks) { attributes.add (new PageAttribute ( mPage, bookmarks[1], bookmarks[2], bookmarks[3], bookmarks[4], (char)0)); } /** * Generate an single quoted attribute -- width='100%'. * @param attributes The list so far. * @param bookmarks The array of positions. */ private void single_quote (List attributes, int[] bookmarks) { attributes.add (new PageAttribute ( mPage, bookmarks[1], bookmarks[2], bookmarks[4] + 1, bookmarks[5], '\'')); } /** * Generate an double quoted attribute -- CONTENT="Test Development". * @param attributes The list so far. * @param bookmarks The array of positions. */ private void double_quote (List attributes, int[] bookmarks) { attributes.add (new PageAttribute ( mPage, bookmarks[1], bookmarks[2], bookmarks[5] + 1, bookmarks[6], '"')); } /** * Parse a tag. * Parse the name and attributes from a start tag.

* From the * HTML 4.01 Specification, W3C Recommendation 24 December 1999 * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2

* * 3.2.2 Attributes

* Elements may have associated properties, called attributes, which may * have values (by default, or set by authors or scripts). Attribute/value * pairs appear before the final ">" of an element's start tag. Any number * of (legal) attribute value pairs, separated by spaces, may appear in an * element's start tag. They may appear in any order.

* In this example, the id attribute is set for an H1 element: * * <H1 id="section1"> * * This is an identified heading thanks to the id attribute * * </H1> * * By default, SGML requires that all attribute values be delimited using * either double quotation marks (ASCII decimal 34) or single quotation * marks (ASCII decimal 39). Single quote marks can be included within the * attribute value when the value is delimited by double quote marks, and * vice versa. Authors may also use numeric character references to * represent double quotes (&#34;) and single quotes (&#39;). * For doublequotes authors can also use the character entity reference * &quot;.

* In certain cases, authors may specify the value of an attribute without * any quotation marks. The attribute value may only contain letters * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45), * periods (ASCII decimal 46), underscores (ASCII decimal 95), * and colons (ASCII decimal 58). We recommend using quotation marks even * when it is possible to eliminate them.

* Attribute names are always case-insensitive.

* Attribute values are generally case-insensitive. The definition of each * attribute in the reference manual indicates whether its value is * case-insensitive.

* All the attributes defined by this specification are listed in the * attribute index.

* *

* This method uses a state machine with the following states: *

 * <ol>
 * <li>state 0 - outside of any attribute</li>
 * <li>state 1 - within attribute name</li>
 * <li>state 2 - equals hit</li>
 * <li>state 3 - within naked attribute value.</li>
 * <li>state 4 - within single quoted attribute value</li>
 * <li>state 5 - within double quoted attribute value</li>
 * <li>state 6 - whitespaces after attribute name could lead to state 2 (=) or state 0</li>
 * </ol>

* The starting point for the various components is stored in an array * of integers that match the initiation point for the states one-for-one, * i.e. bookmarks[0] is where state 0 began, bookmarks[1] is where state 1 * began, etc. * Attributes are stored in a List having * one slot for each whitespace or attribute/value pair. * The first slot is for attribute name (kind of like a standalone attribute). * @param start The position at which to start scanning. * @return The parsed tag. * @exception ParserException If a problem occurs reading from the source. */ protected Node parseTag (int start) throws ParserException { boolean done; char ch; int state; int[] bookmarks; List attributes; done = false; attributes = new ArrayList (); state = 0; bookmarks = new int[8]; bookmarks[0] = mCursor.getPosition (); while (!done) { bookmarks[state + 1] = mCursor.getPosition (); ch = mPage.getCharacter (mCursor); switch (state) { case 0: // outside of any attribute if(Page.EOF == ch) { if ('<' == ch || '[' == ch) { // don't consume the opening angle mCursor.retreat (); bookmarks[state + 1] = mCursor.getPosition (); } whitespace (attributes, bookmarks); done = true; } else if ( ('>' == ch) || ('<' == ch)) { if ('<' == ch) { // don't consume the opening angle mCursor.retreat (); bookmarks[state + 1] = mCursor.getPosition (); } whitespace (attributes, bookmarks); done = true; } else if((']' == ch) || ('[' == ch)) { if ('[' == ch) { // don't consume the opening angle mCursor.retreat (); bookmarks[state + 1] = mCursor.getPosition (); } whitespace (attributes, bookmarks); done = true; } else if (!Character.isWhitespace (ch)) { whitespace (attributes, bookmarks); state = 1; } break; case 1: // within attribute name if ((Page.EOF == ch)) { if ('<' == ch || '[' == ch) { // don't consume the opening angle mCursor.retreat (); bookmarks[state + 1] = mCursor.getPosition (); } standalone (attributes, bookmarks); done = true; } else if (('>' == ch) || ('<' == ch)) { if ('<' == ch) { // don't 
consume the opening angle mCursor.retreat (); bookmarks[state + 1] = mCursor.getPosition (); } standalone (attributes, bookmarks); done = true; } else if ((']' == ch) || ('[' == ch)) { if ('[' == ch) { // don't consume the opening angle mCursor.retreat (); bookmarks[state + 1] = mCursor.getPosition (); } standalone (attributes, bookmarks); done = true; } else if (Character.isWhitespace (ch)) { // whitespaces might be followed by next attribute or an equal sign // see Bug #891058 Bug in lexer. bookmarks[6] = bookmarks[2]; // setting the bookmark[0] is done in state 6 if applicable state = 6; } else if ('=' == ch) state = 2; break; case 2: // equals hit if ((Page.EOF == ch) || ('>' == ch) || (']' == ch)) { empty (attributes, bookmarks); done = true; } else if ('\'' == ch) { state = 4; bookmarks[4] = bookmarks[3]; } else if ('"' == ch) { state = 5; bookmarks[5] = bookmarks[3]; } else if (Character.isWhitespace (ch)) { // collect white spaces after "=" into the assignment string; // do nothing // see Bug #891058 Bug in lexer. } else state = 3; break; case 3: // within naked attribute value if ((Page.EOF == ch) || ('>' == ch) || (']' == ch)) { naked (attributes, bookmarks); done = true; } else if (Character.isWhitespace (ch)) { naked (attributes, bookmarks); bookmarks[0] = bookmarks[4]; state = 0; } break; case 4: // within single quoted attribute value if (Page.EOF == ch) { single_quote (attributes, bookmarks); done = true; // complain? } else if ('\'' == ch) { single_quote (attributes, bookmarks); bookmarks[0] = bookmarks[5] + 1; state = 0; } break; case 5: // within double quoted attribute value if (Page.EOF == ch) { double_quote (attributes, bookmarks); done = true; // complain? } else if ('"' == ch) { double_quote (attributes, bookmarks); bookmarks[0] = bookmarks[6] + 1; state = 0; } break; // patch for lexer state correction by // Gernot Fricke // See Bug # 891058 Bug in lexer. 
case 6: // undecided for state 0 or 2 // we have read white spaces after an attributte name if (Page.EOF == ch) { // same as last else clause standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; mCursor.retreat(); state=0; } else if (Character.isWhitespace (ch)) { // proceed } else if ('=' == ch) // yepp. the white spaces belonged to the equal. { bookmarks[2] = bookmarks[6]; bookmarks[3] = bookmarks[7]; state=2; } else { // white spaces were not ended by equal // meaning the attribute was a stand alone attribute // now: create the stand alone attribute and rewind // the cursor to the end of the white spaces // and restart scanning as whitespace attribute. standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; mCursor.retreat(); state=0; } break; default: throw new IllegalStateException ("how the fuck did we get in state " + state); } } return (makeTag (start, mCursor.getPosition (), attributes)); } /** * Create a tag node based on the current cursor and the one provided. * @param start The starting point of the node. * @param end The ending point of the node. * @param attributes The attributes parsed from the tag. * @exception ParserException If the nodefactory creation of the tag node fails. * @return The new Tag node. */ protected Node makeTag (int start, int end, List attributes) throws ParserException { int length; Node ret; length = end - start; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error return (makeString (start, end)); ret = getNodeFactory ().createTagNode (this.getPage (), start, end, attributes); } else ret = null; return (ret); } /** * Parse a comment. * Parse a remark markup.

* From the * HTML 4.01 Specification, W3C Recommendation 24 December 1999 * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4

* * 3.2.4 Comments

* HTML comments have the following syntax:

* * <!-- this is a comment -->

* <!-- and so is this one,

* which occupies more than one line -->

* * White space is not permitted between the markup declaration * open delimiter("<!") and the comment open delimiter ("--"), * but is permitted between the comment close delimiter ("--") and * the markup declaration close delimiter (">"). * A common error is to include a string of hyphens ("---") within a comment. * Authors should avoid putting two or more adjacent hyphens inside comments. * Information that appears between comments has no special meaning * (e.g., character references are not interpreted as such). * Note that comments are markup.

* *

* This method uses a state machine with the following states: *

 * <ol>
 * <li>state 0 - prior to the first open delimiter</li>
 * <li>state 1 - prior to the second open delimiter</li>
 * <li>state 2 - prior to the first closing delimiter</li>
 * <li>state 3 - prior to the second closing delimiter</li>
 * <li>state 4 - prior to the terminating &gt;</li>
 * </ol>

* All comment text (everything excluding the < and >), is included * in the remark text. * We allow terminators like --!> even though this isn't part of the spec. * @param start The position at which to start scanning. * @param quotesmart If true, strings ignore quoted contents. * @return The parsed node. * @exception ParserException If a problem occurs reading from the source. */ protected Node parseRemark (int start, boolean quotesmart) throws ParserException { boolean done; char ch; int state; done = false; state = 0; while (!done) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else switch (state) { case 0: // prior to the first open delimiter if ('>' == ch || ']' == ch) done = true; if ('-' == ch) state = 1; else return (parseString (start, quotesmart)); break; case 1: // prior to the second open delimiter if ('-' == ch) { // handle because netscape does ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('>' == ch || ']' == ch) done = true; else { mCursor.retreat (); state = 2; } } else return (parseString (start, quotesmart)); break; case 2: // prior to the first closing delimiter if ('-' == ch) state = 3; else if (Page.EOF == ch) return (parseString (start, quotesmart)); // no terminator break; case 3: // prior to the second closing delimiter if ('-' == ch) state = 4; else state = 2; break; case 4: // prior to the terminating > if ('>' == ch || ']' == ch) done = true; else if (('!' == ch) || ('-' == ch) || Character.isWhitespace (ch)) { // stay in state 4 } else state = 2; break; default: throw new IllegalStateException ("how the fuck did we get in state " + state); } } return (makeRemark (start, mCursor.getPosition ())); } /** * Create a remark node based on the current cursor and the one provided. * @param start The starting point of the node. * @param end The ending point of the node. * @exception ParserException If the nodefactory creation of the remark node fails. * @return The new Remark node. 
*/ protected Node makeRemark (int start, int end) throws ParserException { int length; Node ret; length = end - start; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error return (makeString (start, end)); ret = getNodeFactory ().createRemarkNode (this.getPage (), start, end); } else ret = null; return (ret); } /** * Parse a java server page node. * Scan characters until "%>" is encountered, or the input stream is * exhausted, in which case null is returned. * @param start The position at which to start scanning. * @return The parsed node. * @exception ParserException If a problem occurs reading from the source. */ protected Node parseJsp (int start) throws ParserException { boolean done; char ch; int state; List attributes; int code; done = false; state = 0; code = 0; attributes = new ArrayList (); // <%xyz%> // 012223d // <%=xyz%> // 0122223d // <%@xyz%d // 0122223d while (!done) { ch = mPage.getCharacter (mCursor); switch (state) { case 0: // prior to the percent switch (ch) { case '%': // <% state = 1; break; // case Page.EOF: // <\0 // case '>': // <> default: done = true; break; } break; case 1: // prior to the optional qualifier switch (ch) { case Page.EOF: // <%\0 case '>': // <%> done = true; break; case '=': // <%= case '@': // <%@ code = mCursor.getPosition (); attributes.add (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0)); state = 2; break; default: // <%x code = mCursor.getPosition () - 1; attributes.add (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0)); state = 2; break; } break; case 2: // prior to the closing percent switch (ch) { case Page.EOF: // <%x\0 case '>': // <%x> done = true; break; case '\'': case '"':// <%???" 
state = ch; break; case '%': // <%???% state = 3; break; default: // <%???x break; } break; case 3: switch (ch) { case Page.EOF: // <%x??%\0 done = true; break; case '>': state = 4; done = true; break; default: // <%???%x state = 2; break; } break; case '"': switch (ch) { case Page.EOF: // <%x??"\0 done = true; break; case '"': state = 2; break; default: // <%???'??x break; } break; case '\'': switch (ch) { case Page.EOF: // <%x??'\0 done = true; break; case '\'': state = 2; break; default: // <%???"??x break; } break; default: throw new IllegalStateException ("how the fuck did we get in state " + state); } } if (4 == state) // normal exit { if (0 != code) { state = mCursor.getPosition () - 2; // reuse state attributes.add (new PageAttribute (mPage, code, state, -1, -1, (char)0)); attributes.add (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0)); } else throw new IllegalStateException ("jsp with no code!"); } else return (parseString (start, true)); // hmmm, true? return (makeTag (start, mCursor.getPosition (), attributes)); } /** * Return CDATA as a text node. * According to appendix * B.3.2 Specifying non-HTML data of the * HTML 4.01 Specification:
* * Element content
* When script or style data is the content of an element (SCRIPT and STYLE), * the data begins immediately after the element start tag and ends at the * first ETAGO ("</") delimiter followed by a name start character ([a-zA-Z]); * note that this may not be the element's end tag. * Authors should therefore escape "</" within the content. Escape mechanisms * are specific to each scripting or style sheet language. *
* @return The TextNode of the CDATA or null if none. * @exception ParserException If a problem occurs reading from the source. */ public Node parseCDATA () throws ParserException { return (parseCDATA (false)); } private static final String scriptendtag = "script>"; private static final String styleendtag = "style>"; public Node parseCompositeCDATA_ (boolean quotesmart,String endtag,int length) throws ParserException { int start; int state; boolean done; char quote; char ch; int end; start = mCursor.getPosition (); state = 0; done = false; quote = 0; while (!done) { ch = mPage.getCharacter (mCursor); switch (state) { case 0: // prior to ETAGO switch (ch) { case Page.EOF: done = true; break; case '\'': if (quotesmart) if (0 == quote) quote = '\''; // enter quoted state else if ('\'' == quote) quote = 0; // exit quoted state break; case '"': if (quotesmart) if (0 == quote) quote = '"'; // enter quoted state else if ('"' == quote) quote = 0; // exit quoted state break; case '\\': if (quotesmart) if (0 != quote) { ch = mPage.getCharacter (mCursor); // try to consume escaped character if (Page.EOF == ch) done = true; else if ( (ch != '\\') && (ch != quote)) mCursor.retreat (); // unconsume char if character was not an escapable char. 
} break; case '/': if (quotesmart) if (0 == quote) { // handle multiline and double slash comments (with a quote) ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('/' == ch) { do ch = mPage.getCharacter (mCursor); while ((Page.EOF != ch) && ('\n' != ch)); } else if ('*' == ch) { do { do ch = mPage.getCharacter (mCursor); while ((Page.EOF != ch) && ('*' != ch)); ch = mPage.getCharacter (mCursor); if (ch == '*') mCursor.retreat (); } while ((Page.EOF != ch) && ('/' != ch)); } else mCursor.retreat (); } break; case '<': if (quotesmart) { if (0 == quote) state = 1; } else state = 1; break; case '[': if (quotesmart) { if (0 == quote) state = 1; } else state = 1; break; default: break; } break; case 1: // < switch (ch) { case Page.EOF: done = true; break; case '/': state = 2; break; default: state = 0; break; } break; case 2: // ' || ch == '<') { script.append(ch); } } if(done) break; // else if(script.toString().toLowerCase().equals("script>")) // { // done = true; // mCursor.retreat (); // mCursor.retreat (); // mCursor.retreat (); // mCursor.retreat (); // mCursor.retreat (); // mCursor.retreat (); // // back up to the start of ETAGO // mCursor.retreat (); // mCursor.retreat (); // mCursor.retreat (); // } else if(script.toString().toLowerCase().equals(endtag)) { done = true; for(int ii = 0; ii < length; ii ++) { mCursor.retreat (); } // back up to the start of ETAGO mCursor.retreat (); mCursor.retreat (); mCursor.retreat (); } else state = 0; } else state = 0; break; default: throw new IllegalStateException ("how the fuck did we get in state " + state); } } end = mCursor.getPosition (); Text ret = (Text)(makeString (start, end)); if(ret != null) { String text = ret.getText(); if(text != null && this.jspTagAware != null) { if(text.contains("<%")) this.jspTagAware.setJspTagAware(true); else if(RegexUtil.isContain(text, jsp_custom_tag_pattern)) { this.jspTagAware.setJspTagAware(true); } } } return ret; } // public static void main(String[] 
ars) // { // System.out.println(RegexUtil.isContain("", "<\\w+:\\w+>")); // } public Node parseScriptCDATA (boolean quotesmart) throws ParserException { return parseCompositeCDATA_ ( quotesmart, scriptendtag, scriptendtag.length()-1); } public Node parseStyleCDATA (boolean quotesmart) throws ParserException { return parseCompositeCDATA_ ( quotesmart, styleendtag, styleendtag.length()-1); } /** * Return CDATA as a text node. * Slightly less rigid than {@link #parseCDATA()} this method provides for * parsing CDATA that may contain quoted strings that have embedded * ETAGO ("</") delimiters and skips single and multiline comments. * @param quotesmart If true the strict definition of CDATA is * extended to allow for single or double quoted ETAGO ("</") sequences. * @return The TextNode of the CDATA or null if none. * @see #parseCDATA() * @exception ParserException If a problem occurs reading from the source. */ public Node parseCDATA (boolean quotesmart) throws ParserException { int start; int state; boolean done; char quote; char ch; int end; start = mCursor.getPosition (); state = 0; done = false; quote = 0; while (!done) { ch = mPage.getCharacter (mCursor); switch (state) { case 0: // prior to ETAGO switch (ch) { case Page.EOF: done = true; break; case '\'': if (quotesmart) if (0 == quote) quote = '\''; // enter quoted state else if ('\'' == quote) quote = 0; // exit quoted state break; case '"': if (quotesmart) if (0 == quote) quote = '"'; // enter quoted state else if ('"' == quote) quote = 0; // exit quoted state break; case '\\': if (quotesmart) if (0 != quote) { ch = mPage.getCharacter (mCursor); // try to consume escaped character if (Page.EOF == ch) done = true; else if ( (ch != '\\') && (ch != quote)) mCursor.retreat (); // unconsume char if character was not an escapable char. 
} break; case '/': if (quotesmart) if (0 == quote) { // handle multiline and double slash comments (with a quote) ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('/' == ch) { do ch = mPage.getCharacter (mCursor); while ((Page.EOF != ch) && ('\n' != ch)); } else if ('*' == ch) { do { do ch = mPage.getCharacter (mCursor); while ((Page.EOF != ch) && ('*' != ch)); ch = mPage.getCharacter (mCursor); if (ch == '*') mCursor.retreat (); } while ((Page.EOF != ch) && ('/' != ch)); } else mCursor.retreat (); } break; case '<': if (quotesmart) { if (0 == quote) state = 1; } else state = 1; break; case '[': if (quotesmart) { if (0 == quote) state = 1; } else state = 1; break; default: break; } break; case 1: // < switch (ch) { case Page.EOF: done = true; break; case '/': state = 2; break; default: state = 0; break; } break; case 2: // = args.length) // System.out.println ("usage: java -jar htmllexer.jar "); // else // { // try // { // ConnectionManager manager = Page.getConnectionManager (); // lexer = new Lexer (manager.openConnection (args[0])); // while (null != (node = lexer.nextNode ())) // System.out.println (node.toString ()); // } // catch (ParserException pe) // { // System.out.println (pe.getMessage ()); // if (null != pe.getThrowable ()) // System.out.println (pe.getThrowable ().getMessage ()); // } // } // System.out.println(RegexUtil.isContain("", "<[A-Za-z][\\w]*:[A-Za-z][\\w]*>")); String pattern = "<[A-Za-z][\\w]*:.+[/]?>"; System.out.println(RegexUtil.isContain("", pattern)); System.out.println(RegexUtil.isContain("", pattern)); System.out.println(RegexUtil.isContain("", pattern)); System.out.println(RegexUtil.isContain("", pattern)); System.out.println(RegexUtil.isContain("", pattern)); System.out.println(RegexUtil.isContain("", pattern)); System.out.println(RegexUtil.isContain("", pattern)); System.out.println(RegexUtil.isContain("", pattern)); System.out.println(RegexUtil.isContain("", pattern)); 
System.out.println(RegexUtil.isContain("", pattern));
System.out.println(RegexUtil.isContain("", pattern));
System.out.println(RegexUtil.isContain("", pattern));
// NOTE(review): each probe above passes an empty string as the text to test
// against `pattern` - presumably the sample markup was lost when this listing
// was extracted; confirm against the original source before relying on it.
}

/**
 * Get the JSP tag awareness callback held by this lexer.
 * {@code parseCompositeCDATA_} calls {@code setJspTagAware(true)} on this
 * object when the parsed text contains {@code "<%"} or matches
 * {@code jsp_custom_tag_pattern}, and skips that step when it is null.
 * @return The current value of the {@code jspTagAware} field; the field has
 * no initializer and no constructor visible in this file assigns it, so it
 * may be <code>null</code>.
 */
public JspTagAware getJspTagAware() {
    return jspTagAware;
}

/**
 * Set the JSP tag awareness callback held by this lexer.
 * The value is stored as is; no validation is performed.
 * @param jspTagAware The object to store in the {@code jspTagAware} field.
 */
public void setJspTagAware(JspTagAware jspTagAware) {
    this.jspTagAware = jspTagAware;
}
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy