// sunlabs.brazil.util.LexML (Maven / Gradle / Ivy)
// Show all versions of sunlabs.brazil; Show documentation
/*
* LexML.java
*
* Brazil project web application toolkit,
* export version: 2.3
* Copyright (c) 1999-2006 Sun Microsystems, Inc.
*
* Sun Public License Notice
*
* The contents of this file are subject to the Sun Public License Version
* 1.0 (the "License"). You may not use this file except in compliance with
* the License. A copy of the License is included as the file "license.terms",
* and also available at http://www.sun.com/
*
* The Original Code is from:
* Brazil project web application toolkit release 2.3.
* The Initial Developer of the Original Code is: cstevens.
* Portions created by cstevens are Copyright (C) Sun Microsystems, Inc.
* All Rights Reserved.
*
* Contributor(s): cstevens, suhler.
*
* Version: 2.7
* Created by cstevens on 99/09/29
* Last modified by suhler on 06/12/18 14:47:23
*
* Version Histories:
*
* 2.7 06/12/18-14:47:23 (suhler)
* bug fix: isSingleton() wasn't reporting singletons in some cases
*
* 2.6 04/12/30-12:40:00 (suhler)
* javadoc fixes.
*
* 2.5 04/11/30-15:19:45 (suhler)
* fixed sccs version string
*
* 2.4 04/11/18-12:23:20 (suhler)
* init bug
*
* 2.3 04/04/28-15:55:16 (suhler)
* added methods for better error diagnosis
*
* 2.2 03/08/01-16:17:27 (suhler)
* fixes for javadoc
*
* 2.1 02/10/01-16:37:00 (suhler)
* version change
*
* 1.13 02/07/24-10:49:53 (suhler)
* doc updates
*
* 1.12 02/02/04-14:34:21 (suhler)
* remove "new" close tag finding behavior: allow sub-classes to implement it
* instead.
*
* 1.11 02/01/29-10:08:46 (suhler)
* Changed (new) unescaped ">" behavior back to old as the default. Use
* allowGt() to turn on or off.
*
* 1.10 02/01/23-11:45:09 (suhler)
* - Changed the default behavior of LexML to allow unescaped
* >'s inside of quoted strings. This could be a major incompatibility,
* so it needs thorough testing. The "old" behavior can be
* re-established on a global or per-instance basis
*
* 1.9 01/08/20-16:38:30 (suhler)
* doc lint
*
* 1.8 01/07/30-17:32:58 (suhler)
* added internal "done" flag that allows references to getXXX() after
* nextToken returns false;
*
* 1.7 01/03/08-16:06:24 (cstevens)
* Handle singleton HTML tags like <br/> or <a name=foo/>. Before, these forms
* were interpreted incorrectly as the "br/" tag or the "a" tag with the
* attribute "name" and the value "foo/".
*
* 1.6 01/01/16-14:19:08 (suhler)
* bug! (running off the end of comments)
*
* 1.5 00/05/31-13:52:45 (suhler)
* docs
*
* 1.4 99/10/21-18:24:15 (cstevens)
* Added ability to change a tag into a comment. Used by BSL and Tcl templates,
* to keep track of where the substitution occurred when examining the resultant
* HTML document.
*
* 1.3 99/10/14-12:58:22 (cstevens)
* Documentation
*
* 1.2 99/10/04-16:03:32 (cstevens)
* Documentation for LexML and StringMap.
*
* 1.2 99/09/29-16:12:36 (Codemgr)
* SunPro Code Manager data about conflicts, renames, etc...
* Name history : 1 0 util/LexML.java
*
* 1.1 99/09/29-16:12:35 (cstevens)
* date and time created 99/09/29 16:12:35 by cstevens
*
*/
package sunlabs.brazil.util;
/**
 * This class breaks angle-bracket-separated markup languages like SGML, XML,
 * and HTML into tokens.  It understands three types of tokens:
 * <dl>
 * <dt>tags
 * <dd>Formally known as "entities", tags are delimited by "&lt;" and
 *     "&gt;".  The first word in the tag is the tag name and the
 *     rest of the tag consists of the attributes, a set of
 *     "name=value" or "name" data.  Spaces in tags are not significant
 *     except for quoted values in the attributes.
 *
 * <dt>string
 * <dd>Plain strings that are not in angle-brackets.  Spaces are
 *     significant and preserved.
 *
 * <dt>comments
 * <dd>Delimited by "&lt;!--" and "--&gt;".  All text between the
 *     delimiters is part of the comment.  However, by convention,
 *     some comments actually contain data and so the methods that
 *     extract the fields from tags can be used to attempt to extract
 *     the fields from comments, too.  Spaces are significant and
 *     preserved in a comment, unless the comment is treated as a
 *     tag, in which case the tag rules apply.
 * </dl>
 * <p>
 * This class is intended to parse markup languages, not to validate them.
 * "Malformed" data is interpreted as graciously as possible, in order to
 * extract as much information as possible.  For instance: spaces are
 * allowed between the "&lt;" and the tag name, values in tags do not need
 * to be quoted, and unbalanced quotes are accepted.
 * <p>
 * One type of "malformed" data specifically not handled is a quoted
 * "&gt;" character occurring within the body of a tag.  Even if it is
 * quoted, a "&gt;" in the attributes of a tag will be interpreted as the
 * end of the tag.  For example, the single tag
 * <code>&lt;img src='foo.jpg' alt='xyz &gt; abc'&gt;</code>
 * will be erroneously broken by this parser into two tokens:
 * <ul>
 * <li>the tag <code>&lt;img src='foo.jpg' alt='xyz &gt;</code>
 * <li>the string "abc'&gt;" (and possibly whatever text follows after).
 * </ul>
 * Unfortunately, this type of "malformed" data is known to occur regularly.
 * <p>
 * This class also may not properly parse all well-formed XML tags, such
 * as tags with extended paired delimiters <code>&lt;&amp;</code> and
 * <code>&amp;&gt;</code>, <code>&lt;?</code> and <code>?&gt;</code>, or
 * <code>&lt;![CDATA[</code> and <code>]]&gt;</code>.
 * Additionally, XML tags that have embedded comments containing the
 * "&gt;" character will not be parsed correctly (for example:
 * <code>&lt;!DOCTYPE foo SYSTEM -- a &gt; b -- foo.dtd&gt;</code>),
 * since the "&gt;" in the comment will be interpreted as
 * the end of the declaration tag, for the same reason mentioned
 * above.
 * <p>
 * Note: this behavior may be changed on a per-application basis by
 * overriding the <code>findClose</code> method in a subclass.
 *
 * @author Colin Stevens
 * @version 2.7
 */
public class LexML
{
/**
* The value returned by getType
for comment tokens
*/
public static final int COMMENT = 0;
/**
* The value returned by getType
for tag tokens
*/
public static final int TAG = 1;
/**
* The value returned by getType
for string tokens
*/
public static final int STRING = 2;
private static final String SPACE = " \t\r\n";
private static final String SPACE_EQUAL = SPACE + "=";
int type;
boolean singleton; // Tag of form
boolean done; // set when we run out of tokens
protected String str; // The string we are scanning
int strEnd;
int tokenStart;
int tokenEnd;
int tagStart;
int tagEnd;
int argsStart;
int argsEnd;
/**
* Create a new ML parser, which can be used to iterate over the
* tokens in the given string.
*
* @param str
* The ML to parse.
*/
public
LexML(String str) {
replace(str);
}
/**
* Advances to the next token. The user can then call the other methods
* in this class to get information about the new current token.
*
* @return true
if a token was found, false
* if there were no more tokens left.
*/
public boolean
nextToken()
{
if (tokenEnd >= strEnd) {
done=true;
singleton = false;
type=STRING;
return false;
}
tokenStart = tokenEnd;
if (str.startsWith("", tokenStart + 4);
} catch (StringIndexOutOfBoundsException e) {
tokenEnd = -1;
}
if (tokenEnd < 0) {
str += "-->";
tokenEnd = strEnd;
strEnd += 3;
}
tokenEnd += 3;
type = COMMENT;
} else if (str.charAt(tokenStart) == '<') {
tokenEnd = findClose(tokenStart);
if (tokenEnd < 0) {
str += ">";
strEnd++;
tokenEnd = strEnd;
done=true;
}
tokenEnd++;
type = TAG;
} else {
tokenEnd = str.indexOf('<', tokenStart);
if (tokenEnd < 0) {
tokenEnd = strEnd;
}
type = STRING;
}
return true;
}
/**
* Find the closing tag ">".
* This may be overriden by sub-classes to allow more sophisticated
* behavior.
* @param start The starting index in str
to look for
* the matching >
* @return The index of str that contains the
* matching >
*/
protected int
findClose(int start) {
return str.indexOf('>', start);
}
/**
* Gets the type of the current token.
*
* @return The type.
*
* @see #COMMENT
* @see #TAG
* @see #STRING
*/
public int
getType() {
return type;
}
/**
* A tag is a "singleton" if the closing ">" is preceded by
* a slash (/). (e.g. <br/>
*/
public boolean
isSingleton() {
if (type == TAG) {
split(); // computes singleton as a side effect
}
return singleton;
}
/**
* Gets the string making up the whole current token, including the
* brackets or comment delimiters, if appropriate.
*
* @return The current token.
*/
public String
getToken() {
return done ? null : str.substring(tokenStart, tokenEnd);
}
/**
* Gets the string making up the current token, not including the angle
* brackets or comment delimiters, if appropriate.
*
* @return The body of the token.
*/
public String
getBody() {
if (done) {
return null;
} else if (type == TAG) {
return str.substring(tokenStart + 1, tokenEnd - 1);
} else if (type == COMMENT) {
return str.substring(tokenStart + 4, tokenEnd - 3);
} else {
return str.substring(tokenStart, tokenEnd);
}
}
/**
* Return the string we are currently processing
*/
public String
getString() {
return str;
}
/**
* Return the current processing location.
* @return The character index of the current tag.
*/
public int
getLocation() {
return tokenStart;
}
private void
split() {
if (tagStart <= tokenStart) {
int off = tokenStart + 1;
int end = (type == TAG) ? tokenEnd - 1 : tokenEnd - 3;
tagStart = skip(SPACE, str, off, end);
tagEnd = next(SPACE, str, tagStart, end);
argsStart = skip(SPACE, str, tagEnd, end);
argsEnd = end;
singleton = false;
if (str.charAt(argsEnd - 1) == '/') {
singleton = true;
argsEnd--;
if (argsStart > argsEnd) {
argsStart = argsEnd;
}
if (tagEnd > argsEnd) {
tagEnd = argsEnd;
}
}
}
}
/**
* Gets the tag name at the beginning of the current tag. In other
* words, the tag name for <table border=3>
is
* "table". Any surrounding space characters are removed, but the
* case of the tag is preserved.
*
* For comments, the "tag" is the first word in the comment. This can
* be used to help parse comments that are structured similar to regular
* tags, such as server-side include comments like
* <!--#include virtual="file.inc">
. The tag in
* this case would be "!--#include".
*
* @return The tag name, or null
if the current token
* was a string.
*
*/
public String
getTag() {
if (type == STRING) {
return null;
}
split();
return str.substring(tagStart, tagEnd);
}
/**
* Gets the name/value pairs in the body of the current tag as a
* string.
*
* @return The name/value pairs, or null
if
* the current token was a string.
*/
public String
getArgs() {
if (type == STRING) {
return null;
}
split();
return str.substring(argsStart, argsEnd);
}
/**
* Gets the name/value pairs in the body of the current tag as a
* table.
*
* Any quote marks in the body, either single or double quotes, are
* left on the values, so that the values can be easily re-emitted
* and still form a valid body.
*
* For names that have no associated value in the tag, the value is
* stored as the empty string "". Therefore, the two tags
* <table border>
and
* <table border="">
cannot be distinguished
* based on the result of calling getAttributes
.
*
* @return The table of name/value pairs, or null
if
* the current token was a string.
*/
public StringMap
getAttributes() {
if (type == STRING) {
return null;
}
StringMap map = new StringMap();
split();
int off = argsStart;
int end = argsEnd;
String token = str;
while (off < end) {
int nameStart = off;
int nameEnd = next(SPACE_EQUAL, token, off + 1, end);
String name = token.substring(nameStart, nameEnd);
off = skip(SPACE, token, nameEnd, end);
if ((off < end) && (token.charAt(off) == '=')) {
off = skip(SPACE, token, off + 1, end);
if (off < end) {
char ch = token.charAt(off);
int valueStart = off;
int valueEnd;
if ((ch == '"') || (ch == '\'')) {
off++;
if (off < end) {
off = token.indexOf(ch, off);
if (off < 0) {
off = end;
}
}
off++;
valueEnd = off;
} else {
off = next(SPACE, token, off, end);
valueEnd = off;
}
map.add(name, token.substring(valueStart, valueEnd));
off = skip(SPACE, token, off, end);
continue;
}
}
map.add(name, "");
}
return map;
}
/**
* Gets the rest of the string that has not yet been parsed.
*
* Example use: to help the parser in circumstances such as the HTML
* "<script>" tag where the script body doesn't the obey the rules
* because it might contain lone "<" or ">" characters, which this
* parser would interpret as the start or end of funny-looking tags.
*
* @return The unparsed remainder of the string.
*
* @see #replace
*/
public String
rest() {
return done ? null : str.substring(tokenEnd);
}
/**
* Changes the string that this LexML is parsing.
*
* Example use: the caller decided to parse part of the body,
* and now wants this LexML to pick up and parse the rest of it.
*
* @param str
* The string that this LexML should now parse. Whatever
* string this LexML was parsing is forgotten, and it now
* starts parsing at the beginning of the new string.
*
* @see #rest
*/
public void
replace(String str)
{
this.type=STRING;
this.str = str;
this.tokenStart = 0;
this.tokenEnd = 0;
this.tagStart = 0;
singleton = false;
if (str == null) {
done = true;
} else {
this.strEnd = str.length();
done=(tokenEnd >= strEnd);
}
}
private int
skip(String pattern, String str, int i, int end)
{
for ( ; i < end; i++) {
if (pattern.indexOf(str.charAt(i)) < 0) {
break;
}
}
return i;
}
private int
next(String pattern, String str, int i, int end)
{
for ( ; i < end; i++) {
if (pattern.indexOf(str.charAt(i)) >= 0) {
break;
}
}
return i;
}
}