org.w3c.tidy.ParsePropertyImpl Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jtidy Show documentation
Show all versions of jtidy Show documentation
JTidy is a Java port of HTML Tidy, a HTML syntax checker and pretty printer. Like its non-Java cousin, JTidy can be
used as a tool for cleaning up malformed and faulty HTML. In addition, JTidy provides a DOM interface to the
document that is being processed, which effectively makes you able to use JTidy as a DOM parser for real-world HTML.
The newest version!
/*
* Java HTML Tidy - JTidy
* HTML parser and pretty printer
*
* Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
* Institute of Technology, Institut National de Recherche en
* Informatique et en Automatique, Keio University). All Rights
* Reserved.
*
* Contributing Author(s):
*
* Dave Raggett
* Andy Quick (translation to Java)
* Gary L Peskin (Java development)
* Sami Lempinen (release management)
* Fabrizio Giustina
*
* The contributing author(s) would like to thank all those who
* helped with testing, bug fixes, and patience. This wouldn't
* have been possible without all of you.
*
* COPYRIGHT NOTICE:
*
* This software and documentation is provided "as is," and
* the copyright holders and contributing author(s) make no
* representations or warranties, express or implied, including
* but not limited to, warranties of merchantability or fitness
* for any particular purpose or that the use of the software or
* documentation will not infringe any third party patents,
* copyrights, trademarks or other rights.
*
* The copyright holders and contributing author(s) will not be
* liable for any direct, indirect, special or consequential damages
* arising out of any use of the software or documentation, even if
* advised of the possibility of such damage.
*
* Permission is hereby granted to use, copy, modify, and distribute
* this source code, or portions hereof, documentation and executables,
* for any purpose, without fee, subject to the following restrictions:
*
* 1. The origin of this source code must not be misrepresented.
* 2. Altered versions must be plainly marked as such and must
* not be misrepresented as being the original source.
* 3. This Copyright notice may not be removed or altered from any
* source or altered source distribution.
*
* The copyright holders and contributing author(s) specifically
* permit, without fee, and encourage the use of this source code
* as a component for supporting the Hypertext Markup Language in
* commercial products. If you use this source code in a product,
* acknowledgment is not required but would be appreciated.
*
*/
package org.w3c.tidy;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
/**
* Property parser instances.
* @author Fabrizio Giustina
* @version $Revision $ ($Author $)
*/
public final class ParsePropertyImpl
{
/**
* configuration parser for int values.
*/
static final ParseProperty INT = new ParseInt();
/**
* configuration parser for boolean values.
*/
static final ParseProperty BOOL = new ParseBoolean();
/**
* configuration parser for inverted boolean values.
*/
static final ParseProperty INVBOOL = new ParseInvBoolean();
/**
* configuration parser for char encoding values.
*/
static final ParseProperty CHAR_ENCODING = new ParseCharEncoding();
/**
* configuration parser for name values.
*/
static final ParseProperty NAME = new ParseName();
/**
* configuration parser for tag names.
*/
static final ParseProperty TAGNAMES = new ParseTagNames();
/**
* configuration parser for doctype property.
*/
static final ParseProperty DOCTYPE = new ParseDocType();
/**
* configuration parser for repetated attribute property.
*/
static final ParseProperty REPEATED_ATTRIBUTES = new ParseRepeatedAttribute();
/**
* configuration parser for String values.
*/
static final ParseProperty STRING = new ParseString();
/**
* configuration parser for indent property.
*/
static final ParseProperty INDENT = new ParseIndent();
/**
* configuration parser for css selectors.
*/
static final ParseProperty CSS1SELECTOR = new ParseCSS1Selector();
/**
* configuration parser for new line bytes.
*/
static final ParseProperty NEWLINE = new ParseNewLine();
/**
* don't instantiate.
*/
private ParsePropertyImpl()
{
// unused
}
/**
* parser for integer values.
*/
static class ParseInt implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
int i = 0;
try
{
i = Integer.parseInt(value);
}
catch (NumberFormatException e)
{
configuration.report.badArgument(value, option);
i = -1;
}
return new Integer(i);
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "Integer";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "0, 1, 2, ...";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
return value == null ? "" : value.toString();
}
}
/**
* parser for boolean values.
*/
static class ParseBoolean implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
Boolean b = Boolean.TRUE;
if (value != null && value.length() > 0)
{
char c = value.charAt(0);
if ((c == 't') || (c == 'T') || (c == 'Y') || (c == 'y') || (c == '1'))
{
b = Boolean.TRUE;
}
else if ((c == 'f') || (c == 'F') || (c == 'N') || (c == 'n') || (c == '0'))
{
b = Boolean.FALSE;
}
else
{
configuration.report.badArgument(value, option);
}
}
return b;
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "Boolean";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "y/n, yes/no, t/f, true/false, 1/0";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
if (value == null)
{
return "";
}
return ((Boolean) value).booleanValue() ? "yes" : "no";
}
}
/**
* parser for boolean values.
*/
static class ParseInvBoolean implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
return (((Boolean) BOOL.parse(value, option, configuration)).booleanValue() ? Boolean.FALSE : Boolean.TRUE);
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "Boolean";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "yes, no, true, false";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
if (value == null)
{
return "";
}
return ((Boolean) value).booleanValue() ? "no" : "yes";
}
}
/**
* parse character encoding option. Can be any java encoding name supported by the runtime platform.
*/
static class ParseCharEncoding implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
if ("raw".equalsIgnoreCase(value))
{
// special value for compatibility with tidy c
configuration.rawOut = true;
}
else if (!TidyUtils.isCharEncodingSupported(value))
{
configuration.report.badArgument(value, option);
}
else if ("input-encoding".equalsIgnoreCase(option))
{
configuration.setInCharEncodingName(value);
}
else if ("output-encoding".equalsIgnoreCase(option))
{
configuration.setOutCharEncodingName(value);
}
else if ("char-encoding".equalsIgnoreCase(option))
{
configuration.setInCharEncodingName(value);
configuration.setOutCharEncodingName(value);
}
return null;
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "Encoding";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
// ascii, latin1, raw, utf-8, iso2022, mac, utf-16, utf-16be, utf-16le, big5, shiftjis
return "Any valid java char encoding name";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
if ("output-encoding".equalsIgnoreCase(option))
{
return configuration.getOutCharEncodingName();
}
// for input-encoding or char-encoding
return configuration.getInCharEncodingName();
}
}
/**
* parser for name values (a string excluding whitespace).
*/
static class ParseName implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
StringTokenizer t = new StringTokenizer(value);
String rs = null;
if (t.countTokens() >= 1)
{
rs = t.nextToken();
}
else
{
configuration.report.badArgument(value, option);
}
return rs;
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "Name";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "-";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
return value == null ? "" : value.toString();
}
}
/**
* parser for name values.
*/
static class ParseTagNames implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
short tagType = Dict.TAGTYPE_INLINE;
if ("new-inline-tags".equals(option))
{
tagType = Dict.TAGTYPE_INLINE;
}
else if ("new-blocklevel-tags".equals(option))
{
tagType = Dict.TAGTYPE_BLOCK;
}
else if ("new-empty-tags".equals(option))
{
tagType = Dict.TAGTYPE_EMPTY;
}
else if ("new-pre-tags".equals(option))
{
tagType = Dict.TAGTYPE_PRE;
}
StringTokenizer t = new StringTokenizer(value, " \t\n\r,");
while (t.hasMoreTokens())
{
configuration.definedTags |= tagType;
configuration.tt.defineTag(tagType, t.nextToken());
}
return null;
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "Tag names";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "tagX, tagY, ...";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
short tagType;
if ("new-inline-tags".equals(option))
{
tagType = Dict.TAGTYPE_INLINE;
}
else if ("new-blocklevel-tags".equals(option))
{
tagType = Dict.TAGTYPE_BLOCK;
}
else if ("new-empty-tags".equals(option))
{
tagType = Dict.TAGTYPE_EMPTY;
}
else if ("new-pre-tags".equals(option))
{
tagType = Dict.TAGTYPE_PRE;
}
else
{
return "";
}
List tagList = configuration.tt.findAllDefinedTag(tagType);
if (tagList.isEmpty())
{
return "";
}
StringBuffer buffer = new StringBuffer();
Iterator iterator = tagList.iterator();
while (iterator.hasNext())
{
buffer.append(iterator.next());
buffer.append(" ");
}
return buffer.toString();
}
}
/**
* Parse doctype preference. doctype: omit | auto | strict | loose | [fpi]
where the fpi is a string
* similar to "-//ACME//DTD HTML 3.14159//EN"
.
*/
static class ParseDocType implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
value = value.trim();
/* "-//ACME//DTD HTML 3.14159//EN" or similar */
if (value.startsWith("\""))
{
configuration.docTypeMode = Configuration.DOCTYPE_USER;
return value;
}
/* read first word */
String word = "";
StringTokenizer t = new StringTokenizer(value, " \t\n\r,");
if (t.hasMoreTokens())
{
word = t.nextToken();
}
// #443663 - fix by Terry Teague 23 Jul 01
if ("auto".equalsIgnoreCase(word))
{
configuration.docTypeMode = Configuration.DOCTYPE_AUTO;
}
else if ("omit".equalsIgnoreCase(word))
{
configuration.docTypeMode = Configuration.DOCTYPE_OMIT;
}
else if ("strict".equalsIgnoreCase(word))
{
configuration.docTypeMode = Configuration.DOCTYPE_STRICT;
}
else if ("loose".equalsIgnoreCase(word) || "transitional".equalsIgnoreCase(word))
{
configuration.docTypeMode = Configuration.DOCTYPE_LOOSE;
}
else
{
configuration.report.badArgument(value, option);
}
return null;
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "DocType";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "omit | auto | strict | loose | [fpi]";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
String stringValue;
switch (configuration.docTypeMode)
{
case Configuration.DOCTYPE_AUTO :
stringValue = "auto";
break;
case Configuration.DOCTYPE_OMIT :
stringValue = "omit";
break;
case Configuration.DOCTYPE_STRICT :
stringValue = "strict";
break;
case Configuration.DOCTYPE_LOOSE :
stringValue = "transitional";
break;
case Configuration.DOCTYPE_USER :
stringValue = configuration.docTypeStr;
break;
default :
stringValue = "unknown";
break;
}
return stringValue;
}
}
/**
* keep-first or keep-last?
*/
static class ParseRepeatedAttribute implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
int dupAttr;
if ("keep-first".equalsIgnoreCase(value))
{
dupAttr = Configuration.KEEP_FIRST;
}
else if ("keep-last".equalsIgnoreCase(value))
{
dupAttr = Configuration.KEEP_LAST;
}
else
{
configuration.report.badArgument(value, option);
dupAttr = -1;
}
return new Integer(dupAttr);
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "Enum";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "keep-first, keep-last";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
if (value == null)
{
return "";
}
int intValue = ((Integer) value).intValue();
String stringValue;
switch (intValue)
{
case Configuration.KEEP_FIRST :
stringValue = "keep-first";
break;
case Configuration.KEEP_LAST :
stringValue = "keep-last";
break;
default :
stringValue = "unknown";
break;
}
return stringValue;
}
}
/**
* Parser for String values.
*/
static class ParseString implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
return value;
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "String";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "-";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
return value == null ? "" : (String) value;
}
}
/**
* Parser for indent values.
*/
static class ParseIndent implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
boolean b = configuration.indentContent;
if ("yes".equalsIgnoreCase(value))
{
b = true;
configuration.smartIndent = false;
}
else if ("true".equalsIgnoreCase(value))
{
b = true;
configuration.smartIndent = false;
}
else if ("no".equalsIgnoreCase(value))
{
b = false;
configuration.smartIndent = false;
}
else if ("false".equalsIgnoreCase(value))
{
b = false;
configuration.smartIndent = false;
}
else if ("auto".equalsIgnoreCase(value))
{
b = true;
configuration.smartIndent = true;
}
else
{
configuration.report.badArgument(value, option);
}
return b ? Boolean.TRUE : Boolean.FALSE;
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "Indent";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "auto, y/n, yes/no, t/f, true/false, 1/0";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
return value == null ? "" : value.toString();
}
}
/**
* Parser for css selectors.
*/
static class ParseCSS1Selector implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
StringTokenizer t = new StringTokenizer(value);
String buf = null;
if (t.countTokens() >= 1)
{
buf = t.nextToken() + "-"; // Make sure any escaped Unicode is terminated so valid class names are
// generated after Tidy appends last digits.
}
else
{
configuration.report.badArgument(value, option);
}
if (!Lexer.isCSS1Selector(value))
{
configuration.report.badArgument(value, option);
}
return buf;
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "Name";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "CSS1 selector";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
return value == null ? "" : (String) value;
}
}
/**
* Parser for newline bytes. Allows lf|crlf|cr.
*/
static class ParseNewLine implements ParseProperty
{
/**
* @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
*/
public Object parse(String value, String option, Configuration configuration)
{
// lf|crlf|cr
if ("lf".equalsIgnoreCase(value))
{
configuration.newline = new char[]{'\n'};
}
else if ("cr".equalsIgnoreCase(value))
{
configuration.newline = new char[]{'\r'};
}
else if ("crlf".equalsIgnoreCase(value))
{
configuration.newline = new char[]{'\r', '\n'};
}
else
{
configuration.report.badArgument(value, option);
}
return null;
}
/**
* @see org.w3c.tidy.ParseProperty#getType()
*/
public String getType()
{
return "Enum";
}
/**
* @see org.w3c.tidy.ParseProperty#getOptionValues()
*/
public String getOptionValues()
{
return "lf, crlf, cr";
}
/**
* @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
*/
public String getFriendlyName(String option, Object value, Configuration configuration)
{
if (configuration.newline.length == 1)
{
return (configuration.newline[0] == '\n') ? "lf" : "cr";
}
return "crlf";
}
}
}