org.w3c.tidy.Configuration Maven / Gradle / Ivy
/*
* Copyright 2010 Alibaba Group Holding Limited.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* @(#)Configuration.java 1.11 2000/08/16
*
*/
package org.w3c.tidy;
/**
*
* Read configuration file and manage configuration properties.
*
* (c) 1998-2000 (W3C) MIT, INRIA, Keio University
* See Tidy.java for the copyright notice.
* Derived from
* HTML Tidy Release 4 Aug 2000
*
* @author Dave Raggett
* @author Andy Quick (translation to Java)
* @version 1.0, 1999/05/22
* @version 1.0.1, 1999/05/29
* @version 1.1, 1999/06/18 Java Bean
* @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
* @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
* @version 1.4, 1999/09/04 DOM support
* @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
* @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
* @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
* @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
* @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
* @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
* @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
*/
/*
Configuration files associate a property name with a value.
The format is that of a Java .properties file.
*/
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Enumeration;
import java.util.Properties;
import java.util.StringTokenizer;
public class Configuration implements java.io.Serializable {
/**
*
*/
private static final long serialVersionUID = 7370280176303390368L;
/* character encodings */
public static final int RAW = 0;
public static final int ASCII = 1;
public static final int LATIN1 = 2;
public static final int UTF8 = 3;
public static final int ISO2022 = 4;
public static final int MACROMAN = 5;
/* mode controlling treatment of doctype */
public static final int DOCTYPE_OMIT = 0;
public static final int DOCTYPE_AUTO = 1;
public static final int DOCTYPE_STRICT = 2;
public static final int DOCTYPE_LOOSE = 3;
public static final int DOCTYPE_USER = 4;
protected int spaces = 2; /*
* default indentation
*/
protected int wraplen = 68; /*
* default wrap margin
*/
protected int CharEncoding = ASCII;
protected int tabsize = 4;
protected int docTypeMode = DOCTYPE_AUTO; /*
* see doctype property
*/
protected String altText = null; /*
* default text for alt attribute
*/
protected String slidestyle = null; /*
* style sheet for slides
*/
protected String docTypeStr = null; /*
* user specified doctype
*/
protected String errfile = null; /*
* file name to write errors to
*/
protected boolean writeback = false; /*
* if true then output tidied markup
*/
protected boolean OnlyErrors = false; /*
* if true normal output is suppressed
*/
protected boolean ShowWarnings = true; /*
* however errors are always shown
*/
protected boolean Quiet = false; /*
* no 'Parsing X', guessed DTD or summary
*/
protected boolean IndentContent = false; /*
* indent content of appropriate
* tags
*/
protected boolean SmartIndent = false; /*
* does text /block level content
* effect indentation
*/
protected boolean HideEndTags = false; /*
* suppress optional end tags
*/
protected boolean XmlTags = false; /*
* treat input as XML
*/
protected boolean XmlOut = false; /*
* create output as XML
*/
protected boolean xHTML = false; /*
* output extensible HTML
*/
protected boolean XmlPi = false; /*
* add for XML docs
*/
protected boolean RawOut = false; /*
* avoid mapping values > 127 to entities
*/
protected boolean UpperCaseTags = false; /*
* output tags in upper not lower
* case
*/
protected boolean UpperCaseAttrs = false; /*
* output attributes in upper not
* lower case
*/
protected boolean MakeClean = false; /*
* remove presentational clutter
*/
protected boolean LogicalEmphasis = false; /*
* replace i by em and b by
* strong
*/
protected boolean DropFontTags = false; /*
* discard presentation tags
*/
protected boolean DropEmptyParas = true; /*
* discard empty p elements
*/
protected boolean FixComments = true; /*
* fix comments with adjacent hyphens
*/
protected boolean BreakBeforeBR = false; /*
* o/p newline before
or not?
*/
protected boolean BurstSlides = false; /*
* create slides on each h2 element
*/
protected boolean NumEntities = false; /*
* use numeric entities
*/
protected boolean QuoteMarks = false; /*
* output " marks as "
*/
protected boolean QuoteNbsp = true; /*
* output non -breaking space as entity
*/
protected boolean QuoteAmpersand = true; /*
* output naked ampersand as &
*/
protected boolean WrapAttVals = false; /*
* wrap within attribute values
*/
protected boolean WrapScriptlets = false; /*
* wrap within JavaScript string
* literals
*/
protected boolean WrapSection = true; /*
* wrap within section tags
*/
protected boolean WrapAsp = true; /*
* wrap within ASP pseudo elements
*/
protected boolean WrapJste = true; /*
* wrap within JSTE pseudo elements
*/
protected boolean WrapPhp = true; /*
* wrap within PHP pseudo elements
*/
protected boolean FixBackslash = true; /*
* fix URLs by replacing \ with /
*/
protected boolean IndentAttributes = false; /*
* newline+ indent before each
* attribute
*/
protected boolean XmlPIs = false; /*
* if set to yes PIs must end with ?>
*/
protected boolean XmlSpace = false; /*
* if set to yes adds xml :space attr as
* needed
*/
protected boolean EncloseBodyText = false; /*
* if yes text at body is wrapped
* in 's
*/
protected boolean EncloseBlockText = false; /*
* if yes text in blocks is
* wrapped in
's
*/
protected boolean KeepFileTimes = true; /*
* if yes last modied time is
* preserved
*/
protected boolean Word2000 = false; /*
* draconian cleaning for Word2000
*/
protected boolean TidyMark = true; /*
* add meta element indicating tidied doc
*/
protected boolean Emacs = false; /*
* if true format error output for GNU
* Emacs
*/
protected boolean LiteralAttribs = false; /*
* if true attributes may use
* newlines
*/
protected TagTable tt; /*
* TagTable associated with this Configuration
*/
private transient Properties _properties = new Properties();
public Configuration() {
}
public void addProps(Properties p) {
Enumeration e = p.propertyNames();
while (e.hasMoreElements()) {
String key = (String) e.nextElement();
String value = p.getProperty(key);
_properties.put(key, value);
}
parseProps();
}
public void parseFile(String filename) {
try {
_properties.load(new FileInputStream(filename));
} catch (IOException e) {
System.err.println(filename + e.toString());
return;
}
parseProps();
}
private void parseProps() {
String value;
value = _properties.getProperty("indent-spaces");
if (value != null) {
spaces = parseInt(value, "indent-spaces");
}
value = _properties.getProperty("wrap");
if (value != null) {
wraplen = parseInt(value, "wrap");
}
value = _properties.getProperty("wrap-attributes");
if (value != null) {
WrapAttVals = parseBool(value, "wrap-attributes");
}
value = _properties.getProperty("wrap-script-literals");
if (value != null) {
WrapScriptlets = parseBool(value, "wrap-script-literals");
}
value = _properties.getProperty("wrap-sections");
if (value != null) {
WrapSection = parseBool(value, "wrap-sections");
}
value = _properties.getProperty("wrap-asp");
if (value != null) {
WrapAsp = parseBool(value, "wrap-asp");
}
value = _properties.getProperty("wrap-jste");
if (value != null) {
WrapJste = parseBool(value, "wrap-jste");
}
value = _properties.getProperty("wrap-php");
if (value != null) {
WrapPhp = parseBool(value, "wrap-php");
}
value = _properties.getProperty("literal-attributes");
if (value != null) {
LiteralAttribs = parseBool(value, "literal-attributes");
}
value = _properties.getProperty("tab-size");
if (value != null) {
tabsize = parseInt(value, "tab-size");
}
value = _properties.getProperty("markup");
if (value != null) {
OnlyErrors = parseInvBool(value, "markup");
}
value = _properties.getProperty("quiet");
if (value != null) {
Quiet = parseBool(value, "quiet");
}
value = _properties.getProperty("tidy-mark");
if (value != null) {
TidyMark = parseBool(value, "tidy-mark");
}
value = _properties.getProperty("indent");
if (value != null) {
IndentContent = parseIndent(value, "indent");
}
value = _properties.getProperty("indent-attributes");
if (value != null) {
IndentAttributes = parseBool(value, "ident-attributes");
}
value = _properties.getProperty("hide-endtags");
if (value != null) {
HideEndTags = parseBool(value, "hide-endtags");
}
value = _properties.getProperty("input-xml");
if (value != null) {
XmlTags = parseBool(value, "input-xml");
}
value = _properties.getProperty("output-xml");
if (value != null) {
XmlOut = parseBool(value, "output-xml");
}
value = _properties.getProperty("output-xhtml");
if (value != null) {
xHTML = parseBool(value, "output-xhtml");
}
value = _properties.getProperty("add-xml-pi");
if (value != null) {
XmlPi = parseBool(value, "add-xml-pi");
}
value = _properties.getProperty("add-xml-decl");
if (value != null) {
XmlPi = parseBool(value, "add-xml-decl");
}
value = _properties.getProperty("assume-xml-procins");
if (value != null) {
XmlPIs = parseBool(value, "assume-xml-procins");
}
value = _properties.getProperty("raw");
if (value != null) {
RawOut = parseBool(value, "raw");
}
value = _properties.getProperty("uppercase-tags");
if (value != null) {
UpperCaseTags = parseBool(value, "uppercase-tags");
}
value = _properties.getProperty("uppercase-attributes");
if (value != null) {
UpperCaseAttrs = parseBool(value, "uppercase-attributes");
}
value = _properties.getProperty("clean");
if (value != null) {
MakeClean = parseBool(value, "clean");
}
value = _properties.getProperty("logical-emphasis");
if (value != null) {
LogicalEmphasis = parseBool(value, "logical-emphasis");
}
value = _properties.getProperty("word-2000");
if (value != null) {
Word2000 = parseBool(value, "word-2000");
}
value = _properties.getProperty("drop-empty-paras");
if (value != null) {
DropEmptyParas = parseBool(value, "drop-empty-paras");
}
value = _properties.getProperty("drop-font-tags");
if (value != null) {
DropFontTags = parseBool(value, "drop-font-tags");
}
value = _properties.getProperty("enclose-text");
if (value != null) {
EncloseBodyText = parseBool(value, "enclose-text");
}
value = _properties.getProperty("enclose-block-text");
if (value != null) {
EncloseBlockText = parseBool(value, "enclose-block-text");
}
value = _properties.getProperty("alt-text");
if (value != null) {
altText = value;
}
value = _properties.getProperty("add-xml-space");
if (value != null) {
XmlSpace = parseBool(value, "add-xml-space");
}
value = _properties.getProperty("fix-bad-comments");
if (value != null) {
FixComments = parseBool(value, "fix-bad-comments");
}
value = _properties.getProperty("split");
if (value != null) {
BurstSlides = parseBool(value, "split");
}
value = _properties.getProperty("break-before-br");
if (value != null) {
BreakBeforeBR = parseBool(value, "break-before-br");
}
value = _properties.getProperty("numeric-entities");
if (value != null) {
NumEntities = parseBool(value, "numeric-entities");
}
value = _properties.getProperty("quote-marks");
if (value != null) {
QuoteMarks = parseBool(value, "quote-marks");
}
value = _properties.getProperty("quote-nbsp");
if (value != null) {
QuoteNbsp = parseBool(value, "quote-nbsp");
}
value = _properties.getProperty("quote-ampersand");
if (value != null) {
QuoteAmpersand = parseBool(value, "quote-ampersand");
}
value = _properties.getProperty("write-back");
if (value != null) {
writeback = parseBool(value, "write-back");
}
value = _properties.getProperty("keep-time");
if (value != null) {
KeepFileTimes = parseBool(value, "keep-time");
}
value = _properties.getProperty("show-warnings");
if (value != null) {
ShowWarnings = parseBool(value, "show-warnings");
}
value = _properties.getProperty("error-file");
if (value != null) {
errfile = parseName(value, "error-file");
}
value = _properties.getProperty("slide-style");
if (value != null) {
slidestyle = parseName(value, "slide-style");
}
value = _properties.getProperty("new-inline-tags");
if (value != null) {
parseInlineTagNames(value, "new-inline-tags");
}
value = _properties.getProperty("new-blocklevel-tags");
if (value != null) {
parseBlockTagNames(value, "new-blocklevel-tags");
}
value = _properties.getProperty("new-empty-tags");
if (value != null) {
parseEmptyTagNames(value, "new-empty-tags");
}
value = _properties.getProperty("new-pre-tags");
if (value != null) {
parsePreTagNames(value, "new-pre-tags");
}
value = _properties.getProperty("char-encoding");
if (value != null) {
CharEncoding = parseCharEncoding(value, "char-encoding");
}
value = _properties.getProperty("doctype");
if (value != null) {
docTypeStr = parseDocType(value, "doctype");
}
value = _properties.getProperty("fix-backslash");
if (value != null) {
FixBackslash = parseBool(value, "fix-backslash");
}
value = _properties.getProperty("gnu-emacs");
if (value != null) {
Emacs = parseBool(value, "gnu-emacs");
}
}
/* ensure that config is self consistent */
public void adjust() {
if (EncloseBlockText) {
EncloseBodyText = true;
}
/* avoid the need to set IndentContent when SmartIndent is set */
if (SmartIndent) {
IndentContent = true;
}
/* disable wrapping */
if (wraplen == 0) {
wraplen = 0x7FFFFFFF;
}
/* Word 2000 needs o:p to be declared as inline */
if (Word2000) {
tt.defineInlineTag("o:p");
}
/* XHTML is written in lower case */
if (xHTML) {
XmlOut = true;
UpperCaseTags = false;
UpperCaseAttrs = false;
}
/* if XML in, then XML out */
if (XmlTags) {
XmlOut = true;
XmlPIs = true;
}
/* XML requires end tags */
if (XmlOut) {
QuoteAmpersand = true;
HideEndTags = false;
}
}
private static int parseInt(String s, String option) {
int i = 0;
try {
i = Integer.parseInt(s);
} catch (NumberFormatException e) {
Report.badArgument(option);
i = -1;
}
return i;
}
private static boolean parseBool(String s, String option) {
boolean b = false;
if (s != null && s.length() > 0) {
char c = s.charAt(0);
if (c == 't' || c == 'T' || c == 'Y' || c == 'y' || c == '1') {
b = true;
} else if (c == 'f' || c == 'F' || c == 'N' || c == 'n' || c == '0') {
b = false;
} else {
Report.badArgument(option);
}
}
return b;
}
private static boolean parseInvBool(String s, String option) {
boolean b = false;
if (s != null && s.length() > 0) {
char c = s.charAt(0);
if (c == 't' || c == 'T' || c == 'Y' || c == 'y') {
b = true;
} else if (c == 'f' || c == 'F' || c == 'N' || c == 'n') {
b = false;
} else {
Report.badArgument(option);
}
}
return !b;
}
private static String parseName(String s, String option) {
StringTokenizer t = new StringTokenizer(s);
String rs = null;
if (t.countTokens() >= 1) {
rs = t.nextToken();
} else {
Report.badArgument(option);
}
return rs;
}
private static int parseCharEncoding(String s, String option) {
int result = ASCII;
if (Lexer.wstrcasecmp(s, "ascii") == 0) {
result = ASCII;
} else if (Lexer.wstrcasecmp(s, "latin1") == 0) {
result = LATIN1;
} else if (Lexer.wstrcasecmp(s, "raw") == 0) {
result = RAW;
} else if (Lexer.wstrcasecmp(s, "utf8") == 0) {
result = UTF8;
} else if (Lexer.wstrcasecmp(s, "iso2022") == 0) {
result = ISO2022;
} else if (Lexer.wstrcasecmp(s, "mac") == 0) {
result = MACROMAN;
} else {
Report.badArgument(option);
}
return result;
}
/* slight hack to avoid changes to pprint.c */
private boolean parseIndent(String s, String option) {
boolean b = IndentContent;
if (Lexer.wstrcasecmp(s, "yes") == 0) {
b = true;
SmartIndent = false;
} else if (Lexer.wstrcasecmp(s, "true") == 0) {
b = true;
SmartIndent = false;
} else if (Lexer.wstrcasecmp(s, "no") == 0) {
b = false;
SmartIndent = false;
} else if (Lexer.wstrcasecmp(s, "false") == 0) {
b = false;
SmartIndent = false;
} else if (Lexer.wstrcasecmp(s, "auto") == 0) {
b = true;
SmartIndent = true;
} else {
Report.badArgument(option);
}
return b;
}
private void parseInlineTagNames(String s, String option) {
StringTokenizer t = new StringTokenizer(s, " \t\n\r,");
while (t.hasMoreTokens()) {
tt.defineInlineTag(t.nextToken());
}
}
private void parseBlockTagNames(String s, String option) {
StringTokenizer t = new StringTokenizer(s, " \t\n\r,");
while (t.hasMoreTokens()) {
tt.defineBlockTag(t.nextToken());
}
}
private void parseEmptyTagNames(String s, String option) {
StringTokenizer t = new StringTokenizer(s, " \t\n\r,");
while (t.hasMoreTokens()) {
tt.defineEmptyTag(t.nextToken());
}
}
private void parsePreTagNames(String s, String option) {
StringTokenizer t = new StringTokenizer(s, " \t\n\r,");
while (t.hasMoreTokens()) {
tt.definePreTag(t.nextToken());
}
}
/*
* doctype: omit | auto | strict | loose | where the fpi is a string
* similar to "-//ACME//DTD HTML 3.14159//EN"
*/
protected String parseDocType(String s, String option) {
s = s.trim();
/* "-//ACME//DTD HTML 3.14159//EN" or similar */
if (s.startsWith("\"")) {
docTypeMode = DOCTYPE_USER;
return s;
}
/* read first word */
String word = "";
StringTokenizer t = new StringTokenizer(s, " \t\n\r,");
if (t.hasMoreTokens()) {
word = t.nextToken();
}
if (Lexer.wstrcasecmp(word, "omit") == 0) {
docTypeMode = DOCTYPE_OMIT;
} else if (Lexer.wstrcasecmp(word, "strict") == 0) {
docTypeMode = DOCTYPE_STRICT;
} else if (Lexer.wstrcasecmp(word, "loose") == 0 || Lexer.wstrcasecmp(word, "transitional") == 0) {
docTypeMode = DOCTYPE_LOOSE;
} else if (Lexer.wstrcasecmp(word, "auto") == 0) {
docTypeMode = DOCTYPE_AUTO;
} else {
docTypeMode = DOCTYPE_AUTO;
Report.badArgument(option);
}
return null;
}
}