All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.w3c.tidy.ParsePropertyImpl Maven / Gradle / Ivy

Go to download

JTidy is a Java port of HTML Tidy, a HTML syntax checker and pretty printer. Like its non-Java cousin, JTidy can be used as a tool for cleaning up malformed and faulty HTML. In addition, JTidy provides a DOM interface to the document that is being processed, which effectively makes you able to use JTidy as a DOM parser for real-world HTML.

The newest version!
/*
 *  Java HTML Tidy - JTidy
 *  HTML parser and pretty printer
 *
 *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
 *  Institute of Technology, Institut National de Recherche en
 *  Informatique et en Automatique, Keio University). All Rights
 *  Reserved.
 *
 *  Contributing Author(s):
 *
 *     Dave Raggett 
 *     Andy Quick  (translation to Java)
 *     Gary L Peskin  (Java development)
 *     Sami Lempinen  (release management)
 *     Fabrizio Giustina 
 *
 *  The contributing author(s) would like to thank all those who
 *  helped with testing, bug fixes, and patience.  This wouldn't
 *  have been possible without all of you.
 *
 *  COPYRIGHT NOTICE:
 * 
 *  This software and documentation is provided "as is," and
 *  the copyright holders and contributing author(s) make no
 *  representations or warranties, express or implied, including
 *  but not limited to, warranties of merchantability or fitness
 *  for any particular purpose or that the use of the software or
 *  documentation will not infringe any third party patents,
 *  copyrights, trademarks or other rights. 
 *
 *  The copyright holders and contributing author(s) will not be
 *  liable for any direct, indirect, special or consequential damages
 *  arising out of any use of the software or documentation, even if
 *  advised of the possibility of such damage.
 *
 *  Permission is hereby granted to use, copy, modify, and distribute
 *  this source code, or portions hereof, documentation and executables,
 *  for any purpose, without fee, subject to the following restrictions:
 *
 *  1. The origin of this source code must not be misrepresented.
 *  2. Altered versions must be plainly marked as such and must
 *     not be misrepresented as being the original source.
 *  3. This Copyright notice may not be removed or altered from any
 *     source or altered source distribution.
 * 
 *  The copyright holders and contributing author(s) specifically
 *  permit, without fee, and encourage the use of this source code
 *  as a component for supporting the Hypertext Markup Language in
 *  commercial products. If you use this source code in a product,
 *  acknowledgment is not required but would be appreciated.
 *
 */
package org.w3c.tidy;

import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;


/**
 * Property parser instances.
 * @author Fabrizio Giustina
 * @version $Revision $ ($Author $)
 */
public final class ParsePropertyImpl
{

    /**
     * configuration parser for int values.
     */
    static final ParseProperty INT = new ParseInt();

    /**
     * configuration parser for boolean values.
     */
    static final ParseProperty BOOL = new ParseBoolean();

    /**
     * configuration parser for inverted boolean values.
     */
    static final ParseProperty INVBOOL = new ParseInvBoolean();

    /**
     * configuration parser for char encoding values.
     */
    static final ParseProperty CHAR_ENCODING = new ParseCharEncoding();

    /**
     * configuration parser for name values.
     */
    static final ParseProperty NAME = new ParseName();

    /**
     * configuration parser for tag names.
     */
    static final ParseProperty TAGNAMES = new ParseTagNames();

    /**
     * configuration parser for doctype property.
     */
    static final ParseProperty DOCTYPE = new ParseDocType();

    /**
     * configuration parser for repetated attribute property.
     */
    static final ParseProperty REPEATED_ATTRIBUTES = new ParseRepeatedAttribute();

    /**
     * configuration parser for String values.
     */
    static final ParseProperty STRING = new ParseString();

    /**
     * configuration parser for indent property.
     */
    static final ParseProperty INDENT = new ParseIndent();

    /**
     * configuration parser for css selectors.
     */
    static final ParseProperty CSS1SELECTOR = new ParseCSS1Selector();

    /**
     * configuration parser for new line bytes.
     */
    static final ParseProperty NEWLINE = new ParseNewLine();

    /**
     * don't instantiate.
     */
    private ParsePropertyImpl()
    {
        // unused
    }

    /**
     * parser for integer values.
     */
    static class ParseInt implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            int i = 0;
            try
            {
                i = Integer.parseInt(value);
            }
            catch (NumberFormatException e)
            {
                configuration.report.badArgument(value, option);
                i = -1;
            }
            return new Integer(i);
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "Integer";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "0, 1, 2, ...";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            return value == null ? "" : value.toString();
        }
    }

    /**
     * parser for boolean values.
     */
    static class ParseBoolean implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            Boolean b = Boolean.TRUE;
            if (value != null && value.length() > 0)
            {
                char c = value.charAt(0);
                if ((c == 't') || (c == 'T') || (c == 'Y') || (c == 'y') || (c == '1'))
                {
                    b = Boolean.TRUE;
                }
                else if ((c == 'f') || (c == 'F') || (c == 'N') || (c == 'n') || (c == '0'))
                {
                    b = Boolean.FALSE;
                }
                else
                {
                    configuration.report.badArgument(value, option);
                }
            }
            return b;
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "Boolean";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "y/n, yes/no, t/f, true/false, 1/0";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            if (value == null)
            {
                return "";
            }

            return ((Boolean) value).booleanValue() ? "yes" : "no";
        }
    }

    /**
     * parser for boolean values.
     */
    static class ParseInvBoolean implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            return (((Boolean) BOOL.parse(value, option, configuration)).booleanValue() ? Boolean.FALSE : Boolean.TRUE);
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "Boolean";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "yes, no, true, false";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            if (value == null)
            {
                return "";
            }

            return ((Boolean) value).booleanValue() ? "no" : "yes";
        }
    }

    /**
     * parse character encoding option. Can be any java encoding name supported by the runtime platform.
     */
    static class ParseCharEncoding implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {

            if ("raw".equalsIgnoreCase(value))
            {
                // special value for compatibility with tidy c
                configuration.rawOut = true;
            }
            else if (!TidyUtils.isCharEncodingSupported(value))
            {
                configuration.report.badArgument(value, option);
            }
            else if ("input-encoding".equalsIgnoreCase(option))
            {
                configuration.setInCharEncodingName(value);
            }
            else if ("output-encoding".equalsIgnoreCase(option))
            {
                configuration.setOutCharEncodingName(value);
            }
            else if ("char-encoding".equalsIgnoreCase(option))
            {
                configuration.setInCharEncodingName(value);
                configuration.setOutCharEncodingName(value);
            }

            return null;
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "Encoding";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            // ascii, latin1, raw, utf-8, iso2022, mac, utf-16, utf-16be, utf-16le, big5, shiftjis
            return "Any valid java char encoding name";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            if ("output-encoding".equalsIgnoreCase(option))
            {
                return configuration.getOutCharEncodingName();
            }

            // for input-encoding or char-encoding
            return configuration.getInCharEncodingName();
        }
    }

    /**
     * parser for name values (a string excluding whitespace).
     */
    static class ParseName implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            StringTokenizer t = new StringTokenizer(value);
            String rs = null;
            if (t.countTokens() >= 1)
            {
                rs = t.nextToken();
            }
            else
            {
                configuration.report.badArgument(value, option);
            }
            return rs;
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "Name";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "-";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            return value == null ? "" : value.toString();
        }
    }

    /**
     * parser for name values.
     */
    static class ParseTagNames implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            short tagType = Dict.TAGTYPE_INLINE;

            if ("new-inline-tags".equals(option))
            {
                tagType = Dict.TAGTYPE_INLINE;
            }
            else if ("new-blocklevel-tags".equals(option))
            {
                tagType = Dict.TAGTYPE_BLOCK;
            }
            else if ("new-empty-tags".equals(option))
            {
                tagType = Dict.TAGTYPE_EMPTY;
            }
            else if ("new-pre-tags".equals(option))
            {
                tagType = Dict.TAGTYPE_PRE;
            }

            StringTokenizer t = new StringTokenizer(value, " \t\n\r,");
            while (t.hasMoreTokens())
            {
                configuration.definedTags |= tagType;
                configuration.tt.defineTag(tagType, t.nextToken());
            }
            return null;
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "Tag names";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "tagX, tagY, ...";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            short tagType;
            if ("new-inline-tags".equals(option))
            {
                tagType = Dict.TAGTYPE_INLINE;
            }
            else if ("new-blocklevel-tags".equals(option))
            {
                tagType = Dict.TAGTYPE_BLOCK;
            }
            else if ("new-empty-tags".equals(option))
            {
                tagType = Dict.TAGTYPE_EMPTY;
            }
            else if ("new-pre-tags".equals(option))
            {
                tagType = Dict.TAGTYPE_PRE;
            }
            else
            {
                return "";
            }

            List tagList = configuration.tt.findAllDefinedTag(tagType);
            if (tagList.isEmpty())
            {
                return "";
            }

            StringBuffer buffer = new StringBuffer();
            Iterator iterator = tagList.iterator();
            while (iterator.hasNext())
            {
                buffer.append(iterator.next());
                buffer.append(" ");
            }

            return buffer.toString();
        }
    }

    /**
     * Parse doctype preference. doctype: omit | auto | strict | loose | [fpi] where the fpi is a string
     * similar to "-//ACME//DTD HTML 3.14159//EN".
     */
    static class ParseDocType implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            value = value.trim();

            /* "-//ACME//DTD HTML 3.14159//EN" or similar */

            if (value.startsWith("\""))
            {
                configuration.docTypeMode = Configuration.DOCTYPE_USER;
                return value;
            }

            /* read first word */
            String word = "";
            StringTokenizer t = new StringTokenizer(value, " \t\n\r,");
            if (t.hasMoreTokens())
            {
                word = t.nextToken();
            }
            // #443663 - fix by Terry Teague 23 Jul 01
            if ("auto".equalsIgnoreCase(word))
            {
                configuration.docTypeMode = Configuration.DOCTYPE_AUTO;
            }
            else if ("omit".equalsIgnoreCase(word))
            {
                configuration.docTypeMode = Configuration.DOCTYPE_OMIT;
            }
            else if ("strict".equalsIgnoreCase(word))
            {
                configuration.docTypeMode = Configuration.DOCTYPE_STRICT;
            }
            else if ("loose".equalsIgnoreCase(word) || "transitional".equalsIgnoreCase(word))
            {
                configuration.docTypeMode = Configuration.DOCTYPE_LOOSE;
            }
            else
            {
                configuration.report.badArgument(value, option);
            }
            return null;
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "DocType";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "omit | auto | strict | loose | [fpi]";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {

            String stringValue;

            switch (configuration.docTypeMode)
            {
                case Configuration.DOCTYPE_AUTO :
                    stringValue = "auto";
                    break;

                case Configuration.DOCTYPE_OMIT :
                    stringValue = "omit";
                    break;

                case Configuration.DOCTYPE_STRICT :
                    stringValue = "strict";
                    break;

                case Configuration.DOCTYPE_LOOSE :
                    stringValue = "transitional";
                    break;

                case Configuration.DOCTYPE_USER :
                    stringValue = configuration.docTypeStr;
                    break;

                default :
                    stringValue = "unknown";
                    break;
            }

            return stringValue;
        }
    }

    /**
     * keep-first or keep-last?
     */
    static class ParseRepeatedAttribute implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            int dupAttr;

            if ("keep-first".equalsIgnoreCase(value))
            {
                dupAttr = Configuration.KEEP_FIRST;
            }
            else if ("keep-last".equalsIgnoreCase(value))
            {
                dupAttr = Configuration.KEEP_LAST;
            }
            else
            {
                configuration.report.badArgument(value, option);
                dupAttr = -1;
            }
            return new Integer(dupAttr);
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "Enum";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "keep-first, keep-last";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            if (value == null)
            {
                return "";
            }

            int intValue = ((Integer) value).intValue();
            String stringValue;

            switch (intValue)
            {
                case Configuration.KEEP_FIRST :
                    stringValue = "keep-first";
                    break;

                case Configuration.KEEP_LAST :
                    stringValue = "keep-last";
                    break;

                default :
                    stringValue = "unknown";
                    break;
            }

            return stringValue;
        }
    }

    /**
     * Parser for String values.
     */
    static class ParseString implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            return value;
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "String";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "-";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            return value == null ? "" : (String) value;
        }
    }

    /**
     * Parser for indent values.
     */
    static class ParseIndent implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            boolean b = configuration.indentContent;

            if ("yes".equalsIgnoreCase(value))
            {
                b = true;
                configuration.smartIndent = false;
            }
            else if ("true".equalsIgnoreCase(value))
            {
                b = true;
                configuration.smartIndent = false;
            }
            else if ("no".equalsIgnoreCase(value))
            {
                b = false;
                configuration.smartIndent = false;
            }
            else if ("false".equalsIgnoreCase(value))
            {
                b = false;
                configuration.smartIndent = false;
            }
            else if ("auto".equalsIgnoreCase(value))
            {
                b = true;
                configuration.smartIndent = true;
            }
            else
            {
                configuration.report.badArgument(value, option);
            }
            return b ? Boolean.TRUE : Boolean.FALSE;
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "Indent";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "auto, y/n, yes/no, t/f, true/false, 1/0";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            return value == null ? "" : value.toString();
        }
    }

    /**
     * Parser for css selectors.
     */
    static class ParseCSS1Selector implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            StringTokenizer t = new StringTokenizer(value);
            String buf = null;
            if (t.countTokens() >= 1)
            {
                buf = t.nextToken() + "-"; // Make sure any escaped Unicode is terminated so valid class names are
                // generated after Tidy appends last digits.
            }
            else
            {
                configuration.report.badArgument(value, option);
            }

            if (!Lexer.isCSS1Selector(value))
            {
                configuration.report.badArgument(value, option);
            }

            return buf;
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "Name";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "CSS1 selector";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            return value == null ? "" : (String) value;
        }
    }

    /**
     * Parser for newline bytes. Allows lf|crlf|cr.
     */
    static class ParseNewLine implements ParseProperty
    {

        /**
         * @see org.w3c.tidy.ParseProperty#parse(java.lang.String, java.lang.String, org.w3c.tidy.Configuration)
         */
        public Object parse(String value, String option, Configuration configuration)
        {
            // lf|crlf|cr
            if ("lf".equalsIgnoreCase(value))
            {
                configuration.newline = new char[]{'\n'};
            }
            else if ("cr".equalsIgnoreCase(value))
            {
                configuration.newline = new char[]{'\r'};
            }
            else if ("crlf".equalsIgnoreCase(value))
            {
                configuration.newline = new char[]{'\r', '\n'};
            }
            else
            {
                configuration.report.badArgument(value, option);
            }
            return null;
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getType()
         */
        public String getType()
        {
            return "Enum";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getOptionValues()
         */
        public String getOptionValues()
        {
            return "lf, crlf, cr";
        }

        /**
         * @see org.w3c.tidy.ParseProperty#getFriendlyName(java.lang.String, java.lang.Object, Configuration)
         */
        public String getFriendlyName(String option, Object value, Configuration configuration)
        {
            if (configuration.newline.length == 1)
            {
                return (configuration.newline[0] == '\n') ? "lf" : "cr";
            }
            return "crlf";
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy