org.htmlparser.lexerapplications.tabby.Tabby Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bboss-htmlparser Show documentation
bboss is a J2EE framework that includes AOP/IoC, MVC, persistence, taglib, RPC, events, bean-XML serialization and so on. http://www.bbossgroups.com
The newest version!
// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexerapplications/tabby/Tabby.java,v $
// $Author: derrickoswald $
// $Date: 2005/03/13 14:51:44 $
// $Revision: 1.3 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.lexerapplications.tabby;

import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.htmlparser.lexer.Cursor;
import org.htmlparser.lexer.Page;

/**
 * Replace tabs with spaces.
 * Convert tabs to the correct number of spaces according to a tab stop,
 * change DOS \r\n line endings to Unix \n form, and remove trailing whitespace.
 */
public class Tabby
{
    /**
     * The default tab stop spacing.
     */
    private static final int DEFAULT_TABSTOP = 4;

    /**
     * The file filter to apply.
     * When <code>null</code>, every file in a directory is processed.
     */
    protected Filter mFilter;

    /**
     * The replacement tab stop size.
     */
    protected int mTabsize;

    /**
     * Creates a new instance of Tabby with no file filter and a tab stop of 4.
     */
    public Tabby ()
    {
        mFilter = null;
        mTabsize = DEFAULT_TABSTOP;
    }

    /**
     * Creates a new instance of Tabby using the given regular expression and
     * a tab stop of 4.
     * @param filter The regular expression to apply to the files searched.
     * @exception IllegalArgumentException If the filter is <code>null</code>.
     */
    public Tabby (final String filter)
    {
        this ();
        mFilter = new Filter (filter);
    }

    /** Creates a new instance of Tabby.
     * @param filter The regular expression to apply to the files searched.
     * @param tabsize The tab stop setting.
     * @exception IllegalArgumentException If tabsize is not a positive number.
     */
    public Tabby (final String filter, final int tabsize)
        throws
            IllegalArgumentException
    {
        this (filter);
        // zero is rejected too: a zero tab stop would divide by zero in edit()
        if (0 >= tabsize)
            throw new IllegalArgumentException ("tab size must be positive");
        mTabsize = tabsize;
    }

    /**
     * Process the file or directory.
     * Directories are descended into recursively, applying the file filter
     * (if any) to their contents; anything else is handed to
     * {@link #edit edit}.
     * @param file The file to process.
     */
    protected void process (final File file)
    {
        File[] files;

        if (file.isDirectory ())
        {
            files = file.listFiles (mFilter);
            // listFiles() yields null rather than an empty array when the
            // directory cannot be read, so guard against it
            if (null == files)
                System.out.println (
                    "cannot list directory " + file.getAbsolutePath ());
            else
                for (int i = 0; i < files.length; i++)
                    process (files[i]);
        }
        else
            edit (file);
    }

    /**
     * Edit a single file in place.
     * Reads the whole file through a {@link Page}, expanding tabs to spaces
     * up to the next tab stop and dropping whitespace at the end of each
     * line. The file is rewritten, and its absolute path printed, only when
     * the content actually changed. Any exception is reported on standard
     * output and otherwise ignored, so one bad file does not stop a
     * directory walk.
     * @param file The file to edit.
     */
    protected void edit (final File file)
    {
        FileInputStream in;
        Page page;
        Cursor cursor;
        int position;  // column within the current (tab expanded) line
        int expected;  // where the cursor should be if it advances by one
        boolean modified;
        char ch;
        int last;      // column of the last non-whitespace character, or -1
        StringBuilder buffer;
        FileOutputStream out;

        try
        {
            in = new FileInputStream (file);
            try
            {
                // sized inside the try so the stream is closed even if
                // available() fails
                buffer = new StringBuilder (in.available ());
                page = new Page (in, null);
                cursor = new Cursor (page, 0);
                position = 0;
                modified = false;
                expected = 0;
                last = -1;
                while (Page.EOF != (ch = page.getCharacter (cursor)))
                {
                    // the cursor moved by more than one for a single
                    // character, presumably because the page folded a
                    // \r\n pair into \n, so the output will differ
                    if (++expected != cursor.getPosition ())
                    {
                        modified = true;
                        expected = cursor.getPosition ();
                    }
                    if ('\t' == ch)
                    {
                        // pad with at least one space up to the next tab stop
                        do
                        {
                            buffer.append (' ');
                            position++;
                        }
                        while (0 != (position % mTabsize));
                        modified = true;
                    }
                    else if ('\n' == ch)
                    {
                        if (stripTrailingWhitespace (buffer, position, last))
                            modified = true;
                        buffer.append (ch);
                        position = 0;
                        last = -1;
                    }
                    else
                    {
                        buffer.append (ch);
                        if (!Character.isWhitespace (ch))
                            last = position;
                        position++;
                    }
                }
                // a final line without a newline needs trimming too
                if (stripTrailingWhitespace (buffer, position, last))
                    modified = true;
            }
            finally
            {
                in.close ();
            }
            if (modified)
            {
                System.out.println (file.getAbsolutePath ());
                out = new FileOutputStream (file);
                try
                {
                    out.write (
                        buffer.toString ().getBytes (Page.DEFAULT_CHARSET));
                }
                finally
                {
                    // release the file handle even if the write fails
                    out.close ();
                }
            }
        }
        catch (Exception e)
        {
            System.out.println (e);
        }
    }

    /**
     * Remove the whitespace at the end of the line currently being built.
     * @param buffer The output so far, ending with the current line.
     * @param position The number of characters in the current line.
     * @param last The column of the last non-whitespace character in the
     * current line, or -1 if there is none.
     * @return <code>true</code> if any characters were removed.
     */
    private static boolean stripTrailingWhitespace (
        final StringBuilder buffer,
        final int position,
        final int last)
    {
        int excess;

        excess = position - last - 1;
        if (0 == excess)
            return (false);
        buffer.setLength (buffer.length () - excess);
        return (true);
    }

    /**
     * Implement a file filter.
     * Directories are always accepted so that the search can descend into
     * them; other files must match the regular expression.
     */
    class Filter implements FileFilter
    {
        /**
         * The compiled expression.
         */
        protected Pattern mExpression;

        /**
         * Create a file filter from the regular expression.
         * @param expression The regular expression.
         * A useful regular expression is ".*\.java" which accepts all
         * .java files.
         * @exception IllegalArgumentException If the expression is
         * <code>null</code>.
         * @exception PatternSyntaxException If the expression is not a valid
         * regular expression.
         */
        public Filter (final String expression)
            throws
                PatternSyntaxException
        {
            if (null == expression)
                throw new IllegalArgumentException (
                    "filter expression cannot be null");
            mExpression = Pattern.compile (expression);
        }

        //
        // FileFilter interface
        //

        /**
         * Tests whether or not the file should be included in a pathname list.
         * The expression must match the entire absolute path of a file.
         * @param pathname The abstract pathname to be tested.
         * @return <code>true</code> if and only if <code>pathname</code>
         * should be included.
         */
        public boolean accept (final File pathname)
        {
            Matcher matcher;

            // match directories
            if (pathname.isDirectory ())
                return (true);

            matcher = mExpression.matcher (pathname.getAbsolutePath ());
            return (matcher.matches ());
        }
    }

    /**
     * Run Tabby on a file or directory.
     * @param args The command line arguments.
     * <ul>
     * <li><code>args[0]</code> The file or directory to work on.</li>
     * <li><code>args[1]</code> Optional, the regular expression to use as
     * a file filter.</li>
     * <li><code>args[2]</code> Optional, the tab stop setting (integer).</li>
     * </ul>
     */
    public static void main (final String[] args)
    {
        Tabby tabby;

        if (0 == args.length)
        {
            System.out.println (
                  "usage: Tabby (<directory>|<file>)"
                + " [file-match regexp] [tabsize]");
            return;
        }

        // pick the constructor matching the number of optional arguments
        if (2 < args.length)
            tabby = new Tabby (args[1], Integer.parseInt (args[2]));
        else if (1 < args.length)
            tabby = new Tabby (args[1]);
        else
            tabby = new Tabby ();
        tabby.process (new File (args[0]));
    }
}

/*
 * Revision Control Modification History
 *
 * $Log: Tabby.java,v $
 * Revision 1.3  2005/03/13 14:51:44  derrickoswald
 * Bug #1121401 No Parsing with yahoo!
 * By default nio.charset.CharsetDecoder replaces characters it cannot
 * represent in the current encoding with zero, which was the value
 * returned by the page when the Stream reached EOF.
 * This changes the Page return value to (char)Source.EOF (-1) when
 * the end of stream is encountered.
 *
 * Revision 1.2  2004/07/31 16:42:34  derrickoswald
 * Remove unused variables and other fixes exposed by turning on compiler warnings.
 *
 * Revision 1.1  2003/09/10 03:38:26  derrickoswald
 * Add style checking target to ant build script:
 *     ant checkstyle
 * It uses a jar from http://checkstyle.sourceforge.net which is dropped in the lib directory.
 * The rules are in the file htmlparser_checks.xml in the src directory.
 *
 * Added lexerapplications package with Tabby as the first app. It performs whitespace manipulation
 * on source files to follow the style rules. This reduced the number of style violations to roughly 14,000.
 *
 * There are a few issues with the style checker that need to be resolved before it should be taken too seriously.
 * For example:
 * It thinks all method arguments should be final, even if they are modified by the code (which the compiler frowns on).
 * It complains about long lines, even when there is no possibility of wrapping the line, i.e. a URL in a comment
 * that's more than 80 characters long.
 * It considers all naked integers as 'magic numbers', even when they are obvious, i.e. the 4 corners of a box.
 * It complains about whitespace following braces, even in array initializers, i.e. X[][] = { {a, b} { } }
 *
 * But it points out some really interesting things, even if you don't agree with the style guidelines,
 * so it's worth a look.
 *
 *
 */