org.htmlparser.lexerapplications.tabby.Tabby Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of bboss-htmlparser Show documentation
Show all versions of bboss-htmlparser Show documentation
bboss is a J2EE framework that includes AOP/IoC, MVC, persistence, taglib, RPC, event handling, bean-XML serialization and so on. http://www.bbossgroups.com
The newest version!
// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexerapplications/tabby/Tabby.java,v $
// $Author: derrickoswald $
// $Date: 2005/03/13 14:51:44 $
// $Revision: 1.3 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.lexerapplications.tabby;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.htmlparser.lexer.Cursor;
import org.htmlparser.lexer.Page;
/**
 * Replace tabs with spaces.
 * Convert tabs to the correct number of spaces according to a tabstop,
 * change DOS \r\n line endings to Unix \n form, and remove trailing whitespace.
 * <p>Files are rewritten in place, and only when their content changed;
 * the absolute path of every rewritten file is printed to standard output.
 */
public class Tabby
{
    /**
     * The default tab stop spacing.
     */
    private static final int DEFAULT_TABSTOP = 4;

    /**
     * The file filter to apply.
     * When <code>null</code>, {@link File#listFiles(FileFilter)} accepts
     * every entry of a directory.
     */
    protected Filter mFilter;

    /**
     * The replacement tab stop size.
     */
    protected int mTabsize;

    /**
     * Creates a new instance of Tabby with no file filter and a tab stop of 4.
     */
    public Tabby ()
    {
        mFilter = null;
        mTabsize = DEFAULT_TABSTOP;
    }

    /**
     * Creates a new instance of Tabby using the given regular expression and
     * a tab stop of 4.
     * @param filter The regular expression to apply to the files searched.
     */
    public Tabby (final String filter)
    {
        this ();
        mFilter = new Filter (filter);
    }

    /**
     * Creates a new instance of Tabby.
     * @param filter The regular expression to apply to the files searched.
     * @param tabsize The tab stop setting.
     * @exception IllegalArgumentException If tabsize is not a positive number.
     */
    public Tabby (final String filter, final int tabsize)
        throws
            IllegalArgumentException
    {
        this (filter);
        // zero is rejected too: a tab stop of 0 would divide by zero in edit()
        if (0 >= tabsize)
            throw new IllegalArgumentException (
                "tab size must be positive, got " + tabsize);
        mTabsize = tabsize;
    }

    /**
     * Process the file or directory.
     * A directory is walked recursively, visiting the entries accepted by
     * the filter; anything else is handed to {@link #edit}.
     * @param file The file to process.
     */
    protected void process (final File file)
    {
        File[] files;

        if (file.isDirectory ())
        {
            files = file.listFiles (mFilter);
            // listFiles() yields null when the directory cannot be read
            if (null != files)
                for (int i = 0; i < files.length; i++)
                    process (files[i]);
        }
        else
            edit (file);
    }

    /**
     * Clean up a single file in place.
     * Tabs are expanded to the next tab stop, whitespace at the end of each
     * line (including a final line without a newline) is removed, and the
     * file is rewritten only if the result differs from what was read.
     * Any exception is caught and printed to standard output; it is not
     * propagated to the caller.
     * @param file The file to edit.
     */
    protected void edit (final File file)
    {
        FileInputStream in;
        Page page;
        Cursor cursor;
        int position;   // output column within the current line
        int expected;   // cursor position if exactly one char was consumed
        boolean modified;
        char ch;
        int last;       // column of the last non-whitespace char, -1 if none
        int trailing;   // count of whitespace chars at the end of the line
        StringBuilder buffer;
        byte[] bytes;
        FileOutputStream out;

        try
        {
            in = new FileInputStream (file);
            try
            {
                // sized inside the try so the stream is closed even if
                // available() fails
                buffer = new StringBuilder (in.available ());
                page = new Page (in, null);
                cursor = new Cursor (page, 0);
                position = 0;
                modified = false;
                expected = 0;
                last = -1;
                while (Page.EOF != (ch = page.getCharacter (cursor)))
                {
                    // The cursor moved by other than one position for this
                    // single character, so the output will differ from the
                    // input (presumably a \r\n pair collapsed by the page;
                    // confirm against Page.getCharacter).
                    if (++expected != cursor.getPosition ())
                    {
                        modified = true;
                        expected = cursor.getPosition ();
                    }
                    if ('\t' == ch)
                    {
                        // pad with spaces up to the next tab stop
                        do
                        {
                            buffer.append (' ');
                            position++;
                        }
                        while (0 != (position % mTabsize));
                        modified = true;
                    }
                    else if ('\n' == ch)
                    {
                        // check for whitespace on the end of the line
                        trailing = position - last - 1;
                        if (0 != trailing)
                        {
                            // remove trailing whitespace
                            buffer.setLength (buffer.length () - trailing);
                            modified = true;
                        }
                        buffer.append (ch);
                        position = 0;
                        last = -1;
                    }
                    else
                    {
                        buffer.append (ch);
                        if (!Character.isWhitespace (ch))
                            last = position;
                        position++;
                    }
                }
                // a final line without a newline can still end in whitespace
                trailing = position - last - 1;
                if (0 != trailing)
                {
                    buffer.setLength (buffer.length () - trailing);
                    modified = true;
                }
            }
            finally
            {
                in.close ();
            }
            if (modified)
            {
                // Encode before opening the output stream: opening it
                // truncates the file, so an encoding failure after that
                // point would lose the original content.
                bytes = buffer.toString ().getBytes (Page.DEFAULT_CHARSET);
                System.out.println (file.getAbsolutePath ());
                out = new FileOutputStream (file);
                try
                {
                    out.write (bytes);
                }
                finally
                {
                    out.close ();
                }
            }
        }
        catch (Exception e)
        {
            System.out.println (e);
        }
    }

    /**
     * Implement a file filter.
     * Directories are always accepted; other files are accepted when their
     * absolute path matches the regular expression in full.
     */
    class Filter implements FileFilter
    {
        /**
         * The compiled expression.
         */
        protected Pattern mExpression;

        /**
         * Create a file filter from the regular expression.
         * @param expression The regular expression.
         * A useful regular expression is ".*\.java" which accepts all
         * .java files.
         * @exception IllegalArgumentException If the expression is
         * <code>null</code>.
         * @exception PatternSyntaxException If the expression is not a valid
         * regular expression.
         */
        public Filter (final String expression)
            throws
                PatternSyntaxException
        {
            if (null == expression)
                throw new IllegalArgumentException (
                    "filter expression cannot be null");
            mExpression = Pattern.compile (expression);
        }

        //
        // FileFilter interface
        //

        /**
         * Tests whether or not the file should be included in a pathname list.
         * @param pathname The abstract pathname to be tested.
         * @return <code>true</code> if and only if <code>pathname</code>
         * should be included.
         */
        public boolean accept (final File pathname)
        {
            Matcher matcher;
            boolean ret;

            // match directories
            if (pathname.isDirectory ())
                ret = true;
            else
            {
                // the whole absolute path must match, not just the file name
                matcher = mExpression.matcher (pathname.getAbsolutePath ());
                ret = matcher.matches ();
            }

            return (ret);
        }
    }

    /**
     * Run Tabby on a file or directory.
     * With no arguments a usage message is printed and nothing is processed.
     * Exceptions thrown while building the Tabby instance (for example by
     * <code>Integer.parseInt</code> or by the constructors) are not caught.
     * @param args The command line arguments.
     * <pre>
     * args[0] The file or directory to work on.
     * args[1] Optional, the regular expression to use as a file filter
     * args[2] Optional, the tab stop setting (integer).
     * </pre>
     */
    public static void main (final String[] args)
    {
        Tabby tabby;

        if (0 == args.length)
            System.out.println (
                "usage: Tabby (<directory>|<file>)"
                + " [file-match regexp] [tabsize]");
        else
        {
            if (2 < args.length)
                tabby = new Tabby (args[1], Integer.parseInt (args[2]));
            else if (1 < args.length)
                tabby = new Tabby (args[1]);
            else
                tabby = new Tabby ();
            tabby.process (new File (args[0]));
        }
    }
}
/*
* Revision Control Modification History
*
* $Log: Tabby.java,v $
* Revision 1.3 2005/03/13 14:51:44 derrickoswald
* Bug #1121401 No Parsing with yahoo!
* By default nio.charset.CharsetDecoder replaces characters it cannot
* represent in the current encoding with zero, which was the value
* returned by the page when the Stream reached EOF.
* This changes the Page return value to (char)Source.EOF (-1) when
* the end of stream is encountered.
*
* Revision 1.2 2004/07/31 16:42:34 derrickoswald
* Remove unused variables and other fixes exposed by turning on compiler warnings.
*
* Revision 1.1 2003/09/10 03:38:26 derrickoswald
* Add style checking target to ant build script:
* ant checkstyle
* It uses a jar from http://checkstyle.sourceforge.net which is dropped in the lib directory.
* The rules are in the file htmlparser_checks.xml in the src directory.
*
* Added lexerapplications package with Tabby as the first app. It performs whitespace manipulation
* on source files to follow the style rules. This reduced the number of style violations to roughly 14,000.
*
* There are a few issues with the style checker that need to be resolved before it should be taken too seriously.
* For example:
* It thinks all method arguments should be final, even if they are modified by the code (which the compiler frowns on).
* It complains about long lines, even when there is no possibility of wrapping the line, i.e. a URL in a comment
* that's more than 80 characters long.
* It considers all naked integers as 'magic numbers', even when they are obvious, i.e. the 4 corners of a box.
* It complains about whitespace following braces, even in array initializers, i.e. X[][] = { {a, b} { } }
*
* But it points out some really interesting things, even if you don't agree with the style guidelines,
* so it's worth a look.
*
*
*/