All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.htmlparser.lexer.InputStreamSource Maven / Gradle / Ivy

// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/InputStreamSource.java,v $
// $Author: derrickoswald $
// $Date: 2005/05/15 11:49:04 $
// $Revision: 1.7 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.lexer;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.UnsupportedEncodingException;

import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.ParserException;

/**
 * A source of characters based on an InputStream such as from a URLConnection.
 */
public class InputStreamSource
    extends
        Source
{
    /**
     * An initial buffer size.
     * Has a default value of {@value}.
     */
    public static int BUFFER_SIZE = 16384;

    /**
     * The stream of bytes.
     * Set to null when the source is closed.
     */
    protected transient InputStream mStream;

    /**
     * The character set in use.
     */
    protected String mEncoding;

    /**
     * The converter from bytes to characters.
     */
    protected transient InputStreamReader mReader;

    /**
     * The characters read so far.
     */
    protected char[] mBuffer;

    /**
     * The number of valid bytes in the buffer.
     */
    protected int mLevel;

    /**
     * The offset of the next byte returned by read().
     */
    protected int mOffset;

    /**
     * The bookmark.
     */
    protected int mMark;

    /**
     * Create a source of characters using the default character set.
     * @param stream The stream of bytes to use.
     * @exception UnsupportedEncodingException If the default character set
     * is unsupported.
     */
    public InputStreamSource (InputStream stream)
        throws
            UnsupportedEncodingException
    {
        this (stream, null, BUFFER_SIZE);
    }

    /**
     * Create a source of characters.
     * @param stream The stream of bytes to use.
     * @param charset The character set used in encoding the stream.
     * @exception UnsupportedEncodingException If the character set
     * is unsupported.
     */
    public InputStreamSource (InputStream stream, String charset)
        throws
            UnsupportedEncodingException
    {
        this (stream, charset, BUFFER_SIZE);
    }

    /**
     * Create a source of characters.
     * @param stream The stream of bytes to use.
     * @param charset The character set used in encoding the stream.
     * @param size The initial character buffer size.
     * @exception UnsupportedEncodingException If the character set
     * is unsupported.
     */
    public InputStreamSource (InputStream stream, String charset, int size)
        throws
            UnsupportedEncodingException
    {
        if (null == stream)
            stream = new Stream (null);
        else
            // bug #1044707 mark()/reset() issues
            if (!stream.markSupported ())
                // wrap the stream so we can reset
                stream = new Stream (stream);
            // else
                // just because mark is supported doesn't guarantee
                // proper reset operation; there is no call to mark
                // in this code, so if reset misbehaves there is an
                // appropriate message in setEncoding() to suggest
                // wraping it in a Stream.
                // This was deemed better than an attempt to call
                // reset at this point just to check if we would
                // succeed later, or to call mark with an arbitrary
                // lookahead size
        mStream = stream;
        if (null == charset)
        {
            mReader = new InputStreamReader (stream);
            mEncoding = mReader.getEncoding ();
        }
        else
        {
            mEncoding = charset;
            mReader = new InputStreamReader (stream, charset);
        }
        mBuffer = new char[size];
        mLevel = 0;
        mOffset = 0;
        mMark = -1;
    }

    //
    // Serialization support
    //

    /**
     * Serialization support.
     * @param out Where to write this object.
     * @exception IOException If serialization has a problem.
     */
    private void writeObject (ObjectOutputStream out)
        throws
            IOException
    {
        int offset;
        char[] buffer;

        if (null != mStream)
        {
            // remember the offset, drain the input stream, restore the offset
            offset = mOffset;
            buffer = new char[4096];
            while (EOF != read (buffer))
                ;
            mOffset = offset;
        }

        out.defaultWriteObject ();
    }

    /**
     * Deserialization support.
     * @param in Where to read this object from.
     * @exception IOException If deserialization has a problem.
     */
    private void readObject (ObjectInputStream in)
        throws
            IOException,
            ClassNotFoundException
    {
        in.defaultReadObject ();
        if (null != mBuffer) // buffer is null when destroy's been called
            // pretend we're open, mStream goes null when exhausted
            mStream = new ByteArrayInputStream (new byte[0]);
    }

    /**
     * Get the input stream being used.
     * @return The current input stream.
     */
    public InputStream getStream ()
    {
        return (mStream);
    }

    /**
     * Get the encoding being used to convert characters.
     * @return The current encoding.
     */
    public String getEncoding ()
    {
        return (mEncoding);
    }

    /**
     * Begins reading from the source with the given character set.
     * If the current encoding is the same as the requested encoding,
     * this method is a no-op. Otherwise any subsequent characters read from
     * this page will have been decoded using the given character set.

* Some magic happens here to obtain this result if characters have already * been consumed from this source. * Since a Reader cannot be dynamically altered to use a different character * set, the underlying stream is reset, a new Source is constructed * and a comparison made of the characters read so far with the newly * read characters up to the current position. * If a difference is encountered, or some other problem occurs, * an exception is thrown. * @param character_set The character set to use to convert bytes into * characters. * @exception ParserException If a character mismatch occurs between * characters already provided and those that would have been returned * had the new character set been in effect from the beginning. An * exception is also thrown if the underlying stream won't put up with * these shenanigans. */ public void setEncoding (String character_set) throws ParserException { String encoding; InputStream stream; char[] buffer; int offset; char[] new_chars; encoding = getEncoding (); if (!encoding.equalsIgnoreCase (character_set)) { stream = getStream (); try { buffer = mBuffer; offset = mOffset; stream.reset (); try { mEncoding = character_set; mReader = new InputStreamReader (stream, character_set); mBuffer = new char[mBuffer.length]; mLevel = 0; mOffset = 0; mMark = -1; if (0 != offset) { new_chars = new char[offset]; if (offset != read (new_chars)) throw new ParserException ("reset stream failed"); for (int i = 0; i < offset; i++) if (new_chars[i] != buffer[i]) throw new EncodingChangeException ("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString (new_chars[i], 16) + "] != old: " + " [0x" + Integer.toString (buffer[i], 16) + buffer[i] + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i); } } catch (IOException ioe) { throw new ParserException (ioe.getMessage (), ioe); } } catch (IOException ioe) { // bug #1044707 mark()/reset() issues throw new ParserException ("Stream reset failed (" + ioe.getMessage () + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe); } } } /** * Fetch more characters from the underlying reader. * Has no effect if the underlying reader has been drained. * @param min The minimum to read. * @exception IOException If the underlying reader read() throws one. */ protected void fill (int min) throws IOException { char[] buffer; int size; int read; if (null != mReader) // mReader goes null when it's been sucked dry { size = mBuffer.length - mLevel; // available space if (size < min) // oops, better get some buffer space { // unknown length... keep doubling size = mBuffer.length * 2; read = mLevel + min; if (size < read) // or satisfy min, whichever is greater size = read; else min = size - mLevel; // read the max buffer = new char[size]; } else { buffer = mBuffer; min = size; } // read into the end of the 'new' buffer read = mReader.read (buffer, mLevel, min); if (EOF == read) { mReader.close (); mReader = null; } else { if (mBuffer != buffer) { // copy the bytes previously read System.arraycopy (mBuffer, 0, buffer, 0, mLevel); mBuffer = buffer; } mLevel += read; } // todo, should repeat on read shorter than original min } } // // Reader overrides // /** * Does nothing. * It's supposed to close the source, but use destroy() instead. * @exception IOException not used * @see #destroy */ public void close () throws IOException { } /** * Read a single character. * This method will block until a character is available, * an I/O error occurs, or the end of the stream is reached. * @return The character read, as an integer in the range 0 to 65535 * (0x00-0xffff), or {@link #EOF EOF} if the end of the stream has * been reached * @exception IOException If an I/O error occurs. */ public int read () throws IOException { int ret; if (mLevel - mOffset < 1) { if (null == mStream) throw new IOException ("source is closed"); fill (1); if (mOffset >= mLevel) ret = EOF; else ret = mBuffer[mOffset++]; } else ret = mBuffer[mOffset++]; return (ret); } /** * Read characters into a portion of an array. This method will block * until some input is available, an I/O error occurs, or the end of the * stream is reached. * @param cbuf Destination buffer * @param off Offset at which to start storing characters * @param len Maximum number of characters to read * @return The number of characters read, or {@link #EOF EOF} if the end of * the stream has been reached * @exception IOException If an I/O error occurs. */ public int read (char[] cbuf, int off, int len) throws IOException { int ret; if (null == mStream) throw new IOException ("source is closed"); if ((null == cbuf) || (0 > off) || (0 > len)) throw new IOException ("illegal argument read (" + ((null == cbuf) ? "null" : "cbuf") + ", " + off + ", " + len + ")"); if (mLevel - mOffset < len) fill (len - (mLevel - mOffset)); // minimum to satisfy this request if (mOffset >= mLevel) ret = EOF; else { ret = Math.min (mLevel - mOffset, len); System.arraycopy (mBuffer, mOffset, cbuf, off, ret); mOffset += ret; } return (ret); } /** * Read characters into an array. * This method will block until some input is available, an I/O error occurs, * or the end of the stream is reached. * @param cbuf Destination buffer. * @return The number of characters read, or {@link #EOF EOF} if the end of * the stream has been reached. * @exception IOException If an I/O error occurs. */ public int read (char[] cbuf) throws IOException { return (read (cbuf, 0, cbuf.length)); } /** * Reset the source. * Repositions the read point to begin at zero. * @exception IllegalStateException If the source has been closed. */ public void reset () throws IllegalStateException { if (null == mStream) throw new IllegalStateException ("source is closed"); if (-1 != mMark) mOffset = mMark; else mOffset = 0; } /** * Tell whether this source supports the mark() operation. * @return true. */ public boolean markSupported () { return (true); } /** * Mark the present position in the source. * Subsequent calls to {@link #reset()} * will attempt to reposition the source to this point. * @param readAheadLimit Not used. * @exception IOException If the source is closed. * */ public void mark (int readAheadLimit) throws IOException { if (null == mStream) throw new IOException ("source is closed"); mMark = mOffset; } /** * Tell whether this source is ready to be read. * @return true if the next read() is guaranteed not to block * for input, false otherwise. * Note that returning false does not guarantee that the next read will block. * @exception IOException If the source is closed. */ public boolean ready () throws IOException { if (null == mStream) throw new IOException ("source is closed"); return (mOffset < mLevel); } /** * Skip characters. * This method will block until some characters are available, * an I/O error occurs, or the end of the stream is reached. * Note: n is treated as an int * @param n The number of characters to skip. * @return The number of characters actually skipped * @exception IllegalArgumentException If n is negative. * @exception IOException If an I/O error occurs. */ public long skip (long n) throws IOException, IllegalArgumentException { long ret; if (null == mStream) throw new IOException ("source is closed"); if (0 > n) throw new IllegalArgumentException ("cannot skip backwards"); else { if (mLevel - mOffset < n) fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request if (mOffset >= mLevel) ret = EOF; else { ret = Math.min (mLevel - mOffset, n); mOffset += ret; } } return (ret); } // // Methods not in your Daddy's Reader // /** * Undo the read of a single character. * @exception IOException If the source is closed or no characters have * been read. */ public void unread () throws IOException { if (null == mStream) throw new IOException ("source is closed"); if (0 < mOffset) mOffset--; else throw new IOException ("can't unread no characters"); } /** * Retrieve a character again. * @param offset The offset of the character. * @return The character at offset. * @exception IOException If the offset is beyond {@link #offset()} or the * source is closed. */ public char getCharacter (int offset) throws IOException { char ret; if (null == mStream) throw new IOException ("source is closed"); if (offset >= mBuffer.length) throw new IOException ("illegal read ahead"); else ret = mBuffer[offset]; return (ret); } /** * Retrieve characters again. * @param array The array of characters. * @param offset The starting position in the array where characters are to be placed. * @param start The starting position, zero based. * @param end The ending position * (exclusive, i.e. the character at the ending position is not included), * zero based. * @exception IOException If the start or end is beyond {@link #offset()} * or the source is closed. */ public void getCharacters (char[] array, int offset, int start, int end) throws IOException { if (null == mStream) throw new IOException ("source is closed"); System.arraycopy (mBuffer, start, array, offset, end - start); } /** * Retrieve a string. * @param offset The offset of the first character. * @param length The number of characters to retrieve. * @return A string containing the length characters at offset. * @exception IOException If the offset or (offset + length) is beyond * {@link #offset()} or the source is closed. */ public String getString (int offset, int length) throws IOException { String ret; if (null == mStream) throw new IOException ("source is closed"); if (offset + length >= mBuffer.length) throw new IOException ("illegal read ahead"); else ret = new String (mBuffer, offset, length); return (ret); } /** * Append characters already read into a StringBuffer. * @param buffer The buffer to append to. * @param offset The offset of the first character. * @param length The number of characters to retrieve. * @exception IOException If the offset or (offset + length) is beyond * {@link #offset()} or the source is closed. */ public void getCharacters (StringBuffer buffer, int offset, int length) throws IOException { if (null == mStream) throw new IOException ("source is closed"); buffer.append (mBuffer, offset, length); } /** * Close the source. * Once a source has been closed, further {@link #read() read}, * {@link #ready ready}, {@link #mark mark}, {@link #reset reset}, * {@link #skip skip}, {@link #unread unread}, * {@link #getCharacter getCharacter} or {@link #getString getString} * invocations will throw an IOException. * Closing a previously-closed source, however, has no effect. * @exception IOException If an I/O error occurs */ public void destroy () throws IOException { mStream = null; if (null != mReader) mReader.close (); mReader = null; mBuffer = null; mLevel = 0; mOffset = 0; mMark = -1; } /** * Get the position (in characters). * @return The number of characters that have already been read, or * {@link #EOF EOF} if the source is closed. */ public int offset () { int ret; if (null == mStream) ret = EOF; else ret = mOffset; return (ret); } /** * Get the number of available characters. * @return The number of characters that can be read without blocking or * zero if the source is closed. */ public int available () { int ret; if (null == mStream) ret = 0; else ret = mLevel - mOffset; return (ret); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy