org.armedbear.lisp.util.DecodingReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of abcl Show documentation
Common Lisp implementation running on the JVM
There is a newer version: 1.9.2
/*
 * DecodingReader.java
 *
 * Copyright (C) 2010 Erik Huelsmann
 * $Id$
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * As a special exception, the copyright holders of this library give you
 * permission to link this library with independent modules to produce an
 * executable, regardless of the license terms of these independent
 * modules, and to copy and distribute the resulting executable under
 * terms of your choice, provided that you also meet, for each linked
 * independent module, the terms and conditions of the license of that
 * module.  An independent module is a module which is not derived from
 * or based on this library.  If you modify this library, you may extend
 * this exception to your version of the library, but you are not
 * obligated to do so.  If you do not wish to do so, delete this
 * exception statement from your version.
 */

package org.armedbear.lisp.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;

import org.armedbear.lisp.Debug;

/** Reader which decodes characters from a byte stream and supports a
 * mid-stream change of character encoding, as needed by the
 * setExternalFormat operation in Stream.java.
 *
 * Bytes are read from the underlying stream into an internal byte
 * buffer and decoded on demand. Characters which are unread are
 * encoded back into bytes, so that they can be decoded again after
 * the charset has been changed.
 *
 * Note: extends PushbackReader, but only for its interface;
 * all methods are overridden.
 */
public class DecodingReader
    extends PushbackReader {

    // dummy reader which we need to call the Pushback constructor
    // because a null value won't work
    private static final Reader staticReader = new StringReader("");

    // contains the currently buffered bytes read from the stream;
    // the bytes between position() and limit() are waiting to be decoded
    private final ByteBuffer bbuf;

    // stream to read from, wrapped in a PushbackInputStream
    private final PushbackInputStream stream;

    // Decoder, used for decoding characters on the input stream
    private CharsetDecoder cd;

    // Encoder, used to put characters back on the input stream when unreading
    private CharsetEncoder ce;

    /** Creates a reader decoding the bytes of 'stream' using 'cs'.
     *
     * @param stream the byte stream to read from
     * @param size the size, in bytes, of both the internal byte buffer
     *             and the pushback buffer wrapped around 'stream'
     * @param cs the charset initially used for decoding
     */
    public DecodingReader(InputStream stream, int size, Charset cs) {
        super(staticReader); // pass a dummy stream value into the constructor

        // we need to be able to unread the byte buffer
        this.stream = new PushbackInputStream(stream, size);
        this.cd = cs.newDecoder();
        this.cd.onUnmappableCharacter(CodingErrorAction.REPLACE);
        this.cd.onMalformedInput(CodingErrorAction.REPLACE);
        this.ce = cs.newEncoder();
        bbuf = ByteBuffer.allocate(size);
        bbuf.flip();  // mark the buffer as 'needs refill'
    }

    /** Change the Charset used to decode bytes from the input stream
     * into characters.
     *
     * Bytes which are already buffered but not yet decoded will be
     * decoded using the new charset.
     */
    public final void setCharset(Charset cs) {
        this.cd = cs.newDecoder();
        this.cd.onUnmappableCharacter(CodingErrorAction.REPLACE);
        this.cd.onMalformedInput(CodingErrorAction.REPLACE);
        this.ce = cs.newEncoder();
    }

    /** Get the Charset used to decode bytes from the input stream. */
    public final Charset getCharset() {
        return this.cd.charset();
    }

    /** Closes the underlying input stream. */
    @Override
    public final void close() throws IOException {
        stream.close();
    }

    /** Not supported; always throws an IOException. */
    @Override
    public final void mark(int readAheadLimit) throws IOException {
        throw new IOException("mark/reset not supported.");
    }

    /** Returns false: mark/reset are not supported. */
    @Override
    public final boolean markSupported() {
        return false;
    }

    /** Returns true if there are undecoded bytes in the buffer or
     * the underlying stream reports available bytes.
     */
    @Override
    public final boolean ready() throws IOException {
        return stream.available() != 0 || bbuf.remaining() != 0;
    }

    /** Not supported; always throws an IOException. */
    @Override
    public final void reset() throws IOException {
        throw new IOException("reset/mark not supported.");
    }

    /** Skips 'n' characters, or as many as can be read off the stream
     * before its end.
     *
     * Returns the number of characters actually skipped
     *
     * @throws IllegalArgumentException if 'n' is negative
     */
    @Override
    public final long skip(long n) throws IOException {
        if (n < 0)
            throw new IllegalArgumentException("skip value is negative");

        char[] cbuf = new char[(int)Math.min(4096, n)];
        long m = n; // number of characters which still need to be skipped

        while (m > 0) {
            int r = read(cbuf, 0, (int)Math.min(cbuf.length, m));

            if (r < 0)
                return (n - m);

            // count down by what was actually read, which may be
            // less than what was requested
            m -= r;
        }

        return n;
    }

    /** Unread a single code point.
     *
     * Decomposes the code point into UTF-16 surrogate pairs
     * and unreads them using the char[] unreader function.
     *
     */
    @Override
    public final void unread(int c) throws IOException {
        char[] ch = Character.toChars(c);
        unread(ch, 0, ch.length);
    }

    /** Unread the character array into the reader.
     *
     * Encodes the characters in the array into bytes,
     * allowing the encoding to be changed before reading from
     * the stream again, using a different charset.
     *
     * @throws IOException if the characters can't be encoded in the
     *         current charset or the encoded bytes don't fit into
     *         the pushback buffers
     */
    @Override
    public final void unread(char[] cbuf, int off, int len) throws IOException {

        ByteBuffer tb = // temp buffer
            ce.encode(CharBuffer.wrap(cbuf, off, len));
        int needed = tb.remaining();

        if (needed > bbuf.position()) {
            // check before touching any state, so that a failed
            // unread leaves the reader unchanged
            if (needed > bbuf.capacity())
                throw new IOException("Pushback buffer overflow");

            // unread bbuf into the pushback input stream
            // in order to free up space for the content of 'tb'
            stream.unread(bbuf.array(),
                          bbuf.arrayOffset() + bbuf.position(),
                          bbuf.remaining());

            // Reuse the bytes which have already been encoded; the
            // encoder must not be invoked a second time here, because
            // it is in its 'flushed' state after the encode() above.
            bbuf.clear();
            bbuf.put(tb);
            bbuf.flip();
        } else {
            // Don't unread bbuf, since tb will fit in front of the
            // existing data
            int start = bbuf.position() - needed;
            for (int i = 0; i < needed; i++)
                bbuf.put(start + i, tb.get(i));

            bbuf.position(start);
        }
    }

    @Override
    public final void unread(char[] cbuf) throws IOException {
        unread(cbuf, 0, cbuf.length);
    }

    // fill bbuf, either when empty or when forced
    //
    // Returns false if the end of the stream was hit while trying
    // to refill, true otherwise. On return bbuf is ready for reading.
    private boolean ensureBbuf(boolean force) throws IOException {
        if (bbuf.remaining() == 0 || force) {
            // move the undecoded bytes to the front and
            // prepare for appending behind them
            bbuf.compact();

            int size = stream.available();
            if (size > bbuf.remaining() || size == 0)
                // by reading more than the available bytes when
                // none available, block only if we need to on
                // interactive streams
                size = bbuf.remaining();

            byte[] by = new byte[size];
            int c = stream.read(by);

            if (c < 0) {
                bbuf.flip();  // prepare bbuf for reading
                return false;
            }

            bbuf.put(by, 0, c);
            bbuf.flip();
        }
        return true;
    }

    /** Reads a single code point.
     *
     * Returns the code point read, or -1 if the end of the stream
     * was reached.
     */
    @Override
    public final int read() throws IOException {
        // read the first UTF-16 character
        char[] ch = new char[1];

        int i = read(ch, 0, 1);
        if (i < 0)
            return i;

        // if this is not a high surrogate,
        // it must be a character which doesn't need one
        if (! Character.isHighSurrogate(ch[0]))
            return ch[0];

        // save the high surrogate and read the low surrogate
        char high = ch[0];
        i = read(ch, 0, 1);
        if (i < 0)
            return i;

        // combine the two and return the resulting code point
        return Character.toCodePoint(high, ch[0]);
    }

    @Override
    public final int read(char[] cbuf, int off, int len) throws IOException {
        CharBuffer cb = CharBuffer.wrap(cbuf, off, len);
        return read(cb);
    }

    /** Decodes characters into 'cb' until it is full or the end
     * of the stream has been reached.
     *
     * Returns the number of characters stored, 0 if 'cb' has no
     * room to begin with, or -1 if no character could be read
     * because of the end of the stream.
     */
    @Override
    public final int read(CharBuffer cb) throws IOException {
        int len = cb.remaining();
        if (len == 0)
            // nothing was requested: that's not the same as end-of-stream
            return 0;

        boolean notEof = true;
        boolean forceRead = false;

        while (cb.remaining() > 0 && notEof) {
            int oldRemaining = cb.remaining();
            notEof = ensureBbuf(forceRead);
            CoderResult r = cd.decode(bbuf, cb, ! notEof);
            if (oldRemaining == cb.remaining()
                && CoderResult.OVERFLOW == r) {
                // if this happens, the decoding failed
                // but the bufs didn't advance. Advance
                // them manually and do manual replacing,
                // otherwise we loop endlessly. This occurs
                // at least when parsing latin1 files with
                // lowercase o-umlauts in them.
                // Note that this is at the moment copy-paste
                // with RandomAccessCharacterFile.read()
                cb.put('?');
                bbuf.get();
            }
            // the decoder wants more input: refill bbuf
            // even though it may not be empty
            forceRead = (CoderResult.UNDERFLOW == r);
        }
        if (cb.remaining() == len)
            return -1;
        else
            return len - cb.remaining();
    }

    @Override
    public final int read(char[] cbuf) throws IOException {
        return read(cbuf, 0, cbuf.length);
    }

}