// org.armedbear.lisp.util.RandomAccessCharacterFile Maven / Gradle / Ivy

// Go to download
/*
 * RandomAccessCharacterFile.java
 *
 * Copyright (C) 2008 Hideo at Yokohama
 * Copyright (C) 2008-2009 Erik Huelsmann
 * $Id$
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * As a special exception, the copyright holders of this library give you
 * permission to link this library with independent modules to produce an
 * executable, regardless of the license terms of these independent
 * modules, and to copy and distribute the resulting executable under
 * terms of your choice, provided that you also meet, for each linked
 * independent module, the terms and conditions of the license of that
 * module.  An independent module is a module which is not derived from
 * or based on this library.  If you modify this library, you may extend
 * this exception to your version of the library, but you are not
 * obligated to do so.  If you do not wish to do so, delete this
 * exception statement from your version.
 */

package org.armedbear.lisp.util;

import java.io.IOException;
import java.io.PushbackInputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.UnsupportedCharsetException;

import org.armedbear.lisp.Debug;

import static org.armedbear.lisp.Lisp.error;

import org.armedbear.lisp.SimpleError;
import org.armedbear.lisp.SimpleString;

public class RandomAccessCharacterFile {

    /**
     * Byte-oriented input view of the enclosing file.
     *
     * Every operation is forwarded to the enclosing
     * {@code RandomAccessCharacterFile}, so this stream shares its buffer and
     * file position with the other views.  The {@link PushbackInputStream}
     * superclass is constructed over a {@code null} stream; the overrides
     * below never consult the superclass's own pushback buffer.
     */
    private class RandomAccessInputStream extends PushbackInputStream {

        public RandomAccessInputStream() {
            super(null);
        }

        /** Reusable one-element array backing {@link #read()}. */
        private final byte[] singleByte = new byte[1];

        /**
         * Reads the next byte of the file.
         *
         * @return the byte as an unsigned value (0..255), or -1 at end of file
         */
        @Override
        public final int read() throws IOException {
            int n = read(singleByte);
            if (n == 1) {
                // byte is signed, int is signed: mask so that 0xff comes
                // back as 255 and cannot be confused with the -1 EOF marker.
                return singleByte[0] & 0xff;
            }
            return -1;
        }

        /**
         * Reads up to {@code len} bytes into {@code b} starting at {@code off}.
         *
         * @return the number of bytes read, 0 when {@code len} is not
         *         positive, or -1 when no byte could be read because the end
         *         of the file has been reached
         */
        @Override
        public final int read(byte[] b, int off, int len) throws IOException {
            if (len <= 0) {
                return 0;
            }
            int n = RandomAccessCharacterFile.this.read(b, off, len);
            // The enclosing read reports "nothing left" as a count of 0;
            // InputStream callers looping until -1 would otherwise spin forever.
            return (n > 0) ? n : -1;
        }

        /**
         * Steps the shared file position back by one byte.  The value of
         * {@code b} is not stored anywhere; the byte is re-read from the file.
         */
        @Override
        public final void unread(int b) throws IOException {
            RandomAccessCharacterFile.this.unreadByte((byte)b);
        }

        /** Steps the shared file position back by {@code len} bytes, one at a time. */
        @Override
        public final void unread(byte[] b, int off, int len) throws IOException {
            for (int i = 0; i < len; i++)
                this.unread(b[off+i]);
        }

        @Override
        public final void unread(byte[] b) throws IOException {
            this.unread(b, 0, b.length);
        }

        /**
         * Returns the number of bytes between the current position and the
         * end of the file, clamped to the range {@code 0..Integer.MAX_VALUE}
         * so that large files cannot produce a negative or wrapped value.
         */
        @Override
        public final int available() throws IOException {
            long remaining = RandomAccessCharacterFile.this.length()
                             - RandomAccessCharacterFile.this.position();
            if (remaining <= 0) {
                return 0;
            }
            return (int) Math.min(remaining, (long) Integer.MAX_VALUE);
        }

        /** Marking is not supported; this is a no-op. */
        @Override
        public final synchronized void mark(int readlimit) {
        }

        @Override
        public final boolean markSupported() {
            return false;
        }

        @Override
        public final synchronized void reset() throws IOException {
            throw new IOException("Operation not supported");
        }

        /**
         * Moves the shared file position by {@code n} bytes and returns
         * {@code n}.  The target is not checked against the file length.
         */
        @Override
        public final long skip(long n) throws IOException {
            RandomAccessCharacterFile.this.position(RandomAccessCharacterFile.this.position()+n);
            return n;
        }

        @Override
        public final int read(byte[] b) throws IOException {
            return this.read(b, 0, b.length);
        }

        /** Closes the enclosing file, and with it every other view of it. */
        @Override
        public final void close() throws IOException {
            RandomAccessCharacterFile.this.close();
        }
    }

    /**
     * Byte-oriented output view of the enclosing file.  All calls are handed
     * to the enclosing {@code RandomAccessCharacterFile}; closing this stream
     * closes the enclosing file.
     */
    private class RandomAccessOutputStream extends OutputStream {

        /** Reusable one-element array backing {@link #write(int)}. */
        private final byte[] oneByte = new byte[1];

        RandomAccessOutputStream() {
        }

        /** Writes the low-order eight bits of {@code b}. */
        @Override
        public final void write(int b) throws IOException {
            oneByte[0] = (byte) b;
            write(oneByte, 0, 1);
        }

        /** Writes the whole array. */
        @Override
        public final void write(byte[] b) throws IOException {
            write(b, 0, b.length);
        }

        /** Writes {@code len} bytes of {@code b}, starting at {@code off}. */
        @Override
        public final void write(byte[] b, int off, int len) throws IOException {
            RandomAccessCharacterFile.this.write(b, off, len);
        }

        @Override
        public final void flush() throws IOException {
            RandomAccessCharacterFile.this.flush();
        }

        @Override
        public final void close() throws IOException {
            RandomAccessCharacterFile.this.close();
        }
    }

    // Placeholder reader shared by all RandomAccessReader instances.
    // It exists only to be passed to the PushbackReader constructor, for
    // which a null argument will not work.
    static Reader staticReader = new StringReader("");

    /**
     * Character-oriented input view of the enclosing file.
     *
     * Decoding, buffering and positioning are all done by the enclosing
     * {@code RandomAccessCharacterFile}; the {@link PushbackReader}
     * superclass is used for its type only.
     */
    private class RandomAccessReader extends PushbackReader {

        RandomAccessReader() {
                // the overrides below all delegate to the enclosing file,
                // so staticReader is only a constructor placeholder
                super(staticReader);
        }

        /** Closes the enclosing file, and with it every other view of it. */
        @Override
        public final void close() throws IOException {
            RandomAccessCharacterFile.this.close();
        }

        /** Reusable one-element array backing {@link #read()}. */
        private final char[] singleChar = new char[1];

        /**
         * Reads a single {@code char}.
         *
         * As specified for {@link java.io.Reader#read()}, the value is a
         * UTF-16 code unit in the range 0..65535, not a code point.
         * NOTE(review): callers that need whole code points have to combine
         * surrogate pairs themselves; how the enclosing decoder behaves when
         * only one char of room is offered for a supplementary character
         * should be confirmed.
         *
         * @return the char read, or -1 at end of file
         */
        @Override
        public final int read() throws IOException {
            int n = this.read(singleChar);
            return (n == 1) ? singleChar[0] : -1;
        }

        /**
         * Steps the shared file position back by the encoded length of
         * {@code c}.  The character itself is not stored.
         */
        @Override
        public final void unread(int c) throws IOException {
            RandomAccessCharacterFile.this.unreadChar((char)c);
        }

        /** Un-reads {@code len} chars of {@code cbuf}, one at a time. */
        @Override
        public final void unread(char[] cbuf, int off, int len) throws IOException {
            for (int i = 0; i < len; i++)
                this.unread(cbuf[off+i]);
        }

        @Override
        public final void unread(char[] cbuf) throws IOException {
            this.unread(cbuf, 0, cbuf.length);
        }

        /**
         * Reads characters into {@code target}, attempting to fill its
         * remaining space.
         *
         * @return the number of chars added to {@code target}, 0 if it had
         *         no room, or -1 at end of file
         * @throws java.nio.ReadOnlyBufferException if {@code target} is read-only
         */
        @Override
        public final int read(CharBuffer target) throws IOException {
            // Check before reading so no input is consumed and then lost.
            if (target.isReadOnly()) {
                throw new java.nio.ReadOnlyBufferException();
            }
            int room = target.remaining();
            if (room == 0) {
                return 0;
            }
            char[] chunk = new char[room];
            int n = RandomAccessCharacterFile.this.read(chunk, 0, room);
            if (n > 0) {
                target.put(chunk, 0, n);
            }
            return n;
        }

        @Override
        public final int read(char[] cbuf) throws IOException {
            return RandomAccessCharacterFile.this.read(cbuf, 0, cbuf.length);
        }

        @Override
        public final int read(char[] cb, int off, int len) throws IOException {
            return RandomAccessCharacterFile.this.read(cb, off, len);
        }

        /** Always reports {@code true}. */
        @Override
        public final boolean ready() throws IOException {
            return true;
        }
    }

    /**
     * Character-oriented output view of the enclosing file.  Each call is
     * passed straight to the enclosing {@code RandomAccessCharacterFile}.
     */
    private class RandomAccessWriter extends Writer {

        RandomAccessWriter() {
        }

        /** Encodes and writes {@code len} chars of {@code cb}, starting at {@code off}. */
        @Override
        public final void write(char[] cb, int off, int len) throws IOException {
            RandomAccessCharacterFile.this.write(cb, off, len);
        }

        @Override
        public final void flush() throws IOException {
            RandomAccessCharacterFile.this.flush();
        }

        /** Closes the enclosing file, and with it every other view of it. */
        @Override
        public final void close() throws IOException {
            RandomAccessCharacterFile.this.close();
        }
    }


    // Capacity, in bytes, of the file cache bbuf.
    final static int BUFSIZ = 4*1024; // setting this to a small value like 8 is helpful for testing.

    // The four views returned by the get*() accessors.  Each is created once
    // in the constructor and delegates to the methods of this class.
    private RandomAccessWriter writer;
    private RandomAccessReader reader;
    private RandomAccessInputStream inputStream;
    private RandomAccessOutputStream outputStream;
    // Channel obtained from the RandomAccessFile given to the constructor.
    private FileChannel fcn;

    // Charset chosen by setEncoding(), with the encoder and decoder made from it.
    private Charset cset;
    private CharsetEncoder cenc;
    private CharsetDecoder cdec;

    /**
     * bbuf is treated as a cache of the file content.
     * If it points to somewhere in the middle of the file, it holds the copy of the file content,
     * even when you are writing a large chunk of data.  If you write in the middle of a file,
     * bbuf first gets filled with contents of the data, and only after that any new data is
     * written on bbuf.
     * The exception is when you are appending data at the end of the file.
     */
    private ByteBuffer bbuf;
    private boolean bbufIsDirty; /* whether bbuf holds data that must be written. */
    private boolean bbufIsReadable; /* whether bbuf.remaining() contains readable content. */
    private long bbufpos; /* where the beginning of bbuf is pointing in the file now. */

    /**
     * Creates a character file on top of {@code raf}.
     *
     * @param raf      the file to read from and write to; its channel's
     *                 current position becomes the initial position
     * @param encoding charset name, or {@code null} for the platform default
     * @throws IOException if the channel position cannot be obtained
     */
    public RandomAccessCharacterFile(RandomAccessFile raf, String encoding) throws IOException {
        this.fcn = raf.getChannel();
        setEncoding(encoding);

        // Begin with an empty buffer in "readable" state: flipping a fresh
        // buffer leaves position == limit == 0, i.e. nothing cached yet
        // and nothing waiting to be written.
        this.bbuf = ByteBuffer.allocate(BUFSIZ);
        this.bbuf.flip();
        this.bbufIsReadable = true;
        this.bbufIsDirty = false;
        this.bbufpos = this.fcn.position();

        // The four views share this object's buffer and position.
        this.reader = new RandomAccessReader();
        this.writer = new RandomAccessWriter();
        this.inputStream = new RandomAccessInputStream();
        this.outputStream = new RandomAccessOutputStream();
    }

    /**
     * Selects the charset used for decoding and encoding and creates a fresh
     * decoder and encoder for it.  The decoder replaces malformed and
     * unmappable input instead of reporting it.
     *
     * A name that the JVM does not support, or that is not a legal charset
     * name at all, is reported through {@code error(...)} as
     * "Undefined encoding".
     * NOTE(review): {@code error} presumably does not return normally; if it
     * did, the previous charset (or {@code null}) would be used below.
     *
     * @param encoding charset name, or {@code null} for the platform default
     */
    public void setEncoding(String encoding) {
        if (encoding == null) {
            cset = Charset.defaultCharset();
        } else {
            try {
                cset = Charset.forName(encoding);
            } catch (UnsupportedCharsetException e) {
                error(new SimpleError("Undefined encoding: " + encoding));
            } catch (java.nio.charset.IllegalCharsetNameException e) {
                // Syntactically invalid names (e.g. containing a space) are a
                // different exception type; report them the same way rather
                // than letting a raw Java exception escape.
                error(new SimpleError("Undefined encoding: " + encoding));
            }
        }
        cdec = cset.newDecoder();
        cdec.onMalformedInput(CodingErrorAction.REPLACE);
        cdec.onUnmappableCharacter(CodingErrorAction.REPLACE);
        cenc = cset.newEncoder();
    }

    /** Returns the character-output view of this file (always the same instance). */
    public Writer getWriter() {
        return this.writer;
    }

    /** Returns the character-input view of this file (always the same instance). */
    public PushbackReader getReader() {
        return this.reader;
    }

    /** Returns the byte-input view of this file (always the same instance). */
    public PushbackInputStream getInputStream() {
        return this.inputStream;
    }

    /** Returns the byte-output view of this file (always the same instance). */
    public OutputStream getOutputStream() {
        return this.outputStream;
    }

    /**
     * Writes out any pending data and closes the underlying channel.
     *
     * The channel is closed even when the final flush fails, so a write
     * error cannot leak the file handle.
     *
     * @throws IOException if flushing or closing fails
     */
    public final void close() throws IOException {
        try {
            internalFlush(true);
        } finally {
            fcn.close();
        }
    }

    /**
     * Writes pending buffer contents to the channel and empties the buffer,
     * keeping the logical file position where it was.
     */
    public final void flush() throws IOException {
        // Same effect as internalFlush(false), which performs exactly this call.
        flushBbuf(false);
    }

    /**
     * Makes sure bbuf holds readable file content at the current position,
     * reading from the channel when necessary.
     *
     * A read from the channel happens when bbuf has nothing remaining, when
     * bbuf is not in readable state, or when {@code force} is set.  Unread
     * bytes of a readable, clean buffer are kept (compacted to the front)
     * and the newly read bytes are appended after them.
     *
     * @param force read more data even if unread bytes remain in bbuf
     * @return {@code false} if a read was attempted and the channel reported
     *         end of file; {@code true} otherwise.  Note that bbuf may still
     *         contain previously buffered bytes when {@code false} is returned.
     */
    private final boolean ensureReadBbuf(boolean force) throws IOException {
        boolean bufReady = true;

        if ((bbuf.remaining() == 0) || force || ! bbufIsReadable) {
            // need to read from the file.

            if (bbufIsDirty) {
                // write the pending bytes; flushBbuf also advances bbufpos
                // to the logical position before we refill from there.
                flushBbuf(false);
                bbuf.clear();
                bbufIsReadable = false;
            } else {
                // file offset just past the data bbuf currently represents
                int bbufEnd = bbufIsReadable ? bbuf.limit() : bbuf.position();
                fcn.position(bbufpos + bbufEnd);
                // the current logical position becomes the new start of bbuf
                bbufpos += bbuf.position();
                if (bbufIsReadable) {
                  // keep the unread bytes, moved to the front of the buffer
                  bbuf.compact();
                  bbufIsReadable = false;
                } else //must discard the junk bytes after bbuf.position()
                  bbuf.clear();
            }

            // append file content after whatever was kept, then switch
            // the buffer back to reading mode.
            bufReady = (fcn.read(bbuf) != -1);
            bbuf.flip();
            bbufIsReadable = true;
        }

        return bufReady;
    }


    /**
     * Decodes up to {@code len} characters from the current file position
     * into {@code cb}, starting at index {@code off}.
     *
     * @return the number of chars stored, 0 if {@code len} is 0, or -1 if
     *         no char could be produced because the end of the file was reached
     * @throws IndexOutOfBoundsException if {@code off}/{@code len} do not
     *         describe a valid range of {@code cb}
     */
    final int read(char[] cb, int off, int len) throws IOException {
        // wrap() validates off/len before anything else happens.
        CharBuffer cbuf = CharBuffer.wrap(cb, off, len);
        if (len == 0) {
            // A zero-length request is not an end-of-file condition.
            return 0;
        }
        boolean decodeWasUnderflow = false;
        boolean atEof = false;
        while (cbuf.hasRemaining() && ! atEof) {
            int oldRemaining = cbuf.remaining();
            // After an UNDERFLOW the decoder wants more bytes than bbuf
            // holds, so force a refill even if some bytes are left over.
            atEof = ! ensureReadBbuf(decodeWasUnderflow);
            CoderResult r = cdec.decode(bbuf, cbuf, atEof);
            if (oldRemaining == cbuf.remaining()
                && CoderResult.OVERFLOW == r) {
                // The decoder reported OVERFLOW without producing anything.
                // Consume one byte and emit a replacement manually,
                // otherwise we loop endlessly. This occurs
                // at least when parsing latin1 files with
                // lowercase o-umlauts in them.
                // Note that this is at the moment copy-paste
                // with DecodingReader.read()
                cbuf.put('?');
                bbuf.get();
            }
            decodeWasUnderflow = (CoderResult.UNDERFLOW == r);
            if (atEof) {
                // decode(..., true) puts the decoder into its end-of-input
                // state, from which a later decode(..., false) -- e.g. after
                // seeking back or after the file has grown -- throws
                // IllegalStateException.  Reset so it can be used again.
                cdec.reset();
            }
        }
        int produced = len - cbuf.remaining();
        return (produced == 0) ? -1 : produced;
    }

    /**
     * Encodes {@code len} chars of {@code cb}, starting at {@code off}, into
     * the buffer at the current position.  No flush is requested and the
     * encoder is not told that input has ended.
     */
    final void write(char[] cb, int off, int len) throws IOException {
        encodeAndWrite(CharBuffer.wrap(cb, off, len), false, false);
    }

    /**
     * Flushes buffered data.
     *
     * @param endOfFile {@code true} when called for closing: the flush then
     *                  goes through the encoding path with an empty input;
     *                  {@code false} for a plain buffer flush
     */
    private final void internalFlush(boolean endOfFile) throws IOException {
        if (!endOfFile) {
            flushBbuf(false);
            return;
        }
        CharBuffer nothing = CharBuffer.allocate(0);
        encodeAndWrite(nothing, true, true);
    }

    /**
     * Encodes the remaining chars of {@code cbuf} into bbuf, writing bbuf
     * out to the file whenever it fills up.
     *
     * @param cbuf      characters to encode; consumed from its position
     * @param flush     if {@code true}, write bbuf to the file once all
     *                  input has been encoded
     * @param endOfFile passed to the encoder as its end-of-input flag
     * @throws RACFUnmappableCharacterException if a character cannot be
     *         represented in the current charset
     * @throws RACFMalformedInputException if the input is not valid UTF-16,
     *         including a chunk that ends in an unpaired high surrogate
     */
    private final void encodeAndWrite(CharBuffer cbuf, boolean flush,
                                      boolean endOfFile) throws IOException {
        while (cbuf.remaining() > 0) {
            int before = cbuf.position();
            CoderResult r = cenc.encode(cbuf, bbuf, endOfFile);
            bbufIsDirty = true;
            if (CoderResult.OVERFLOW == r || bbuf.remaining() == 0) {
                // bbuf is full: write it out and continue in an empty buffer.
                flushBbuf(false);
                bbuf.clear();
                bbufIsReadable = false;
            }
            // On an error result the offending char sits at cbuf.position().
            // It must be fetched with the absolute get(int): charAt(int) is
            // relative to the position and would read the wrong char or
            // throw IndexOutOfBoundsException.
            if (r.isUnmappable()) {
                throw new RACFUnmappableCharacterException(cbuf.position(),
                                                           cbuf.get(cbuf.position()),
                                                           cset.name());
            }
            if (r.isMalformed()) {
                // We don't really expect Malformed, but not handling it
                // will cause an infinite loop if we don't...
                throw new RACFMalformedInputException(cbuf.position(),
                                                      cbuf.get(cbuf.position()),
                                                      cset.name());
            }
            if (r.isUnderflow() && cbuf.hasRemaining()
                && cbuf.position() == before) {
                // The encoder wants more input than this chunk contains (a
                // trailing high surrogate) and consumed nothing.  Retrying
                // cannot make progress, so report it instead of spinning.
                throw new RACFMalformedInputException(cbuf.position(),
                                                      cbuf.get(cbuf.position()),
                                                      cset.name());
            }
            // Otherwise UNDERFLOW is the normal condition where cbuf runs
            // out before bbuf is filled.
        }
        if (bbuf.position() > 0 && bbufIsDirty && flush) {
            flushBbuf(false);
        }
    }

    /**
     * Moves the logical file position to {@code newPosition}.
     *
     * Pending modifications are committed to the file first.  If the target
     * lies inside the data currently held in bbuf, only the buffer position
     * changes; otherwise the channel is repositioned and bbuf is emptied.
     *
     * @param newPosition the new position, as a byte offset into the file
     */
    public final void position(long newPosition) throws IOException {
        // commit pending writes (no-op if bbuf is clean); bbuf itself is kept
        flushBbuf(true);
        long bbufend = bbufpos // in case bbuf is readable, its contents is valid
            + (bbufIsReadable ? bbuf.limit() : bbuf.position()); // beyond position()
        if (newPosition >= bbufpos && newPosition < bbufend) {
            // near seek. within existing data of bbuf.
            if (!bbufIsReadable) { //rewinding. keep tail buffered.
              // what has been written so far becomes the readable content
              bbuf.limit(bbuf.position());
              bbufIsReadable = true;
            }
            bbuf.position((int)(newPosition - bbufpos));
        } else {
            fcn.position(newPosition);
            // far seek; drop the cached bytes (pending writes were
            // committed above)
            bbuf.clear();
            bbuf.flip(); // "there is no useful data on this buffer yet."
            bbufIsReadable = true;
            bbufpos = newPosition;
        }
    }

    /**
     * Returns the logical position within the file: the file offset of the
     * start of bbuf plus the current offset inside bbuf.
     */
    public final long position() throws IOException {
        long offsetInBuffer = bbuf.position();
        return bbufpos + offsetInBuffer;
    }

    /**
     * Returns the size of the file in bytes.  Pending modifications are
     * committed first so that they are included in the size.
     */
    public final long length() throws IOException {
        flushBbuf(true);
        long size = fcn.size();
        return size;
    }

    /**
     * Writes modified buffer contents to the file.
     *
     * With {@code commitOnly} set, the bytes before bbuf's position are
     * written but bbuf, its position and bbufpos are left untouched, so the
     * cached data stays usable; nothing happens at all if bbuf is clean.
     *
     * Without {@code commitOnly}, dirty contents are written, bbufpos is
     * advanced by bbuf's position, and bbuf is left empty in readable state.
     * This also happens (minus the write) when bbuf is clean.
     *
     * @param commitOnly write pending data but keep the buffer as it is
     */
    final void flushBbuf(boolean commitOnly) throws IOException {
        if (commitOnly && !bbufIsDirty)
            return;
        //otherwise, we do at least need to increase bbufpos

        // writes always start at the file offset of the beginning of bbuf
        fcn.position(bbufpos);

        // if the buffer is dirty, the modifications have to be
        // before position(): before re-positioning, this.position()
        // calls this function.
        if (commitOnly) {
            // flip a duplicate so that bbuf's own position and limit
            // are not disturbed; the duplicate covers bytes [0, position).
            ByteBuffer dup = bbuf.duplicate();
            dup.flip();
            fcn.write(dup);
            //ideally, should restore fcn.position(). but don't for performance.
//            fcn.position(fcn.position()-dup.position());
            bbufIsDirty = false; // nothing is pending any more
            return;
        }

        if (bbufIsDirty) {
          // write bytes [0, position); afterwards bbuf.position() is the
          // number of bytes the channel consumed
          bbuf.flip();
          fcn.write(bbuf);
        }

        // the start of the (now empty) buffer moves up to the logical position
        bbufpos += bbuf.position();
        bbuf.clear();
        bbuf.flip(); // there's no useable data in this buffer
        bbufIsDirty = false;
        bbufIsReadable = true;
    }

    /**
     * Copies up to {@code len} raw bytes from the current file position into
     * {@code b}, starting at index {@code off}.
     *
     * @return the number of bytes copied; this is 0, not -1, when the end of
     *         the file has already been reached
     */
    public final int read(byte[] b, int off, int len) throws IOException {
        int copied = 0;
        boolean eof = false;
        while (copied < len && !eof) {
            eof = !ensureReadBbuf(false);
            // take whatever the buffer currently offers, up to what is still wanted
            int chunk = Math.min(len - copied, bbuf.remaining());
            bbuf.get(b, off + copied, chunk);
            copied += chunk;
        }
        return copied;
    }

    // A method corresponding to the good ol' ungetc in C.
    // This function may fail when using (combined) character codes that use
    // escape sequences to switch between sub-codes.
    // ASCII, ISO-8859 series, any 8bit code are OK, all unicode variations are OK,
    // but applications of the ISO-2022 encoding framework can have trouble.
    // Example of such code is ISO-2022-JP which is used in Japanese e-mail.
    //
    // Scratch buffers for unreadChar(), allocated on first use.
    private CharBuffer singleCharBuf;
    private ByteBuffer shortByteBuf;

    /**
     * Moves the file position back by the number of bytes that {@code c}
     * occupies in the current encoding.
     *
     * Algorithm:
     *  1. encode c into bytes, to find out how many bytes it corresponds to
     *  2. move the position backwards that many bytes.
     *
     * We stop there and do not write the bytes into the buffer, assuming
     * they are the same as the original data.  Writing back different
     * characters would make the buffer dirty, and that would require write
     * permission on files that are merely being read by some tokenizer.
     *
     * @param c the character to push back; expected to be the one last read
     */
    public final void unreadChar(char c) throws IOException {
        // maxBytesPerChar() is a float; round up so a fractional maximum
        // cannot yield a buffer that is too small.
        int maxBytes = (int) Math.ceil(cenc.maxBytesPerChar());
        if (singleCharBuf == null) {
            singleCharBuf = CharBuffer.allocate(1);
        }
        // The encoder may have been replaced by setEncoding() since the
        // scratch buffer was created.  If it is too small the encode below
        // overflows and the position would silently not move.
        if (shortByteBuf == null || shortByteBuf.capacity() < maxBytes) {
            shortByteBuf = ByteBuffer.allocate(maxBytes);
        }
        singleCharBuf.clear();
        singleCharBuf.append(c);
        singleCharBuf.flip();
        shortByteBuf.clear();
        cenc.encode(singleCharBuf, shortByteBuf, false);
        int n = shortByteBuf.position();
        long pos = position() - n;
        position(pos);
    }

    /**
     * Moves the file position back by one byte.  The value of {@code b} is
     * not used; the byte will be re-read from the file.
     */
    public final void unreadByte(byte b) throws IOException {
        position(position() - 1);
    }

    /**
     * Copies {@code len} raw bytes of {@code b}, starting at {@code off},
     * into the buffer at the current position, writing the buffer out to the
     * file each time it becomes full.
     */
    final void write(byte[] b, int off, int len) throws IOException {
        final int end = off + len;
        int cursor = off;
        while (cursor < end) {
            if (!bbuf.hasRemaining()) {
                // no room left: write out and start over with an empty buffer
                flushBbuf(false);
                bbuf.clear();
                bbufIsReadable = false;
            }
            int chunk = Math.min(end - cursor, bbuf.remaining());
            bbuf.put(b, cursor, chunk);
            cursor += chunk;
            bbufIsDirty = true;
        }
    }
}