All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.util.zip.GZIPMembersInputStream Maven / Gradle / Ivy

There is a newer version: 1.1.9
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.util.zip;
import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.zip.Inflater;
import java.util.zip.ZipException;

import com.google.common.io.ByteStreams;
import com.google.common.io.CountingInputStream;

/**
 * A replacement for GZIPInputStream; offers GZIP decompression, without any
 * artificial stop after the first member in a concatenated series (in 
 * pre-JDK6u23), and offers direct access to discovered GZIP member 
 * boundaries (in compressed offsets) via the getMemberNumber(), 
 * getCurrentMemberStart(), getCurrentMemberEnd() accessors, both pre- and 
 * post- JDK6u23 (but see below for caveat about getCurrentMemberEnd()). 
 * 
 * (This replaces our previous workaround, 'GzippedInputStream', for
 * pre-JDK6u23 GZIPInputStream behavior.)
 * 
 * By default, will read straight through members, returning all uncompressed
 * data from concatenated compressed members as one stream, per the 
 * JDK6u23-and-higher behavior. The data returned from a single 
 * read() will not straddle a member boundary, *but* only after reading
 * the first byte of the next member can certainty be offered as to 
 * whether the previous member ended. Thus, in this default mode, until
 * the end of all input, the getAtMemberEnd() method will always return 
 * false, and getCurrentMemberEnd() will always return -1, because any 
 * read that discovered a definitive member-end will have begun the next
 * member. In this mode, member-ends should be deduced by watching the 
 * increment of getMemberNumber(), and using the start of the current 
 * record as the (exclusive) end-position of the previous record. 
 * 
 * The setEofEachMember() method may be used to change behavior to mimic that 
 * of pre-6u23 GZIPInputStream: reaching the end of a GZIP member will result
 * in a returned EOF. When receiving this EOF, getAtMemberEnd() will return
 * true and getCurrentMemberEnd() will return the (exclusive) member-end
 * position. Calling nextMember() after receiving an EOF will allow reading
 * to proceed into the next member (if any). 
 * 
 *  @contributor gojomo
 */
public class GZIPMembersInputStream extends OpenJDK7GZIPInputStream {
    protected long memberNumber = 0; 
    protected long holdAtMemberNumber = Long.MAX_VALUE;
    protected long currentMemberStart = 0;
    protected long currentMemberEnd = -1; 
    protected InputStream originalIn;
    
    public GZIPMembersInputStream(InputStream in) throws IOException {
        this(in,512);
    }
    
    public GZIPMembersInputStream(InputStream in, int size)
            throws IOException {
        super(countingStream(in,size), size);
        originalIn = in;
    }

    /**
     * A CountingInputStream is inserted to read compressed-offsets. 
     * 
     * @param in stream to wrap
     * @param lookback tolerance of initial mark
     * @return original stream wrapped in CountingInputStream
     * @throws IOException
     */
    protected static InputStream countingStream(InputStream in, int lookback) throws IOException {
        CountingInputStream cin = new CountingInputStream(in);
        cin.mark(lookback); 
        return cin;
    }

    protected void updateInnerMark() {
        this.in.mark(buf.length);
    }

    @Override
    public int read(byte[] buf, int off, int len) throws IOException {
        if(currentMemberEnd>0) {
            if(memberNumber>=holdAtMemberNumber) {
                // only advance if allowed
                return -1; 
            }
            // note read past member boundary
            memberNumber++; 
            currentMemberStart = currentMemberEnd; 
            currentMemberEnd = -1; 
        }
        return super.read(buf, off, len);
    }
    
    @Override
    protected boolean readTrailer() throws IOException {
        int c = inf.getRemaining();
        currentMemberEnd = ((CountingInputStream)in).getCount()-(c-8); 
//        return super.readTrailer();
// REIMPLEMENTED TO FIX MISUSE OF available()
        InputStream in = this.in;
        int n = inf.getRemaining();
        if (n > 0) {
            in = new SequenceInputStream(
                        new ByteArrayInputStream(buf, len - n, n), in);
        }
        // Uses left-to-right evaluation order
        if ((readUInt(in) != crc.getValue()) ||
            // rfc1952; ISIZE is the input size modulo 2^32
            (readUInt(in) != (inf.getBytesWritten() & 0xffffffffL)))
            throw new ZipException("Corrupt GZIP trailer");

        // always try concatenated case; EOF or other IOException
        // will let us know if we're wrong
        int m = 8;                  // this.trailer
        try {
            m += readHeader(in);    // next.header
        } catch (IOException ze) {
            return true;  // ignore any malformed, do nothing
        }
        inf.reset();
        if (n > m)
            inf.setInput(buf, len - n + m, n - m);
        return false;
    }

    /**
     * Seek forward to a particular offset in the compressed stream. Note
     * that after any seek/skip the memberNumbers may not reflect a member's
     * true ordinal position from the beginning of the stream. 
     * 
     * @param position target position
     * @throws IOException
     */
    public void compressedSeek(long position) throws IOException {
        in.reset(); 
        long count = ((CountingInputStream)in).getCount();
        long delta = position - count;
        if(delta<0) {
            throw new IllegalArgumentException("can't seek backwards: seeked "+position+" already at "+count); 
        }
        compressedSkip(delta);
    }
    
    /**
     * Skip forward the given number of bytes in the compressed stream. Note
     * that after any seek/skip the memberNumbers may not reflect a member's
     * true ordinal position from the beginning of the stream. 
     * 
     * @param offset bytes to skip
     * @throws IOException
     * @throws EOFException 
     */
    public void compressedSkip(long offset) throws IOException {
        ByteStreams.skipFully(in, offset);
        updateInnerMark();
        currentMemberStart = ((CountingInputStream)in).getCount(); 
        currentMemberEnd = -1; 
        startNewMember();
    }
    
    protected void startNewMember() throws IOException {
        new GzipHeader(in); // consume header
        inf.reset(); 
        crc.reset(); 
        eos = false;
    }

    /**
     * Test whether last read resulted in reaching the exact end of one GZIP
     * member. 
     * 
     * @return true if exactly at member end
     */
    public boolean getAtMemberEnd() {
        return currentMemberEnd>0;
    }
    
    /**
     * Get the ordinal number, starting at zero, of the currently-being-read
     * GZIP member, counting from the creation of this stream. If reading
     * straight through, this will be an accurate index relative to all 
     * members in the underlying stream. If any seeks/skips have been used, 
     * the number will only be relative to the members actually read.
     * 
     * @return ordinal number of member-in-progres
     */
    public long getMemberNumber() {
        return memberNumber;
    }
    
    /**
     * Get the compressed offset where the current member began.
     * 
     * @return position in compressed stream where current member began
     */
    public long getCurrentMemberStart() {
        return currentMemberStart;
    }
    
    
    /**
     * Get the compressed offset where the current, just-completed member
     * ends. Only accurate after the read which finishes a member (when 
     * getAtMemberEnd returns true). Otherwise, returns -1 to indicate 
     * not-yet-found.
     * 
     * @return position in compressed stream where member just finished, or -1
     * if member end not yet reached
     */
    public long getCurrentMemberEnd() {
        return currentMemberEnd;
    }
    
    /**
     * Set stream behavior to match JDK 6u22-and-earlier behavior, where 
     * reaching the end of any one GZIP member results in EOFs from all 
     * read()s as if no more data is available. (However, nextMember() may
     * be used to advance to the next member.)
     * 
     * @param eofPerMember true to set EOF-each-member behavior
     */
    public void setEofEachMember(boolean eofPerMember) {
        holdAtMemberNumber = eofPerMember ? memberNumber : Long.MAX_VALUE;
    }
    
    /**
     * Advance to next member (if the stream has been set to return EOF at the 
     * end of each member). Each call before reaching the end of a member will 
     * cause one additional member boundary to be passed. (Has no effect if not
     * in EOF-each-member mode.)
     */
    public void nextMember() {
        if(holdAtMemberNumber of 
     * @deprecated for backward compatibility; better to use direct facilities in future
     */
    public Iterator memberIterator() {
        return new GZIPEnvelopeIterator(); 
    }
       
    /**
     * Provides iterator-ish interface to members in a concatenated multi-member
     * GZIP stream for backward compatibility with our prior workaround. Not 
     * exactly like a real iterator: hasNext() will only return an accurate 
     * result when the stream returned by the previous next() is read until EOF. 
     * Previous next() values can not be retained/reused (they are in fact the
     * same object as subsequent next() returns.)
     */
    public class GZIPEnvelopeIterator implements
    Iterator {
        {
            setEofEachMember(true);
        }
        
        @Override
        public boolean hasNext() {
            // because readTrailer also reads into next header 
            // resetting inflater when there's more content, this works
            return !inf.finished();
        }
    
        @Override
        public GZIPMembersInputStream next() {
            if(getAtMemberEnd()) {
                nextMember();
            }
            if(hasNext()) {
                return GZIPMembersInputStream.this;
            } else {
                throw new NoSuchElementException(); 
            }
        }
    
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy