/*
 * org.archive.format.gzip.GZIPDecoder Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of webarchive-commons Show documentation
 * There is a newer version: 1.1.9
 */
package org.archive.format.gzip;

import java.io.IOException;
import java.io.InputStream;
import java.util.zip.CRC32;

import org.archive.util.ByteOp;
import org.archive.util.io.CRCInputStream;


/**
 * Locates and parses GZIP member headers on an {@link InputStream}.
 *
 * <p>{@link #alignOnMagic3(InputStream)} scans forward for the 3-byte
 * sequence that starts a GZIP member (magic one, magic two, compression
 * method); {@link #parseHeader(InputStream, boolean)} then decodes the
 * header, optionally keeping the FNAME and FCOMMENT fields.
 */
public class GZIPDecoder implements GZIPConstants {
	/**
	 * Value returned by {@link #alignOnMagic3(InputStream)} when the stream
	 * was already at EOF on entry (no bytes were read at all).
	 */
	public static final long SEARCH_EOF_AT_START = Long.MIN_VALUE;

	/** Default upper bound for the retained FNAME / FCOMMENT fields. */
	private static final int DEFAULT_MAX_FIELD_SIZE = 1024;

	/** Size of the sliding window used while searching for the magic. */
	private static final int MAGIC_LENGTH = 3;

	/** Marker from {@link #bytesToKeep(byte[])}: the window is a match. */
	private static final int MAGIC_FOUND = -1;

	/** Max FNAME bytes to retain; a value &lt;= 0 discards the field. */
	private final int maxNameSize;
	/** Max FCOMMENT bytes to retain; a value &lt;= 0 discards the field. */
	private final int maxCommentSize;

	/** Creates a decoder retaining up to 1024 bytes of name and of comment. */
	public GZIPDecoder() {
		this(DEFAULT_MAX_FIELD_SIZE, DEFAULT_MAX_FIELD_SIZE);
	}

	/**
	 * Creates a decoder using the same limit for both name and comment.
	 *
	 * @param max limit applied to both fields; a value &lt;= 0 makes
	 * {@link #parseHeader(InputStream, boolean)} discard them
	 */
	public GZIPDecoder(int max) {
		this(max, max);
	}

	/**
	 * Creates a decoder with separate limits for name and comment.
	 *
	 * @param maxNameSize limit for FNAME; a value &lt;= 0 discards the field
	 * @param maxCommentSize limit for FCOMMENT; a value &lt;= 0 discards
	 * the field
	 */
	public GZIPDecoder(int maxNameSize, int maxCommentSize) {
		this.maxNameSize = maxNameSize;
		this.maxCommentSize = maxCommentSize;
	}

	/**
	 * Tests whether a result of {@link #alignOnMagic3(InputStream)} means
	 * the stream was at EOF before any byte was read.
	 *
	 * @param skipped value returned by {@code alignOnMagic3}
	 * @return true if {@code skipped} equals {@link #SEARCH_EOF_AT_START}
	 */
	public boolean alignedAtEOF(long skipped) {
		return skipped == SEARCH_EOF_AT_START;
	}

	/**
	 * Read bytes from InputStream argument until 3 bytes are found that appear
	 * to be the start of a GZIPHeader. Leaves the stream on the 4th byte,
	 * and returns the number of bytes skipped before finding the 3 bytes.
	 *
	 * <p>Short reads are handled: the 3-byte window is always completely
	 * filled (or EOF is reached) before it is inspected, so bytes are never
	 * judged on stale buffer content and the skipped count stays exact.
	 *
	 * @param is InputStream to read from
	 * @return number of bytes skipped before finding the gzip magic: 0 if the
	 * first 3 bytes matched. If no magic was found before an EOF, returns
	 * -1 * the number of bytes consumed before hitting the EOF. As a special
	 * case, if the stream was at EOF when the method is called, returns
	 * {@link #SEARCH_EOF_AT_START} (which is Long.MIN_VALUE)
	 *
	 * @throws IOException if the underlying stream fails
	 */
	public long alignOnMagic3(InputStream is) throws IOException {
		long bytesSkipped = 0;
		byte[] lookahead = new byte[MAGIC_LENGTH];
		int keep = 0;
		while (true) {
			// Slide the bytes retained from the previous window to the front.
			if (keep == 2) {
				lookahead[0] = lookahead[1];
				lookahead[1] = lookahead[2];
			} else if (keep == 1) {
				lookahead[0] = lookahead[2];
			}

			// Top the window up to 3 bytes; a single read() may legally
			// deliver fewer bytes than requested, so loop until full or EOF.
			int filled = keep;
			while (filled < MAGIC_LENGTH) {
				int amt = is.read(lookahead, filled, MAGIC_LENGTH - filled);
				if (amt < 0) {
					// Everything consumed so far, including the partial
					// window, was skipped without finding a header.
					long skippedBeforeEOF = bytesSkipped + filled;
					if (skippedBeforeEOF == 0) {
						return SEARCH_EOF_AT_START;
					}
					return -1 * skippedBeforeEOF;
				}
				filled += amt;
			}

			keep = bytesToKeep(lookahead);
			if (keep == MAGIC_FOUND) {
				return bytesSkipped;
			}
			bytesSkipped += (MAGIC_LENGTH - keep);
		}
	}

	/**
	 * Classifies a full 3-byte window.
	 *
	 * <p>Legend: 1 = gzip magic 1, 2 = gzip magic 2, ? = any byte.
	 * The backtracking rule is applied uniformly to every non-matching
	 * window, so overlapping candidates such as "1 1 2" are not lost.
	 *
	 * @param window exactly 3 freshly read bytes
	 * @return {@link #MAGIC_FOUND} if the window starts a gzip member,
	 * otherwise how many trailing bytes (0, 1 or 2) could still begin one
	 * and must be carried into the next window
	 */
	private static int bytesToKeep(byte[] window) {
		if (isMagicOne(window[0]) && isMagicTwo(window[1])
				&& GZIPHeader.isValidCompressionMethod(window[2])) {
			// 12C - found it
			return MAGIC_FOUND;
		}
		if (isMagicOne(window[1]) && isMagicTwo(window[2])) {
			// ?12 - the last two bytes may open a header
			return 2;
		}
		if (isMagicOne(window[2])) {
			// ??1 - the last byte may open a header
			return 1;
		}
		// no candidate left in this window
		return 0;
	}

	/**
	 * Compares as unsigned values on both sides, so the result does not
	 * depend on whether the constant is declared as a byte or an int.
	 */
	private static boolean isMagicOne(byte b) {
		return (b & 0xff) == (GZIP_MAGIC_ONE & 0xff);
	}

	/**
	 * Compares as unsigned values on both sides. The second magic byte has
	 * its high bit set in the gzip format, so a raw signed byte would never
	 * equal an int-typed constant (assumed int here, as the declaration is
	 * not in this file).
	 */
	private static boolean isMagicTwo(byte b) {
		return (b & 0xff) == (GZIP_MAGIC_TWO & 0xff);
	}

	/**
	 * Parses a complete GZIP header, including its first 3 bytes.
	 *
	 * @param origIn stream positioned on the first byte of the header
	 * @return the parsed header
	 * @throws GZIPFormatException if the header is malformed
	 * @throws IOException if the underlying stream fails
	 */
	public GZIPHeader parseHeader(InputStream origIn)
	throws GZIPFormatException, IOException {
		return parseHeader(origIn, false);
	}

	/**
	 * Parses a GZIP header from the stream.
	 *
	 * @param origIn stream to read the header from
	 * @param assume1st3 if true, the first 3 header bytes are taken to have
	 * been consumed already (e.g. by {@link #alignOnMagic3(InputStream)})
	 * and the header CRC is seeded with the first 3 bytes of
	 * {@code GZIPStaticHeader.DEFAULT_HEADER_DATA} in their place
	 * @return the parsed header; fileName / comment are null when the
	 * corresponding limit is &lt;= 0, while their lengths are still recorded
	 * @throws GZIPFormatException if the header CRC16 does not match
	 * ("HEADER CRC ERROR"), or as raised by the header component parsers
	 * @throws IOException if the underlying stream fails
	 */
	public GZIPHeader parseHeader(InputStream origIn, boolean assume1st3)
		throws GZIPFormatException, IOException {

		// Wrap in a CRC-tracking stream in case the header has the FHCRC flag.
		CRCInputStream is;
		GZIPStaticHeader staticHeader;
		if (assume1st3) {
			// The first 3 bytes never pass through our stream, so fold
			// their assumed values into the checksum up front.
			CRC32 crc = new CRC32();
			crc.update(GZIPStaticHeader.DEFAULT_HEADER_DATA, 0, 3);
			is = new CRCInputStream(origIn, crc);
			staticHeader = new GZIPStaticHeader(is, true);
		} else {
			is = new CRCInputStream(origIn);
			staticHeader = new GZIPStaticHeader(is);
		}
		GZIPHeader header = new GZIPHeader(staticHeader);

		// FEXTRA:
		if (staticHeader.isFExtraSet()) {
			header.records = new GZIPFExtraRecords(is);
		}
		// FNAME:
		if (staticHeader.isFNameSet()) {
			if (maxNameSize > 0) {
				// TODO: like FCOMMENT below, presumably fails with an
				//       IOException when the name exceeds maxNameSize
				header.fileName = ByteOp.readToNull(is, maxNameSize);
				header.fileNameLength = header.fileName.length;
			} else {
				header.fileName = null;
				header.fileNameLength = ByteOp.discardToNull(is);
			}
		}
		// FCOMMENT:
		if (staticHeader.isFCommentSet()) {
			if (maxCommentSize > 0) {
				// TODO: if maxsize is too small, this throws IOException
				//       which will do bad things to our parse up the foodchain
				header.comment = ByteOp.readToNull(is, maxCommentSize);
				header.commentLength = header.comment.length;
			} else {
				header.comment = null;
				header.commentLength = ByteOp.discardToNull(is);
			}
		}
		// FHCRC:
		if (staticHeader.isFHCRCSet()) {
			// The CRC16 covers the header bytes up to, but not including,
			// the CRC16 field itself. Snapshot the running checksum BEFORE
			// reading the stored value through the CRC-tracking stream
			// (assumes CRCInputStream checksums every byte read through it).
			int wantCRC16 = (int) (is.getCRCValue() & 0xffff);
			header.crc = ByteOp.readShort(is);
			if (wantCRC16 != header.crc) {
				throw new GZIPFormatException("HEADER CRC ERROR");
			}
		}
		return header;
	}
}