All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.io.HeaderedArchiveRecord Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.io;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.StatusLine;
import org.apache.commons.httpclient.util.EncodingUtil;
import org.archive.io.arc.ARCConstants;
import org.archive.util.LaxHttpParser;

/**
 * An ArchiveRecord whose content has a preamble of RFC822-like headers, e.g.
 * an ArchiveRecord holding an HTTP response that leads off with HTTP response
 * headers.  Use this ArchiveRecord decorator to get at the content headers and
 * the header/content demarcation.
 * 
 * @author stack
 * @author Olaf Freyer
 */
public class HeaderedArchiveRecord extends ArchiveRecord {
    private int contentHeadersLength = -1;
    private int statusCode = -1;
    
    /**
     * Http header bytes.
     * 
     * If non-null and bytes available, give out its contents before we
     * go back to the underlying stream.
     */
    private InputStream contentHeaderStream = null;
    
    /**
     * Content headers.
     * 
     * Only available after the reading of headers.
     */
    private Header [] contentHeaders = null;
    

    public HeaderedArchiveRecord(final ArchiveRecord ar) throws IOException {
        super(ar);
    }
    
    public HeaderedArchiveRecord(final ArchiveRecord ar,
            final boolean readContentHeader) throws IOException {
        super(ar);
        if (readContentHeader) {
            this.contentHeaderStream = readContentHeaders();
        }
    }
    
    /**
     * Skip over the the content headers if present.
     * 
     * Subsequent reads will get the body.
     * 
     * 

Calling this method in the midst of reading the header * will make for strange results. Otherwise, safe to call * at any time though before reading any of the record * content is only time that it makes sense. * *

After calling this method, you can call * {@link #getContentHeaders()} to get the read http header. * * @throws IOException */ public void skipHttpHeader() throws IOException { if (this.contentHeaderStream == null) { return; } // Empty the contentHeaderStream for (int available = this.contentHeaderStream.available(); this.contentHeaderStream != null && (available = this.contentHeaderStream.available()) > 0;) { // We should be in this loop once only we should only do this // buffer allocation once. byte[] buffer = new byte[available]; // The read nulls out httpHeaderStream when done with it so // need check for null in the loop control line. read(buffer, 0, available); } } public void dumpHttpHeader() throws IOException { dumpHttpHeader(System.out); } public void dumpHttpHeader(final PrintStream stream) throws IOException { if (this.contentHeaderStream == null) { return; } // Dump the httpHeaderStream to STDOUT for (int available = this.contentHeaderStream.available(); this.contentHeaderStream != null && (available = this.contentHeaderStream.available()) > 0;) { // We should be in this loop only once and should do this // buffer allocation once. byte[] buffer = new byte[available]; // The read nulls out httpHeaderStream when done with it so // need check for null in the loop control line. int read = read(buffer, 0, available); stream.write(buffer, 0, read); } } /** * Read header if present. Technique borrowed from HttpClient HttpParse * class. Using http parser code for now. Later move to more generic header * parsing code if there proves a need. * * @return ByteArrayInputStream with the http header in it or null if no * http header. * @throws IOException */ private InputStream readContentHeaders() throws IOException { // If judged a record that doesn't have an http header, return // immediately. 
if (!hasContentHeaders()) { return null; } byte [] statusBytes = LaxHttpParser.readRawLine(getIn()); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new IOException("Failed to read raw lie where one " + " was expected: " + new String(statusBytes)); } String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if (statusLine == null) { throw new NullPointerException("Expected status line is null"); } // TODO: Tighten up this test. boolean isHttpResponse = StatusLine.startsWithHTTP(statusLine); boolean isHttpRequest = false; if (!isHttpResponse) { isHttpRequest = statusLine.toUpperCase().startsWith("GET") || !statusLine.toUpperCase().startsWith("POST"); } if (!isHttpResponse && !isHttpRequest) { throw new UnexpectedStartLineIOException("Failed parse of " + "status line: " + statusLine); } this.statusCode = isHttpResponse? (new StatusLine(statusLine)).getStatusCode(): -1; // Save off all bytes read. Keep them as bytes rather than // convert to strings so we don't have to worry about encodings // though this should never be a problem doing http headers since // its all supposed to be ascii. ByteArrayOutputStream baos = new ByteArrayOutputStream(statusBytes.length + 4 * 1024); baos.write(statusBytes); // Now read rest of the header lines looking for the separation // between header and body. for (byte [] lineBytes = null; true;) { lineBytes = LaxHttpParser.readRawLine(getIn()); eolCharCount = getEolCharsCount(lineBytes); if (eolCharCount <= 0) { throw new IOException("Failed reading headers: " + ((lineBytes != null)? new String(lineBytes): null)); } // Save the bytes read. baos.write(lineBytes); if ((lineBytes.length - eolCharCount) <= 0) { // We've finished reading the http header. break; } } byte [] headerBytes = baos.toByteArray(); // Save off where content body, post content headers, starts. 
this.contentHeadersLength = headerBytes.length; ByteArrayInputStream bais = new ByteArrayInputStream(headerBytes); if (!bais.markSupported()) { throw new IOException("ByteArrayInputStream does not support mark"); } bais.mark(headerBytes.length); // Read the status line. Don't let it into the parseHeaders function. // It doesn't know what to do with it. bais.read(statusBytes, 0, statusBytes.length); this.contentHeaders = LaxHttpParser.parseHeaders(bais, ARCConstants.DEFAULT_ENCODING); bais.reset(); return bais; } public static class UnexpectedStartLineIOException extends RecoverableIOException { private static final long serialVersionUID = 1L; public UnexpectedStartLineIOException(final String reason) { super(reason); } } /** * @param bytes Array of bytes to examine for an EOL. * @return Count of end-of-line characters or zero if none. */ private int getEolCharsCount(byte [] bytes) { int count = 0; if (bytes != null && bytes.length >=1 && bytes[bytes.length - 1] == '\n') { count++; if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { count++; } } return count; } /** * @return If headers are for a http response AND the headers have been * read, return status code. Else return -1. */ public int getStatusCode() { return this.statusCode; } /** * @return Returns length of content headers or -1 if headers have * not yet been read. */ public int getContentHeadersLength() { return this.contentHeadersLength; } public Header[] getContentHeaders() { return contentHeaders; } /** * @return Next character in this ARCRecord's content else -1 if at end of * this record. * @throws IOException */ public int read() throws IOException { int c = -1; if (this.contentHeaderStream != null && (this.contentHeaderStream.available() > 0)) { // If http header, return bytes from it before we go to underlying // stream. c = this.contentHeaderStream.read(); // If done with the header stream, null it out. 
if (this.contentHeaderStream.available() <= 0) { this.contentHeaderStream = null; } // do not increment position - // the underlying ArchiveRecord stream allready did this // incrementPosition(); } else { c = super.read(); } return c; } public int read(byte [] b, int offset, int length) throws IOException { int read = -1; if (this.contentHeaderStream != null && (this.contentHeaderStream.available() > 0)) { // If http header, return bytes from it before we go to underlying // stream. read = Math.min(length, this.contentHeaderStream.available()); if (read == 0) { read = -1; } else { read = this.contentHeaderStream.read(b, offset, read); } // If done with the header stream, null it out. if (this.contentHeaderStream.available() <= 0) { this.contentHeaderStream = null; } // do not increment position - // the underlying ArchiveRecord stream allready did this //incrementPosition(); } else { read = super.read(b, offset, length); } return read; } @Override public int available() { return ((ArchiveRecord)this.in).available(); } @Override public void close() throws IOException { ((ArchiveRecord)this.in).close(); } @Override public void dump() throws IOException { ((ArchiveRecord)this.in).dump(); } @Override public void dump(OutputStream os) throws IOException { ((ArchiveRecord)this.in).dump(os); } @Override protected String getDigest4Cdx(ArchiveRecordHeader h) { return ((ArchiveRecord)this.in).getDigest4Cdx(h); } @Override public String getDigestStr() { return ((ArchiveRecord)this.in).getDigestStr(); } @Override public ArchiveRecordHeader getHeader() { return ((ArchiveRecord)this.in).getHeader(); } @Override protected String getIp4Cdx(ArchiveRecordHeader h) { return ((ArchiveRecord)this.in).getIp4Cdx(h); } @Override protected String getMimetype4Cdx(ArchiveRecordHeader h) { return ((ArchiveRecord)this.in).getMimetype4Cdx(h); } @Override public long getPosition() { return ((ArchiveRecord)this.in).getPosition(); } @Override protected String getStatusCode4Cdx(ArchiveRecordHeader 
h) { return ((ArchiveRecord)this.in).getStatusCode4Cdx(h); } @Override public boolean hasContentHeaders() { return ((ArchiveRecord)this.in).hasContentHeaders(); } @Override protected void incrementPosition() { ((ArchiveRecord)this.in).incrementPosition(); } @Override protected void incrementPosition(long incr) { ((ArchiveRecord)this.in).incrementPosition(incr); } @Override protected boolean isEor() { return ((ArchiveRecord)this.in).isEor(); } @Override public boolean isStrict() { return ((ArchiveRecord)this.in).isStrict(); } @Override public boolean markSupported() { return ((ArchiveRecord)this.in).markSupported(); } @Override protected String outputCdx(String strippedFileName) throws IOException { return ((ArchiveRecord)this.in).outputCdx(strippedFileName); } @Override protected void setEor(boolean eor) { ((ArchiveRecord)this.in).setEor(eor); } @Override protected void setHeader(ArchiveRecordHeader header) { ((ArchiveRecord)this.in).setHeader(header); } @Override public void setStrict(boolean strict) { ((ArchiveRecord)this.in).setStrict(strict); } @Override protected void skip() throws IOException { ((ArchiveRecord)this.in).skip(); } @Override public long skip(long n) throws IOException { return ((ArchiveRecord)this.in).skip(n); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy