// org.jwat.common.HeaderLineReader (Maven / Gradle / Ivy)
//
// Go to download
/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.common;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * Advanced header/line reader which can be configured into different modes.
 * The reader can either read normal lines or header lines.
 * Supported encodings are raw, US-ASCII, ISO-8859-1 and UTF-8.
 *
 * Furthermore header lines can employ linear white space (LWS), quoted text
 * and encoded words.
 *
 * After calling the readLine method additional information is available from
 * public fields on the reader.
 *
 * @author nicl
 */
public class HeaderLineReader {

    /*
     * Internal states.
     */

    /** Initial state for reading a normal line. */
    protected static final int S_LINE = 0;
    /** Initial state for reading a header line; reads the name up to the ':'. */
    protected static final int S_NAME = 1;
    /** State for reading a header value. */
    protected static final int S_VALUE = 2;
    /** State entered after a value's line terminator to look for a LWS continuation. */
    protected static final int S_LWS = 3;
    /** State for reading a quoted string. */
    protected static final int S_QUOTED_TEXT = 4;
    /** State for reading the character following a backslash in a quoted string. */
    protected static final int S_QUOTED_PAIR = 5;
    /** State entered after a line terminator inside a quoted string to look for a LWS continuation. */
    protected static final int S_QUOTED_LWS = 6;
    /** State entered after a '=' in a header value to detect the start of an encoded word ("=?"). */
    protected static final int S_ENCODED_WORD_EQ = 7;

    /*
     * 8-bit character characteristics.
     */

    /** Control character characteristic (bit flag stored in charCharacteristicsTab). */
    protected static final int CC_CONTROL = 1;
    /** Separator character characteristic (bit flag stored in charCharacteristicsTab). */
    protected static final int CC_SEPARATOR_WS = 2;

    /** rfc2616 separator characters, including space and horizontal tab. */
    public static final String separatorsWs = "()<>@,;:\\\"/[]?={} \t";

    /** Table of character characteristic flags (CC_* bits), indexed by 8-bit character value. */
    public static final byte[] charCharacteristicsTab = new byte[256];

    /**
     * Populate table of separators.
     */
    static {
        // All control characters below 32, except horizontal tab, are flagged as control characters.
        for (int i=0; i<32; ++i) {
            if (i != '\t') {
                charCharacteristicsTab[i] |= CC_CONTROL;
            }
        }
        // DEL (127) is flagged as a control character too.
        charCharacteristicsTab[127] |= CC_CONTROL;
        // Flag every separator character, space and horizontal tab included.
        for (int i=0; i<separatorsWs.length(); ++i) {
            charCharacteristicsTab[separatorsWs.charAt(i)] |= CC_SEPARATOR_WS;
        }
    }

    /*
     * Configuration.
     *
     * NOTE(review): this section was lost when the file was extracted (the
     * text between the loop condition above and the lineSb comment below was
     * stripped). The names are the ones the rest of this class references;
     * the numeric values, default values, visibility and the UTF8 helper type
     * are reconstructed and must be confirmed against the upstream source.
     */

    /** Raw 8-bit characters: no decoding and no control character check. */
    public static final int ENC_RAW = 0;
    /** US-ASCII encoding: characters above 127 are reported as invalid. */
    public static final int ENC_US_ASCII = 1;
    /** ISO-8859-1 encoding: all 8-bit values are accepted without decoding. */
    public static final int ENC_ISO8859_1 = 2;
    /** UTF-8 encoding: multi-byte sequences are decoded by the UTF-8 helper. */
    public static final int ENC_UTF8 = 3;

    /** Lines are expected to end with LF only. */
    public static final int EOL_LF = 0;
    /** Lines are expected to end with CR LF. */
    public static final int EOL_CRLF = 1;

    /** True to read name:value header lines, false to read normal lines. */
    public boolean bNameValue;
    /** Expected character encoding, one of the ENC_* constants. TODO confirm default. */
    public int encoding = ENC_RAW;
    /** Expected end of line marker, one of the EOL_* constants. TODO confirm default. */
    public int eol = EOL_CRLF;
    /** True if header values may be continued on the next line using LWS. */
    public boolean bLWS;
    /** True if quoted text is recognized in header values. */
    public boolean bQuotedText;
    /** True if encoded words ("=?...") are recognized in header values. */
    public boolean bEncodedWords;

    /** UTF-8 decoder helper used by decode(). TODO confirm type name. */
    protected final UTF8 utf8 = new UTF8();

    /** Reusable StringBuffer for lines. */
    protected final StringBuffer lineSb = new StringBuffer();
    /** Reusable StringBuffer for name/value strings. */
    protected final StringBuffer nvSb = new StringBuffer();
    /** Stream used to record the raw characters read by the parser. */
    protected ByteArrayOutputStreamWithUnread bytesOut = new ByteArrayOutputStreamWithUnread();

    /*
     * Error reporting.
     */

    /** Bit denoting unexpected EOF. */
    public static final int E_BIT_EOF = 1 << 0;
    /** Bit denoting a misplaced CR, i.e. a CR followed by something other than LF. */
    public static final int E_BIT_MISPLACED_CR = 1 << 1;
    /** Bit denoting a missing CR. */
    public static final int E_BIT_MISSING_CR = 1 << 2;
    /** Bit denoting an unexpected CR. */
    public static final int E_BIT_UNEXPECTED_CR = 1 << 3;
    /** Bit denoting an invalid UTF-8 encoded character. */
    public static final int E_BIT_INVALID_UTF8_ENCODING = 1 << 4;
    /** Bit denoting an invalid US-ASCII character. */
    public static final int E_BIT_INVALID_US_ASCII_CHAR = 1 << 5;
    /** Bit denoting an invalid control character. */
    public static final int E_BIT_INVALID_CONTROL_CHAR = 1 << 6;
    /** Bit denoting an invalid separator character in a header name. */
    public static final int E_BIT_INVALID_SEPARATOR_CHAR = 1 << 7;
    /** Bit denoting a missing quote character. */
    public static final int E_BIT_MISSING_QUOTE = 1 << 8;
    /** Bit denoting a missing quoted pair character. */
    public static final int E_BIT_MISSING_QUOTED_PAIR_CHAR = 1 << 9;
    /** Bit denoting an invalid quoted pair character. */
    public static final int E_BIT_INVALID_QUOTED_PAIR_CHAR = 1 << 10;
    /** Bit denoting an invalid charset in an encoded word. */
    public static final int E_BIT_INVALID_CHARSET = 1 << 11;

    /*
     * Internal state.
     */

    /** True if the previous character was a CR. */
    protected boolean bCr = false;

    /** Set by the decode method to indicate whether the decoded character is valid. */
    protected boolean bValidChar;

    /*
     * Exposed state.
     */

    /** Set by readLine: true when no raw bytes were recorded for the returned line. */
    public boolean bEof;

    /** Bit field (E_BIT_* flags) of errors encountered by the latest readLine call. */
    public int bfErrors;

    /**
     * Prohibit public construction.
     */
    protected HeaderLineReader() {
        // Not public: instances are handed out by the static factory methods.
        super();
    }

    /**
     * Returns a reader with default configuration values.
     * @return a reader with default configuration values
     */
    public static HeaderLineReader getReader() {
        // No configuration is applied; all fields keep their declared defaults.
        final HeaderLineReader reader = new HeaderLineReader();
        return reader;
    }

    /**
     * Returns a reader initialized to read normal lines.
     * Normal lines being lines with no LWS or key:value headers.
     * The reader is configured to expect US-ASCII characters.
     * @return a reader to read normal lines
     */
    public static HeaderLineReader getLineReader() {
        final HeaderLineReader reader = new HeaderLineReader();
        // Plain lines only, restricted to US-ASCII characters.
        reader.encoding = ENC_US_ASCII;
        reader.bNameValue = false;
        return reader;
    }

    /**
     * Returns a reader initialized to read header lines.
     * The reader is configured to expect ISO-8859-1 encoding, LWS,
     * quoted text and encoded words. Besides reading key:value headers it will
     * also read and return normal lines as defined in the method above.
     * @return a reader to read header lines
     */
    public static HeaderLineReader getHeaderLineReader() {
        final HeaderLineReader reader = new HeaderLineReader();
        // Header syntax features: LWS continuation, quoted text and encoded words.
        reader.bEncodedWords = true;
        reader.bQuotedText = true;
        reader.bLWS = true;
        // name:value parsing with ISO-8859-1 characters; eol is not changed here.
        reader.encoding = ENC_ISO8859_1;
        reader.bNameValue = true;
        return reader;
    }

    /**
     * Reads a header/line according to the configuration.
     * After calling the readLine method additional information is available
     * from public fields on the reader.
     * @param in InputStream with characters
     * @return result wrapped in a HeaderLine object
     * @throws IOException if an i/o error occurs in the underlying input stream
     */
    public HeaderLine readLine(PushbackInputStream in) throws IOException {
        HeaderLine headerLine = new HeaderLine();
        // Choose the start state from the configuration: plain line or header name.
        int state;
        if (!bNameValue) {
            state = S_LINE;
        } else {
            state = S_NAME;
        }
        // Reset the reusable buffers, the raw byte recorder and the error/CR state.
        lineSb.setLength(0);
        nvSb.setLength(0);
        bytesOut = new ByteArrayOutputStreamWithUnread();
        bfErrors = 0;
        int c;
        bCr = false;
        boolean bLoop = true;
        // State machine: one byte is read per iteration and dispatched on the current state.
        while (bLoop) {
            c = in.read();
            if (c != -1) {
                // Record the raw byte; the recorded bytes end up in headerLine.raw.
                bytesOut.write(c);
            }
            switch (state) {
            // Plain line: collect characters until LF or EOF.
            case S_LINE:
                switch (c) {
                case -1:
                    // EOF.
                    bfErrors |= E_BIT_EOF;
                    headerLine.type = HeaderLine.HLT_LINE;
                    headerLine.line = lineSb.toString();
                    lineSb.setLength(0);
                    bLoop = false;
                    break;
                case '\r':
                    // Remember the CR; whether it is valid is decided by the next character.
                    bCr = true;
                    break;
                case '\n':
                    headerLine.type = HeaderLine.HLT_LINE;
                    headerLine.line = lineSb.toString();
                    lineSb.setLength(0);
                    // Check EOL.
                    check_eol();
                    bLoop = false;
                    break;
                default:
                    if (bCr) {
                        // Misplaced CR.
                        bfErrors |= E_BIT_MISPLACED_CR;
                        bCr = false;
                    }
                    // Decode character.
                    c = decode(c, in);
                    if (c == -1) {
                        // EOF.
                        bfErrors |= E_BIT_EOF;
                        headerLine.type = HeaderLine.HLT_LINE;
                        headerLine.line = lineSb.toString();
                        lineSb.setLength(0);
                        bLoop = false;
                    } else {
                        // Control characters are only rejected when the encoding is not raw.
                        if (bValidChar && encoding != ENC_RAW) {
                            if (c < 256 && ((charCharacteristicsTab[c] & CC_CONTROL) == CC_CONTROL)) {
                                bValidChar = false;
                                // Invalid control char
                                bfErrors |= E_BIT_INVALID_CONTROL_CHAR;
                            }
                        }
                        // Invalid characters are reported in bfErrors and left out of the line.
                        if (bValidChar) {
                            lineSb.append((char) c);
                        }
                    }
                    break;
                }
                break;
            // Header name: lineSb collects the text as a plain line (returned if no ':'
            // is found before LF/EOF), nvSb collects the characters valid in a name.
            case S_NAME:
                switch (c) {
                case -1:
                    // EOF.
                    bfErrors |= E_BIT_EOF;
                    headerLine.type = HeaderLine.HLT_LINE;
                    headerLine.line = lineSb.toString();
                    lineSb.setLength(0);
                    nvSb.setLength(0);
                    bLoop = false;
                    break;
                case '\r':
                    bCr = true;
                    break;
                case '\n':
                    // Line ended before any ':' was seen: return it as a normal line.
                    headerLine.type = HeaderLine.HLT_LINE;
                    headerLine.line = lineSb.toString();
                    lineSb.setLength(0);
                    nvSb.setLength(0);
                    // Check EOL.
                    check_eol();
                    bLoop = false;
                    break;
                case ':':
                    // End of name: this is a header line, continue with the value.
                    headerLine.type = HeaderLine.HLT_HEADERLINE;
                    headerLine.name = nvSb.toString();
                    lineSb.setLength(0);
                    nvSb.setLength(0);
                    if (bCr) {
                        // Misplaced CR.
                        bfErrors |= E_BIT_MISPLACED_CR;
                        bCr = false;
                    }
                    state = S_VALUE;
                    break;
                default:
                    if (bCr) {
                        // Misplaced CR.
                        bfErrors |= E_BIT_MISPLACED_CR;
                        bCr = false;
                    }
                    // Decode character.
                    c = decode(c, in);
                    if (c == -1) {
                        // EOF.
                        bfErrors |= E_BIT_EOF;
                        headerLine.type = HeaderLine.HLT_LINE;
                        headerLine.line = lineSb.toString();
                        lineSb.setLength(0);
                        nvSb.setLength(0);
                        bLoop = false;
                    } else {
                        if (bValidChar && encoding != ENC_RAW) {
                            if (c < 256 && ((charCharacteristicsTab[c] & CC_CONTROL) == CC_CONTROL)) {
                                bValidChar = false;
                                // Invalid control char
                                bfErrors |= E_BIT_INVALID_CONTROL_CHAR;
                            }
                        }
                        if (bValidChar) {
                            // Separators are kept in the plain line text but not in the name.
                            lineSb.append((char) c);
                            if (c < 256 && ((charCharacteristicsTab[c] & CC_SEPARATOR_WS) == CC_SEPARATOR_WS)) {
                                bValidChar = false;
                                // Invalid separator in name
                                bfErrors |= E_BIT_INVALID_SEPARATOR_CHAR;
                            }
                        }
                        if (bValidChar) {
                            nvSb.append((char) c);
                        }
                    }
                    break;
                }
                break;
            // Header value: collect characters in nvSb until LF or EOF.
            case S_VALUE:
                switch (c) {
                case -1:
                    // EOF.
                    bfErrors |= E_BIT_EOF;
                    headerLine.value = trim(nvSb);
                    nvSb.setLength(0);
                    bLoop = false;
                    break;
                case '\r':
                    bCr = true;
                    break;
                case '\n':
                    // Check EOL.
                    check_eol();
                    if (bLWS) {
                        // The value may continue on the next line; decide in S_LWS.
                        state = S_LWS;
                    } else {
                        headerLine.value = trim(nvSb);
                        nvSb.setLength(0);
                        bLoop = false;
                    }
                    break;
                default:
                    if (bCr) {
                        // Misplaced CR.
                        bfErrors |= E_BIT_MISPLACED_CR;
                        bCr = false;
                    }
                    // Decode character.
                    c = decode(c, in);
                    if (c == -1) {
                        // EOF.
                        bfErrors |= E_BIT_EOF;
                        headerLine.value = trim(nvSb);
                        nvSb.setLength(0);
                        bLoop = false;
                    } else {
                        if (bValidChar && encoding != ENC_RAW) {
                            if (c < 256 && ((charCharacteristicsTab[c] & CC_CONTROL) == CC_CONTROL)) {
                                bValidChar = false;
                                // Invalid control char
                                bfErrors |= E_BIT_INVALID_CONTROL_CHAR;
                            }
                        }
                        if (bValidChar) {
                            switch (c) {
                            case '\"':
                                // The quote itself is kept in the value.
                                nvSb.append((char)c);
                                if (bQuotedText) {
                                    state = S_QUOTED_TEXT;
                                }
                                break;
                            case '=':
                                if (bEncodedWords) {
                                    // Hold back the '=' until the next byte shows whether this is "=?".
                                    state = S_ENCODED_WORD_EQ;
                                } else {
                                    nvSb.append((char)c);
                                }
                                break;
                            default:
                                nvSb.append((char)c);
                                break;
                            }
                        }
                    }
                    break;
                }
                break;
            // First byte after a value's line terminator: space/tab means continuation.
            case S_LWS:
                switch (c) {
                case -1:
                    // EOF directly after a line terminator; the EOF error bit is not set here.
                    //bfErrors |= E_BIT_EOF;
                    headerLine.value = trim(nvSb);
                    nvSb.setLength(0);
                    bLoop = false;
                    break;
                case ' ':
                case '\t':
                    // Continuation line: append a single space and keep reading the value.
                    nvSb.append(" ");
                    state = S_VALUE;
                    break;
                default:
                    // Not a continuation: push the byte back onto the stream, drop it
                    // from the raw recording and finish the header line.
                    in.unread(c);
                    bytesOut.unread(c);
                    headerLine.value = trim(nvSb);
                    nvSb.setLength(0);
                    bLoop = false;
                    break;
                }
                break;
            // Inside a quoted string of a header value.
            case S_QUOTED_TEXT:
                switch (c) {
                case -1:
                    // EOF.
                    bfErrors |= E_BIT_MISSING_QUOTE | E_BIT_EOF;
                    headerLine.value = trim(nvSb);
                    nvSb.setLength(0);
                    bLoop = false;
                    break;
                case '\"':
                    if (bCr) {
                        // Misplaced CR.
                        bfErrors |= E_BIT_MISPLACED_CR;
                        bCr = false;
                    }
                    // Closing quote: keep it and return to normal value parsing.
                    nvSb.append((char)c);
                    state = S_VALUE;
                    break;
                case '\\':
                    if (bCr) {
                        // Misplaced CR.
                        bfErrors |= E_BIT_MISPLACED_CR;
                        bCr = false;
                    }
                    // The backslash is appended in S_QUOTED_PAIR together with the next character.
                    state = S_QUOTED_PAIR;
                    break;
                case '\r':
                    bCr = true;
                    break;
                case '\n':
                    // Check EOL.
                    check_eol();
                    if (bLWS) {
                        state = S_QUOTED_LWS;
                    } else {
                        headerLine.value = trim(nvSb);
                        nvSb.setLength(0);
                        bLoop = false;
                    }
                    break;
                default:
                    if (bCr) {
                        // Misplaced CR.
                        bfErrors |= E_BIT_MISPLACED_CR;
                        bCr = false;
                    }
                    // Decode character.
                    c = decode(c, in);
                    if (c == -1) {
                        // EOF.
                        bfErrors |= E_BIT_MISSING_QUOTE | E_BIT_EOF;
                        headerLine.value = trim(nvSb);
                        nvSb.setLength(0);
                        bLoop = false;
                    } else {
                        if (bValidChar && encoding != ENC_RAW) {
                            if (c < 256 && ((charCharacteristicsTab[c] & CC_CONTROL) == CC_CONTROL)) {
                                bValidChar = false;
                                // Invalid control char
                                bfErrors |= E_BIT_INVALID_CONTROL_CHAR;
                            }
                        }
                        if (bValidChar) {
                            nvSb.append((char)c);
                        }
                    }
                    break;
                }
                break;
            // Character following a backslash inside a quoted string.
            case S_QUOTED_PAIR:
                switch (c) {
                case -1:
                    // Keep the dangling backslash in the value.
                    nvSb.append('\\');
                    // EOF.
                    bfErrors |= E_BIT_MISSING_QUOTED_PAIR_CHAR | E_BIT_MISSING_QUOTE | E_BIT_EOF;
                    headerLine.value = trim(nvSb);
                    nvSb.setLength(0);
                    bLoop = false;
                    break;
                default:
                    // Decode character.
                    c = decode(c, in);
                    if (c == -1) {
                        // EOF.
                        bfErrors |= E_BIT_MISSING_QUOTED_PAIR_CHAR | E_BIT_MISSING_QUOTE | E_BIT_EOF;
                        headerLine.value = trim(nvSb);
                        nvSb.setLength(0);
                        bLoop = false;
                    } else {
                        // Backslash and character are both kept, even if the character is invalid.
                        nvSb.append('\\');
                        nvSb.append((char)c);
                        if (!bValidChar) {
                            bfErrors |= E_BIT_INVALID_QUOTED_PAIR_CHAR;
                        }
                        state = S_QUOTED_TEXT;
                    }
                    break;
                }
                break;
            // First byte after a line terminator inside a quoted string.
            case S_QUOTED_LWS:
                switch (c) {
                case -1:
                    // EOF.
                    bfErrors |= E_BIT_MISSING_QUOTE;
                    headerLine.value = trim(nvSb);
                    nvSb.setLength(0);
                    bLoop = false;
                    break;
                case ' ':
                case '\t':
                    // Continuation line: append a single space and stay in the quoted string.
                    nvSb.append(" ");
                    state = S_QUOTED_TEXT;
                    break;
                default:
                    // Not a continuation: the quoted string was never closed. Push the
                    // byte back onto the stream and finish the header line.
                    in.unread(c);
                    bytesOut.unread(c);
                    bfErrors |= E_BIT_MISSING_QUOTE;
                    headerLine.value = trim(nvSb);
                    nvSb.setLength(0);
                    bLoop = false;
                    break;
                }
                break;
            // A '=' was seen in the value; '?' next means a possible encoded word.
            case S_ENCODED_WORD_EQ:
                switch (c) {
                case -1:
                    // Emit the '=' that was held back.
                    nvSb.append('=');
                    // EOF.
                    bfErrors |= E_BIT_EOF;
                    headerLine.value = trim(nvSb);
                    nvSb.setLength(0);
                    bLoop = false;
                    break;
                case '?':
                    //  Unread "=?", so it can be parsed as an EncodedWord which always starts with "=?"
                    in.unread('?');
                    in.unread('=');
                    bytesOut.unread('?');
                    bytesOut.unread('=');
                    EncodedWords ew = EncodedWords.parseEncodedWords(in, true);
                    /*
                    if (!ew.bIsValid) {
                        // TODO Decide whether to report encoded word errors or interpret as non encoded words.
                    }
                    */
                    if (!ew.bValidCharset) {
                        // In case of invalid charset errors, try to report.
                        bfErrors |= E_BIT_INVALID_CHARSET;
                        // Possible message : "Invalid charset : " + ew.charsetStr;
                    }
                    // Only the leading "=?" is consumed here. Everything after it in
                    // ew.line (presumably the raw bytes the encoded word parser read,
                    // TODO confirm) is pushed back and re-read as ordinary value characters.
                    nvSb.append("=?");
                    in.unread(ew.line, 2, ew.line.length - 2);
                    bytesOut.write("=?".getBytes());
                    state = S_VALUE;
                    break;
                default:
                    // Not an encoded word: emit the held back '=' and reprocess this byte as value.
                    nvSb.append('=');
                    in.unread(c);
                    bytesOut.unread(c);
                    state = S_VALUE;
                    break;
                }
                break;
            }
        }
        headerLine.raw = bytesOut.toByteArray();
        headerLine.bfErrors = bfErrors;
        // EOF is signalled when no raw bytes were recorded for this line.
        bEof = (headerLine.raw.length == 0);
        return headerLine;
    }

    /**
     * Decode a character according to the expected encoding.
     * @param c first character of the possibly encoded character sequence
     * @param in InputStream with possible extra encoded characters.
     * @return decoded character
     * @throws IOException if an i/o error occurs in the underlying input stream
     */
    protected int decode(int c, InputStream in) throws IOException {
        if (encoding == ENC_UTF8) {
            // The helper may read further bytes from the stream; they are added
            // to the raw byte recording.
            c = utf8.readUtf8(c, in);
            bytesOut.write(utf8.chars_read);
            bValidChar = utf8.bValidChar;
            if (c != -1 && !bValidChar) {
                // Invalid UTF-8 char
                bfErrors |= E_BIT_INVALID_UTF8_ENCODING;
            }
        } else if (encoding == ENC_US_ASCII) {
            if (c > 127) {
                // Invalid US-ASCII char
                bValidChar = false;
                bfErrors |= E_BIT_INVALID_US_ASCII_CHAR;
            } else {
                bValidChar = true;
            }
        } else {
            // ISO-8859-1 uses all 8 bits and raw characters are taken as is,
            // so neither needs decoding; any other encoding value is treated the same.
            bValidChar = true;
        }
        return c;
    }

    /**
     * Check and report whether the line ended as expected.
     */
    protected void check_eol() {
        // Called when a LF has been read; bCr tells whether a CR came directly before it.
        switch (eol) {
        case EOL_LF:
            // Bare LF expected, so a preceding CR is the error. The original
            // condition was inverted and flagged lines that had no CR.
            if (bCr) {
                // Unexpected CR.
                bfErrors |= E_BIT_UNEXPECTED_CR;
            }
            break;
        case EOL_CRLF:
            // CR LF expected, so a LF without a preceding CR is the error.
            if (!bCr) {
                // Missing CR.
                bfErrors |= E_BIT_MISSING_CR;
            }
            break;
        default:
            // Unknown EOL setting: nothing to check.
            break;
        }
        // The CR, if any, has been accounted for.
        bCr = false;
    }

    /**
     * Trims the whitespace characters found in the beginning and end of a
     * string. Differs from the String method in that it leaves control
     * characters.
     * @param sb StringBuffer to be trimmed
     * @return trimmed string
     */
    public static String trim(StringBuffer sb) {
        // Only the space character (' ') is stripped; tabs and control
        // characters at either end are kept, unlike String.trim().
        final int length = sb.length();
        int start = 0;
        for (; start < length; ++start) {
            if (sb.charAt(start) != ' ') {
                break;
            }
        }
        int end = length;
        for (; end > start; --end) {
            if (sb.charAt(end - 1) != ' ') {
                break;
            }
        }
        return sb.substring(start, end);
    }

    /**
     * Report bit field errors as diagnoses.
     * @param bfErrors bit field with indicated errors
     * @param diagnostics diagnostics object used to report diagnoses
     */
    public static void report_error(int bfErrors, Diagnostics diagnostics) {
        if (diagnostics == null) {
            throw new IllegalArgumentException("'diagnostics' argument is null");
        }
        // Error bits and their messages, in the order they are reported.
        final int[] errorBits = {
                E_BIT_EOF,
                E_BIT_MISPLACED_CR,
                E_BIT_MISSING_CR,
                E_BIT_UNEXPECTED_CR,
                E_BIT_INVALID_UTF8_ENCODING,
                E_BIT_INVALID_US_ASCII_CHAR,
                E_BIT_INVALID_CONTROL_CHAR,
                E_BIT_INVALID_SEPARATOR_CHAR,
                E_BIT_MISSING_QUOTE,
                E_BIT_MISSING_QUOTED_PAIR_CHAR,
                E_BIT_INVALID_QUOTED_PAIR_CHAR,
                E_BIT_INVALID_CHARSET
        };
        final String[] errorMessages = {
                "Unexpected EOF",
                "Misplaced CR",
                "Missing CR",
                "Unexpected CR",
                "Invalid UTF-8 encoded character",
                "Invalid US-ASCII character",
                "Invalid control character",
                "Invalid separator character",
                "Missing quote character",
                "Missing quoted pair character",
                "Invalid quoted pair character",
                "Invalid charset"
        };
        // One error diagnosis is added per bit set in the bit field.
        for (int idx = 0; idx < errorBits.length; ++idx) {
            if ((bfErrors & errorBits[idx]) != 0) {
                diagnostics.addError(new Diagnosis(DiagnosisType.ERROR, "header/line", errorMessages[idx]));
            }
        }
    }

}