All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jwat.common.NewlineParser Maven / Gradle / Ivy

There is a newer version: 1.2.1
Show newest version
/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.common;

import java.io.IOException;

/**
 * A small class to detect and count occurrences of either LF or CRLF pairs in
 * a push back input stream, one method each. Implemented as an object instance
 * so the caller can access the status information.
 *
 * A misplaced CR or LF is reported when the character is in the wrong order or
 * simply not expected. For example, when looking for CRLF pairs, an LFCR pair
 * has both CR and LF misplaced. When looking for LFs, a CRLF pair has a
 * misplaced CR.
 *
 * @author nicl
 */
public class NewlineParser {

    /** Did the reader detect a missing CR while parsing newlines. */
    public boolean bMissingCr = false;

    /** Did the reader detect a missing LF while parsing newlines. */
    public boolean bMissingLf = false;

    /** Did the reader detect a misplaced CR while parsing newlines. */
    public boolean bMisplacedCr = false;

    /** Did the reader detect a misplaced LF while parsing newlines. */
    public boolean bMisplacedLf = false;

    /**
     * Looks forward in the input stream and counts the number of LF newlines
     * found. Non-newline characters are pushed back onto the input stream.
     * <p>
     * All four status flags are reset on entry. Deviations from a pure
     * sequence of LFs are still counted as one newline each, but flagged:
     * <ul>
     * <li>CR not followed by LF: misplaced CR and missing LF.</li>
     * <li>CRLF: misplaced CR.</li>
     * <li>LFCR: misplaced CR (the CR is consumed).</li>
     * </ul>
     * @param in data input stream
     * @param diagnostics object used to report errors and/or warnings,
     *        may be <code>null</code> in which case nothing is reported
     * @return newlines found in input stream
     * @throws IOException if an error occurs while reading data
     */
    public int parseLFs(ByteCountingPushBackInputStream in, Diagnostics diagnostics) throws IOException {
        resetFlags();
        int newlines = 0;
        byte[] buffer = new byte[2];
        boolean bLoop = true;
        while (bLoop) {
            int read = in.read(buffer);
            // Only 1 or 2 bytes read are inspected; any other result
            // (presumably -1 at end of stream) ends the scan.
            switch (read) {
            case 1:
                if (buffer[0] == '\n') {
                    ++newlines;
                } else if (buffer[0] == '\r') {
                    ++newlines;
                    bMissingLf = true;
                    bMisplacedCr = true;
                } else {
                    in.unread(buffer[0]);
                    bLoop = false;
                }
                break;
            case 2:
                if (buffer[0] == '\n') {
                    if (buffer[1] == '\n') {
                        newlines += 2;
                    } else if (buffer[1] != '\r') {
                        // Second byte is not a newline character, leave it
                        // for the next iteration which will stop the loop.
                        ++newlines;
                        in.unread(buffer[1]);
                    } else {
                        // LFCR: the trailing CR is consumed with the LF.
                        ++newlines;
                        bMisplacedCr = true;
                    }
                } else if (buffer[0] == '\r') {
                    if (buffer[1] == '\n') {
                        // CRLF: the LF is what we are looking for, so only
                        // the CR is out of place.
                        ++newlines;
                        bMisplacedCr = true;
                    } else {
                        // CR followed by something else; the second byte
                        // is re-examined in the next iteration.
                        ++newlines;
                        bMisplacedCr = true;
                        bMissingLf = true;
                        in.unread(buffer[1]);
                    }
                } else {
                    in.unread(buffer);
                    bLoop = false;
                }
                break;
            default:
                bLoop = false;
                break;
            }
        }
        reportFlags(diagnostics, "Sequence of LFs");
        return newlines;
    }

    /**
     * Looks forward in the input stream and counts the number of CRLF
     * newlines found. Non-newline characters are pushed back onto the input
     * stream.
     * <p>
     * All four status flags are reset on entry. Deviations from a pure
     * sequence of CRLF pairs are still counted as one newline each, but
     * flagged:
     * <ul>
     * <li>CR not followed by LF: missing LF.</li>
     * <li>LF not preceded by CR and not followed by CR: missing CR.</li>
     * <li>LFCR: misplaced CR and misplaced LF.</li>
     * </ul>
     * @param in data input stream
     * @param diagnostics object used to report errors and/or warnings,
     *        may be <code>null</code> in which case nothing is reported
     * @return newlines found in input stream
     * @throws IOException if an error occurs while reading data
     */
    public int parseCRLFs(ByteCountingPushBackInputStream in, Diagnostics diagnostics) throws IOException {
        resetFlags();
        int newlines = 0;
        byte[] buffer = new byte[2];
        boolean bLoop = true;
        while (bLoop) {
            int read = in.read(buffer);
            // Only 1 or 2 bytes read are inspected; any other result
            // (presumably -1 at end of stream) ends the scan.
            switch (read) {
            case 1:
                if (buffer[0] == '\n') {
                    ++newlines;
                    bMissingCr = true;
                } else if (buffer[0] == '\r') {
                    ++newlines;
                    bMissingLf = true;
                } else {
                    in.unread(buffer[0]);
                    bLoop = false;
                }
                break;
            case 2:
                if (buffer[0] == '\r') {
                    if (buffer[1] == '\n') {
                        ++newlines;
                    } else {
                        // CR followed by something else; the second byte
                        // is re-examined in the next iteration.
                        ++newlines;
                        bMissingLf = true;
                        in.unread(buffer[1]);
                    }
                } else if (buffer[0] == '\n') {
                    if (buffer[1] == '\r') {
                        // LFCR: both characters present but swapped.
                        ++newlines;
                        bMisplacedCr = true;
                        bMisplacedLf = true;
                    } else {
                        // LF on its own; the second byte is re-examined
                        // in the next iteration.
                        ++newlines;
                        bMissingCr = true;
                        in.unread(buffer[1]);
                    }
                } else {
                    in.unread(buffer);
                    bLoop = false;
                }
                break;
            default:
                bLoop = false;
                break;
            }
        }
        reportFlags(diagnostics, "Sequence of CRLFs");
        return newlines;
    }

    /** Clears all four status flags before a new parse. */
    private void resetFlags() {
        bMissingCr = false;
        bMissingLf = false;
        bMisplacedCr = false;
        bMisplacedLf = false;
    }

    /**
     * Adds one warning per raised status flag to the diagnostics object.
     * @param diagnostics object used to report warnings, ignored if <code>null</code>
     * @param expected description of the expected newline sequence
     */
    private void reportFlags(Diagnostics diagnostics, String expected) {
        if (diagnostics == null) {
            return;
        }
        if (bMissingCr) {
            diagnostics.addWarning(new Diagnosis(DiagnosisType.ERROR_EXPECTED,
                    "Missing CR", expected));
        }
        if (bMissingLf) {
            diagnostics.addWarning(new Diagnosis(DiagnosisType.ERROR_EXPECTED,
                    "Missing LF", expected));
        }
        if (bMisplacedCr) {
            diagnostics.addWarning(new Diagnosis(DiagnosisType.ERROR_EXPECTED,
                    "Misplaced CR", expected));
        }
        if (bMisplacedLf) {
            diagnostics.addWarning(new Diagnosis(DiagnosisType.ERROR_EXPECTED,
                    "Misplaced LF", expected));
        }
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy