fr.cenotelie.commons.utils.csv.CsvLexer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of commons-utils Show documentation
Utility APIs for Java projects
The newest version!
/*******************************************************************************
 * Copyright (c) 2016 Association Cénotélie (cenotelie.fr)
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General
 * Public License along with this program.
 * If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/

package fr.cenotelie.commons.utils.csv;

import fr.cenotelie.commons.utils.RewindableTextStream;

import java.io.Reader;
import java.util.Arrays;

/**
 * Represents a lexer for a CSV document
 * <p>
 * Lexing rules are:
 * LineEnding -> '\n' | '\r' | '\r\n'
 * Separator -> what is given in parameter
 * TextMarker -> what is given in parameter
 * Whitespace -> (Unicode IsWhitespace character class) - (Separator | TextMarker | '\n' | '\r')
 * Cell -> (. - Whitespace)*
 * Cell -> TextMarker ( (.-TextMarker) | (TextMarker TextMarker)* ) TextMarker
 * <p>
 * In these rules . represents any character and - represents the language difference operator.
 * The last rule means that a quoted cell can contain any character; a text marker that is part
 * of the content must be doubled.
 * <p>
 * Usage: call {@link #next()} repeatedly and inspect {@link #getTokenType()} after each call,
 * until it reports {@link #TOKEN_EOF} or {@link #TOKEN_ERROR}.
 *
 * @author Laurent Wouters
 */
class CsvLexer {
    /**
     * Represents an error in this lexer
     */
    public static final int TOKEN_ERROR = -1;
    /**
     * Token type of cells' value in a CSV
     */
    public static final int TOKEN_VALUE = 0;
    /**
     * Token type of cells' separator
     */
    public static final int TOKEN_SEPARATOR = 1;
    /**
     * Token type of new line markers
     */
    public static final int TOKEN_NEW_ROW = 2;
    /**
     * Token type for the end of input marker
     */
    public static final int TOKEN_EOF = 3;

    /**
     * Initial size of the buffer used to build the tokens, and minimum amount by which it grows
     */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Whether the whitespace at the beginning of a cell must be kept or removed
     */
    private final boolean keepBeginningWhiteSpace;
    /**
     * The input stream
     */
    private final RewindableTextStream input;
    /**
     * The cell separator character
     */
    private final char separator;
    /**
     * The raw text beginning and end character
     */
    private final char textMarker;
    /**
     * The buffer used to build the tokens; it is reused across tokens and grows on demand
     */
    private char[] builder;
    /**
     * The type of the last matched token
     */
    private int lastTokenType;
    /**
     * The value of the last matched token
     */
    private String lastTokenValue;

    /**
     * Initializes this lexer; the whitespace at the beginning of cells is removed
     *
     * @param input          The input text reader
     * @param valueSeparator The character that separates values in rows
     * @param textMarker     The character that marks the beginning and end of raw text
     */
    public CsvLexer(Reader input, char valueSeparator, char textMarker) {
        this(input, valueSeparator, textMarker, false);
    }

    /**
     * Initializes this lexer, specifying whether the whitespace at the beginning of cells must be kept.
     * Until the first call to {@link #next()}, the token type is {@link #TOKEN_ERROR} and the token value is null.
     *
     * @param input                   The input text reader
     * @param valueSeparator          The character that separates values in rows
     * @param textMarker              The character that marks the beginning and end of raw text
     * @param keepBeginningWhiteSpace Whether the beginning string whitespaces must be kept or removed
     */
    public CsvLexer(Reader input, char valueSeparator, char textMarker, boolean keepBeginningWhiteSpace) {
        this.input = new RewindableTextStream(input);
        this.separator = valueSeparator;
        this.textMarker = textMarker;
        this.builder = new char[BUFFER_SIZE];
        this.lastTokenType = TOKEN_ERROR;
        this.lastTokenValue = null;
        this.keepBeginningWhiteSpace = keepBeginningWhiteSpace;
    }

    /**
     * Gets the type of the last matched token
     *
     * @return The type of the last matched token (one of the TOKEN_* constants)
     */
    public int getTokenType() {
        return lastTokenType;
    }

    /**
     * Gets the value of the last matched token
     *
     * @return The value of the last matched token, or null if the last token is not a cell value
     */
    public String getTokenValue() {
        return lastTokenValue;
    }

    /**
     * Gets the next token in the input.
     * The type of the matched token is then available through {@link #getTokenType()}.
     *
     * @return The text of the cell for a value token; null for any other token
     * (separator, new row, end of input, error)
     */
    public String next() {
        char c = input.read();
        if (input.isAtEnd()) {
            return getTokenEOF();
        }
        if (!keepBeginningWhiteSpace) {
            // skip the whitespace that precedes the token
            while (isWhitespace(c)) {
                c = input.read();
                if (input.isAtEnd()) {
                    return getTokenEOF();
                }
            }
        }

        // Here we are not at the end of the input;
        // c may still be a whitespace when the beginning whitespace is kept
        if (c == separator) {
            return getTokenSeparator();
        }
        if (c == textMarker) {
            return onTextMarkerChar();
        }
        if (c == '\r' || c == '\n') {
            return onLineEndingChar(c);
        }

        // Here we are on normal (unquoted) data: accumulate up to the next separator, line ending or end of input
        int length = append(0, c);
        while (true) {
            c = input.read();
            if (input.isAtEnd()) {
                break;
            }
            if (c == separator || c == '\r' || c == '\n') {
                // this character belongs to the next token, give it back
                input.rewind(1);
                break;
            }
            length = append(length, c);
        }

        // we matched the data
        // Now, trim the trailing white spaces
        while (length > 0 && isWhitespace(builder[length - 1])) {
            length--;
        }

        return getTokenValue(length);
    }

    /**
     * Stores a character in the token buffer, growing the buffer when it is full.
     * The buffer at least doubles on each growth so that building a long cell
     * does not re-copy its content once per fixed-size increment.
     *
     * @param length The number of characters currently stored in the buffer
     * @param c      The character to store at index length
     * @return The new number of characters stored in the buffer
     */
    private int append(int length, char c) {
        if (length >= builder.length) {
            // Math.max falls back to the fixed increment should the doubling overflow
            builder = Arrays.copyOf(builder, Math.max(builder.length + BUFFER_SIZE, builder.length * 2));
        }
        builder[length] = c;
        return length + 1;
    }

    /**
     * Determines whether the given character is a white space that can be skipped.
     * The separator, the text marker and the line ending characters are never skipped,
     * even when they belong to the whitespace character class.
     *
     * @param c The character
     * @return true if the character can be skipped
     */
    private boolean isWhitespace(char c) {
        return !(c == separator || c == textMarker || c == '\n' || c == '\r') && Character.isWhitespace(c);
    }

    /**
     * Lexes the line ending token beginning with the given character
     *
     * @param c The beginning character, either '\n' or '\r'
     * @return The matched token
     */
    private String onLineEndingChar(char c) {
        if (c == '\n') {
            return getTokenNewRow();
        }
        // This was a '\r' character
        // Check for windows line ending style
        char n = input.read();
        if (input.isAtEnd()) {
            return getTokenNewRow();
        }
        if (n != '\n') {
            // a lone '\r': the character just read belongs to the next token
            input.rewind(1);
        }
        return getTokenNewRow();
    }

    /**
     * Lexes the raw text between marks; the opening text marker has already been consumed.
     * A doubled text marker is stored as a single marker in the value.
     *
     * @return The matched token; an error token if the input ends before the closing text marker
     */
    private String onTextMarkerChar() {
        int length = 0;
        while (true) {
            char c = input.read();
            if (input.isAtEnd()) {
                // unterminated quoted text
                return getTokenError();
            }
            if (c != textMarker) {
                length = append(length, c);
            } else {
                // get the following char
                c = input.read();
                if (c == textMarker) {
                    // This is a double marker
                    length = append(length, c);
                } else {
                    // This was the end of the quoted text
                    if (!input.isAtEnd()) {
                        // the character just read belongs to the next token
                        input.rewind(1);
                    }
                    return getTokenValue(length);
                }
            }
        }
    }

    /**
     * Gets an error token
     *
     * @return An error token (always null, the type is recorded as TOKEN_ERROR)
     */
    private String getTokenError() {
        lastTokenType = TOKEN_ERROR;
        lastTokenValue = null;
        return null;
    }

    /**
     * Gets an end of input marker token
     *
     * @return An end of input marker token (always null, the type is recorded as TOKEN_EOF)
     */
    private String getTokenEOF() {
        lastTokenType = TOKEN_EOF;
        lastTokenValue = null;
        return null;
    }

    /**
     * Gets a cell separator token
     *
     * @return A cell separator token (always null, the type is recorded as TOKEN_SEPARATOR)
     */
    private String getTokenSeparator() {
        lastTokenType = TOKEN_SEPARATOR;
        lastTokenValue = null;
        return null;
    }

    /**
     * Gets a new row token
     *
     * @return A new row token (always null, the type is recorded as TOKEN_NEW_ROW)
     */
    private String getTokenNewRow() {
        lastTokenType = TOKEN_NEW_ROW;
        lastTokenValue = null;
        return null;
    }

    /**
     * Gets a token representing a cell's value, built from the first characters of the token buffer
     *
     * @param length Length of the value
     * @return The value of the cell
     */
    private String getTokenValue(int length) {
        lastTokenType = TOKEN_VALUE;
        lastTokenValue = new String(builder, 0, length);
        return lastTokenValue;
    }
}