com.fasterxml.jackson.dataformat.csv.CsvParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jackson-dataformat-csv Show documentation
Support for reading and writing CSV-encoded data via Jackson abstractions.
The newest version!
package com.fasterxml.jackson.dataformat.csv;

import java.io.*;
import java.math.BigDecimal;
import java.math.BigInteger;

import org.codehaus.jackson.*;
import org.codehaus.jackson.impl.JsonParserMinimalBase;
import org.codehaus.jackson.impl.JsonReadContext;
import org.codehaus.jackson.io.IOContext;
import org.codehaus.jackson.util.BufferRecycler;
import org.codehaus.jackson.util.ByteArrayBuilder;

import com.fasterxml.jackson.dataformat.csv.impl.CsvReader;
import com.fasterxml.jackson.dataformat.csv.impl.TextBuffer;

/**
 * {@link JsonParser} implementation used to expose CSV documents
 * in form that allows other Jackson functionality to deal
 * with it.
 *
 * Implementation is based on a state-machine that pulls information
 * using {@link CsvReader}.
 */
public class CsvParser
    extends JsonParserMinimalBase
{
    /**
     * Enumeration that defines all togglable features for Smile generators.
     */
    public enum Feature {
        /**
         * Feature determines whether spaces around separator characters
         * (commas) are to be automatically trimmed before being reported
         * or not.
         * Note that this does NOT force trimming of possible white space from
         * within double-quoted values, but only those surrounding unquoted
         * values (white space outside of double-quotes is never included regardless
         * of trimming).
         *

         * Default value is false, as per RFC-4180.
         */
        TRIM_SPACES(false),
        
        /**
         * Feature that determines how stream of records (usually CSV lines, but sometimes
         * multiple lines when linefeeds are included in quoted values) is exposed:
         * either as a sequence of Objects (false), or as an array of Objects (true).
         * Using stream of Objects is convenient when using
         * ObjectMapper.readValues(...)
         * and array of Objects convenient when binding to Lists or
         * arrays of values.
         *

         * Default value is false, meaning that by default a CSV document is exposed as
         * a sequence of root-level Object entries.
         */
        WRAP_AS_ARRAY(false)
        ;

        final boolean _defaultState;
        final int _mask;
        
        /**
         * Method that calculates bit set (flags) of all features that
         * are enabled by default.
         */
        public static int collectDefaults()
        {
            int flags = 0;
            for (Feature f : values()) {
                if (f.enabledByDefault()) {
                    flags |= f.getMask();
                }
            }
            return flags;
        }
        
        private Feature(boolean defaultState) {
            _defaultState = defaultState;
            _mask = (1 << ordinal());
        }
        
        public boolean enabledByDefault() { return _defaultState; }
        public int getMask() { return _mask; }
    }

    private final static CsvSchema EMPTY_SCHEMA;
    static {
        EMPTY_SCHEMA = CsvSchema.emptySchema();
    }

    /*
    /**********************************************************************
    /* State constants
    /**********************************************************************
     */

    /**
     * Initial state before anything is read from document.
     */
    protected final static int STATE_DOC_START = 0;
    
    /**
     * State before logical start of a record, in which next
     * token to return will be {@link JsonToken#START_OBJECT}
     * (or if no Schema is provided, {@link JsonToken#START_ARRAY}).
     */
    protected final static int STATE_RECORD_START = 1;

    /**
     * State in which next entry will be available, returning
     * either {@link JsonToken#FIELD_NAME} or value
     * (depending on whether entries are expressed as
     * Objects or just Arrays); or
     * matching close marker.
     */
    protected final static int STATE_NEXT_ENTRY = 2;

    /**
     * State in which value matching field name will
     * be returned.
     */
    protected final static int STATE_NAMED_VALUE = 3;

    /**
     * State in which "unnamed" value (entry in an array)
     * will be returned, if one available; otherwise
     * end-array is returned.
     */
    protected final static int STATE_UNNAMED_VALUE = 4;
    
    /**
     * State in which end marker is returned; either
     * null (if no array wrapping), or
     * {@link JsonToken#END_ARRAY} for wrapping.
     * This step will loop, returning series of nulls
     * if {@link #nextToken} is called multiple times.
     */
    protected final static int STATE_DOC_END = 5;
    
    /*
    /**********************************************************************
    /* Configuration
    /**********************************************************************
     */
    
    /**
     * Codec used for data binding when (if) requested.
     */
    protected ObjectCodec _objectCodec;

    protected int _csvFeatures;
    
    /**
     * Definition of columns being read. Initialized to "empty" instance, which
     * has default configuration settings.
     */
    protected CsvSchema _schema = EMPTY_SCHEMA;

    /**
     * Number of columns defined by schema.
     */
    protected int _columnCount = 0;

    /*
    /**********************************************************************
    /* State
    /**********************************************************************
     */
    
    /**
     * Information about parser context, context in which
     * the next token is to be parsed (root, array, object).
     */
    protected JsonReadContext _parsingContext;

    /**
     * Name of column that we exposed most recently, accessible after
     * {@link JsonToken#FIELD_NAME} as well as value tokens immediately
     * following field name.
     */
    protected String _currentName;

    /**
     * String value for the current column, if accessed.
     */
    protected String _currentValue;
    
    /**
     * Index of the column we are exposing
     */
    protected int _columnIndex;
    
    /**
     * Current logical state of the parser; one of STATE_
     * constants.
     */
    protected int _state = STATE_DOC_START;
    
    /**
     * We will hold on to decoded binary data, for duration of
     * current event, so that multiple calls to
     * {@link #getBinaryValue} will not need to decode data more
     * than once.
     */
    protected byte[] _binaryValue;

    /*
    /**********************************************************************
    /* Helper objects
    /**********************************************************************
     */

    /**
     * Thing that actually reads the CSV content
     */
    protected final CsvReader _reader;

    /**
     * Buffer that contains contents of all values after processing
     * of doubled-quotes, escaped characters.
     */
    protected final TextBuffer _textBuffer;
    
    protected ByteArrayBuilder _byteArrayBuilder;
    
    /*
    /**********************************************************************
    /* Life-cycle
    /**********************************************************************
     */
    
    public CsvParser(IOContext ctxt, BufferRecycler br,
            int parserFeatures, int csvFeatures,
            ObjectCodec codec, Reader reader)
    {
        super(parserFeatures);    
        _objectCodec = codec;
        _textBuffer = new TextBuffer(br);
        _csvFeatures = csvFeatures;
        _parsingContext = JsonReadContext.createRootContext();
        _reader = new CsvReader(this, ctxt, reader, _schema, _textBuffer,
                isEnabled(JsonParser.Feature.AUTO_CLOSE_SOURCE),
                isEnabled(Feature.TRIM_SPACES));
    }

    @Override
    public ObjectCodec getCodec() {
        return _objectCodec;
    }

    @Override
    public void setCodec(ObjectCodec c) {
        _objectCodec = c;
    }
    
    @Override
    public boolean canUseSchema(FormatSchema schema) {
        return (schema instanceof FormatSchema);
    }

    @Override
    public void setSchema(FormatSchema schema)
    {
        if (schema instanceof CsvSchema) {
            _schema = (CsvSchema) schema;
        } else if (schema == null) {
            schema = EMPTY_SCHEMA;
        } else {
            super.setSchema(schema);
        }
        _columnCount = _schema.size();            
        _reader.setSchema(_schema);
    }

    @Override
    public int releaseBuffered(Writer out) throws IOException
    {
        return _reader.releaseBuffered(out);
    }

    @Override
    public boolean isClosed() { return _reader.isClosed(); }

    @Override
    public void close() throws IOException { _reader.close(); }

    /*
    /***************************************************
    /* Public API, configuration
    /***************************************************
     */

    /**
     * Method for enabling specified CSV feature
     * (check {@link Feature} for list of features)
     */
    public JsonParser enable(Feature f)
    {
        _csvFeatures |= f.getMask();
        return this;
    }

    /**
     * Method for disabling specified  CSV feature
     * (check {@link Feature} for list of features)
     */
    public JsonParser disable(Feature f)
    {
        _csvFeatures &= ~f.getMask();
        return this;
    }

    /**
     * Method for enabling or disabling specified CSV feature
     * (check {@link Feature} for list of features)
     */
    public JsonParser configure(Feature f, boolean state)
    {
        if (state) {
            enable(f);
        } else {
            disable(f);
        }
        return this;
    }

    /**
     * Method for checking whether specified CSV {@link Feature}
     * is enabled.
     */
    public boolean isEnabled(Feature f) {
        return (_csvFeatures & f.getMask()) != 0;
    }

    /**
     * Accessor for getting active schema definition: it may be
     * "empty" (no column definitions), but will never be null
     * since it defaults to an empty schema (and default configuration)
     *
     * NOTE: should be part of JsonParser, will be for 2.0.
     */
    public CsvSchema getSchema() {
        return _schema;
    }
    
    /*
    /**********************************************************
    /* Location info
    /**********************************************************
     */
    
    @Override
    public JsonStreamContext getParsingContext() {
        return _parsingContext;
    }

    @Override
    public JsonLocation getTokenLocation() {
        return _reader.getTokenLocation();
    }

    @Override
    public JsonLocation getCurrentLocation() {
        return _reader.getCurrentLocation();
    }

    @Override
    public Object getInputSource() {
        return _reader.getInputSource();
    }
    
    /*
    /**********************************************************
    /* Parsing
    /**********************************************************
     */

    @Override
    public String getCurrentName() throws IOException, JsonParseException
    {
        return _currentName;
    }
    
    @Override
    public JsonToken nextToken() throws IOException, JsonParseException
    {
        _binaryValue = null;
        switch (_state) {
        case STATE_DOC_START:
            return (_currToken = _handleStartDoc());
        case STATE_RECORD_START:
            return (_currToken = _handleRecordStart());
        case STATE_NEXT_ENTRY:
            return (_currToken = _handleNextEntry());
        case STATE_NAMED_VALUE:
            return (_currToken = _handleNamedValue());
        case STATE_UNNAMED_VALUE:
            return (_currToken = _handleUnnamedValue());
        case STATE_DOC_END:
            _reader.close();
            if (_parsingContext.inRoot()) {
                return null;
            }
            // should always be in array, actually... but:
            boolean inArray = _parsingContext.inArray();
            _parsingContext = _parsingContext.getParent();
            return inArray ? JsonToken.END_ARRAY : JsonToken.END_OBJECT;
        default:
            throw new IllegalStateException();
        }
    }

    /*
    /**********************************************************
    /* Parsing, helper methods
    /**********************************************************
     */
    
    /**
     * Method called to handle details of initializing things to return
     * the very first token.
     */
    protected JsonToken _handleStartDoc() throws IOException, JsonParseException
    {
        // First things first: are we expecting header line? If so, read, process
        if (_schema.useHeader()) {
            _readHeaderLine();
        }
        // and if we are to skip the first data line, skip it
        if (_schema.skipFirstDataRow()) {
            _reader.skipLine();
        }
        
        /* Only one real complication, actually; empy documents (zero bytes).
         * Those have no entries. Should be easy enough to detect like so:
         */
        if (!_reader.hasMoreInput()) {
            _state = STATE_DOC_END;
            // but even empty sequence must still be wrapped in logical array
            if (isEnabled(Feature.WRAP_AS_ARRAY)) {
                _parsingContext = _reader.childArrayContext(_parsingContext);
                return JsonToken.START_ARRAY;
            }
            return null;
        }
        
        if (isEnabled(Feature.WRAP_AS_ARRAY)) {
            _parsingContext = _reader.childArrayContext(_parsingContext);
            _state = STATE_RECORD_START;
            return JsonToken.START_ARRAY;
        }
        // otherwise, same as regular new entry...
        return _handleRecordStart();
    }

    protected JsonToken _handleRecordStart() throws IOException, JsonParseException
    {
        _columnIndex = 0;
        if (_columnCount == 0) { // no schema; exposed as an array
            _state = STATE_UNNAMED_VALUE;
            _parsingContext = _reader.childArrayContext(_parsingContext);
            return JsonToken.START_ARRAY;
        }
        // otherwise, exposed as an Object
        _parsingContext = _reader.childObjectContext(_parsingContext);
        _state = STATE_NEXT_ENTRY;
        return JsonToken.START_OBJECT;
    }

    protected JsonToken _handleNextEntry() throws IOException, JsonParseException
    {
        // NOTE: only called when we do have real Schema
        String next = _reader.nextString();

        if (next == null) { // end of record or input...
            _parsingContext = _parsingContext.getParent();
            // let's handle EOF or linefeed
            if (!_reader.startNewLine()) {
                _state = STATE_DOC_END;
            } else {
                // no, just end of record
                _state = STATE_RECORD_START;
            }
            return JsonToken.END_OBJECT;
        }
        _state = STATE_NAMED_VALUE;
        _currentValue = next;
        if (_columnIndex >= _columnCount) {
            _reportError("Too many entries: expected at most "+_columnCount+" (value ("+next.length()+" chars) \""+next+"\")");
        }
        _currentName = _schema.column(_columnIndex).getName();
        return JsonToken.FIELD_NAME;
    }

    protected JsonToken _handleNamedValue() throws IOException, JsonParseException
    {
        _state = STATE_NEXT_ENTRY;
        ++_columnIndex;
        return JsonToken.VALUE_STRING;
    }

    protected JsonToken _handleUnnamedValue() throws IOException, JsonParseException
    {
        String next = _reader.nextString();
        if (next == null) { // end of record or input...
            _parsingContext = _parsingContext.getParent();
            if (!_reader.startNewLine()) { // end of whole thing...
                _state = STATE_DOC_END;
            } else {
                // no, just end of record
                _state = STATE_RECORD_START;
            }
            return JsonToken.END_ARRAY;
        }
        // state remains the same
        _currentValue = next;
        ++_columnIndex;
        return JsonToken.VALUE_STRING;
    }

    /**
     * Method called to process the expected header line
     */
    protected void _readHeaderLine() throws IOException, JsonParseException
    {
        /* Two separate cases:
         * 
         * (a) We already have a Schema with columns; if so, header will be skipped
         * (b) Otherwise, need to find column definitions; empty one is not acceptable
         */

        if (_schema.size() > 0) { // case (a); skip all/any
            while (_reader.nextString() != null) { }
            return;
        }
        // case (b); read all
        String name;
        // base setting on existing schema, but drop columns
        CsvSchema.Builder builder = _schema.rebuild().clearColumns();
        
        while ((name = _reader.nextString()) != null) {
            // one more thing: always trim names, regardless of config settings
            name = name.trim();
            
            // See if "old" schema defined type; if so, use that type...
            CsvSchema.Column prev = _schema.column(name);
            if (prev != null) {
                builder.addColumn(name, prev.getType());
            } else {
                builder.addColumn(name);
            }
        }
        // Ok: did we get any  columns?
        CsvSchema newSchema = builder.build();
        int size = newSchema.size();
        if (size < 2) { // 1 just because we may get 'empty' header name
            String first = (size == 0) ? "" : newSchema.column(0).getName().trim();
            if (first.length() == 0) {
                _reportError("Empty header line: can not bind data");
            }
        }
        // otherwise we will use what we got
        setSchema(builder.build());
    }
    
    /*
    /**********************************************************
    /* String value handling
    /**********************************************************
     */

    // For now we do not store char[] representation...
    @Override
    public boolean hasTextCharacters() {
        return _textBuffer.hasTextAsCharacters();
    }
    
    @Override
    public String getText() throws IOException, JsonParseException {
        return _currentValue;
    }

    @Override
    public char[] getTextCharacters() throws IOException, JsonParseException {
        return _textBuffer.contentsAsArray();
    }

    @Override
    public int getTextLength() throws IOException, JsonParseException {
        return _textBuffer.size();
    }

    @Override
    public int getTextOffset() throws IOException, JsonParseException {
        return 0;
    }
    
    /*
    /**********************************************************************
    /* Binary (base64)
    /**********************************************************************
     */

    @Override
    public byte[] getBinaryValue(Base64Variant variant) throws IOException, JsonParseException
    {
        if (_binaryValue == null) {
            if (_currToken != JsonToken.VALUE_STRING) {
                _reportError("Current token ("+_currToken+") not VALUE_STRING, can not access as binary");
            }
            ByteArrayBuilder builder = _getByteArrayBuilder();
            _decodeBase64(_currentValue, builder, variant);
            _binaryValue = builder.toByteArray();
        }
        return _binaryValue;
    }

    /*
    /**********************************************************************
    /* Number accessors
    /**********************************************************************
     */

    @Override
    public NumberType getNumberType() throws IOException, JsonParseException {
        return _reader.getNumberType();
    }
    
    @Override
    public Number getNumberValue() throws IOException, JsonParseException {
        return _reader.getNumberValue();
    }

    @Override
    public int getIntValue() throws IOException, JsonParseException {
        return _reader.getIntValue();
    }
    
    @Override
    public long getLongValue() throws IOException, JsonParseException {
        return _reader.getLongValue();
    }
    
    @Override
    public BigInteger getBigIntegerValue() throws IOException, JsonParseException {
        return _reader.getBigIntegerValue();
    }
    
    @Override
    public float getFloatValue() throws IOException, JsonParseException {
        return _reader.getFloatValue();
    }
    
    @Override
    public double getDoubleValue() throws IOException, JsonParseException {
        return _reader.getDoubleValue();
    }
    
    @Override
    public BigDecimal getDecimalValue() throws IOException, JsonParseException {
        return _reader.getDecimalValue();
    }
    
    /*
    /**********************************************************************
    /* Helper methods from base class
    /**********************************************************************
     */
    
    @Override
    protected void _handleEOF() throws JsonParseException {
        // I don't think there's problem with EOFs usually; except maybe in quoted stuff?
        _reportInvalidEOF(": expected closing quote character");
    }

    /*
    /**********************************************************************
    /* Helper methods for CsvReader
    /**********************************************************************
     */

    // must be (re)defined to make package-accessible
    public void _reportCsvError(String msg)  throws JsonParseException {
        super._reportError(msg);
    }

    public void _reportUnexpectedCsvChar(int ch, String msg)  throws JsonParseException {
        super._reportUnexpectedChar(ch, msg);
    }
    
    /*
    /**********************************************************************
    /* Internal methods
    /**********************************************************************
     */
    
    public ByteArrayBuilder _getByteArrayBuilder()
    {
        if (_byteArrayBuilder == null) {
            _byteArrayBuilder = new ByteArrayBuilder();
        } else {
            _byteArrayBuilder.reset();
        }
        return _byteArrayBuilder;
    }

}