All downloads are free. Search and download functionality uses the official Maven repository.

com.fasterxml.jackson.dataformat.csv.CsvParser Maven / Gradle / Ivy

The newest version!
package com.fasterxml.jackson.dataformat.csv;

import java.io.*;
import java.math.BigDecimal;
import java.math.BigInteger;

import org.codehaus.jackson.*;
import org.codehaus.jackson.impl.JsonParserMinimalBase;
import org.codehaus.jackson.impl.JsonReadContext;
import org.codehaus.jackson.io.IOContext;
import org.codehaus.jackson.util.BufferRecycler;
import org.codehaus.jackson.util.ByteArrayBuilder;

import com.fasterxml.jackson.dataformat.csv.impl.CsvReader;
import com.fasterxml.jackson.dataformat.csv.impl.TextBuffer;

/**
 * {@link JsonParser} implementation used to expose CSV documents
 * in form that allows other Jackson functionality to deal
 * with it.
 *

* Implementation is based on a state-machine that pulls information * using {@link CsvReader}. */ public class CsvParser extends JsonParserMinimalBase { /** * Enumeration that defines all togglable features for Smile generators. */ public enum Feature { /** * Feature determines whether spaces around separator characters * (commas) are to be automatically trimmed before being reported * or not. * Note that this does NOT force trimming of possible white space from * within double-quoted values, but only those surrounding unquoted * values (white space outside of double-quotes is never included regardless * of trimming). *

* Default value is false, as per RFC-4180. */ TRIM_SPACES(false), /** * Feature that determines how stream of records (usually CSV lines, but sometimes * multiple lines when linefeeds are included in quoted values) is exposed: * either as a sequence of Objects (false), or as an array of Objects (true). * Using stream of Objects is convenient when using * ObjectMapper.readValues(...) * and array of Objects convenient when binding to Lists or * arrays of values. *

* Default value is false, meaning that by default a CSV document is exposed as * a sequence of root-level Object entries. */ WRAP_AS_ARRAY(false) ; final boolean _defaultState; final int _mask; /** * Method that calculates bit set (flags) of all features that * are enabled by default. */ public static int collectDefaults() { int flags = 0; for (Feature f : values()) { if (f.enabledByDefault()) { flags |= f.getMask(); } } return flags; } private Feature(boolean defaultState) { _defaultState = defaultState; _mask = (1 << ordinal()); } public boolean enabledByDefault() { return _defaultState; } public int getMask() { return _mask; } } private final static CsvSchema EMPTY_SCHEMA; static { EMPTY_SCHEMA = CsvSchema.emptySchema(); } /* /********************************************************************** /* State constants /********************************************************************** */ /** * Initial state before anything is read from document. */ protected final static int STATE_DOC_START = 0; /** * State before logical start of a record, in which next * token to return will be {@link JsonToken#START_OBJECT} * (or if no Schema is provided, {@link JsonToken#START_ARRAY}). */ protected final static int STATE_RECORD_START = 1; /** * State in which next entry will be available, returning * either {@link JsonToken#FIELD_NAME} or value * (depending on whether entries are expressed as * Objects or just Arrays); or * matching close marker. */ protected final static int STATE_NEXT_ENTRY = 2; /** * State in which value matching field name will * be returned. */ protected final static int STATE_NAMED_VALUE = 3; /** * State in which "unnamed" value (entry in an array) * will be returned, if one available; otherwise * end-array is returned. */ protected final static int STATE_UNNAMED_VALUE = 4; /** * State in which end marker is returned; either * null (if no array wrapping), or * {@link JsonToken#END_ARRAY} for wrapping. 
* This step will loop, returning series of nulls * if {@link #nextToken} is called multiple times. */ protected final static int STATE_DOC_END = 5; /* /********************************************************************** /* Configuration /********************************************************************** */ /** * Codec used for data binding when (if) requested. */ protected ObjectCodec _objectCodec; protected int _csvFeatures; /** * Definition of columns being read. Initialized to "empty" instance, which * has default configuration settings. */ protected CsvSchema _schema = EMPTY_SCHEMA; /** * Number of columns defined by schema. */ protected int _columnCount = 0; /* /********************************************************************** /* State /********************************************************************** */ /** * Information about parser context, context in which * the next token is to be parsed (root, array, object). */ protected JsonReadContext _parsingContext; /** * Name of column that we exposed most recently, accessible after * {@link JsonToken#FIELD_NAME} as well as value tokens immediately * following field name. */ protected String _currentName; /** * String value for the current column, if accessed. */ protected String _currentValue; /** * Index of the column we are exposing */ protected int _columnIndex; /** * Current logical state of the parser; one of STATE_ * constants. */ protected int _state = STATE_DOC_START; /** * We will hold on to decoded binary data, for duration of * current event, so that multiple calls to * {@link #getBinaryValue} will not need to decode data more * than once. 
*/ protected byte[] _binaryValue; /* /********************************************************************** /* Helper objects /********************************************************************** */ /** * Thing that actually reads the CSV content */ protected final CsvReader _reader; /** * Buffer that contains contents of all values after processing * of doubled-quotes, escaped characters. */ protected final TextBuffer _textBuffer; protected ByteArrayBuilder _byteArrayBuilder; /* /********************************************************************** /* Life-cycle /********************************************************************** */ public CsvParser(IOContext ctxt, BufferRecycler br, int parserFeatures, int csvFeatures, ObjectCodec codec, Reader reader) { super(parserFeatures); _objectCodec = codec; _textBuffer = new TextBuffer(br); _csvFeatures = csvFeatures; _parsingContext = JsonReadContext.createRootContext(); _reader = new CsvReader(this, ctxt, reader, _schema, _textBuffer, isEnabled(JsonParser.Feature.AUTO_CLOSE_SOURCE), isEnabled(Feature.TRIM_SPACES)); } @Override public ObjectCodec getCodec() { return _objectCodec; } @Override public void setCodec(ObjectCodec c) { _objectCodec = c; } @Override public boolean canUseSchema(FormatSchema schema) { return (schema instanceof FormatSchema); } @Override public void setSchema(FormatSchema schema) { if (schema instanceof CsvSchema) { _schema = (CsvSchema) schema; } else if (schema == null) { schema = EMPTY_SCHEMA; } else { super.setSchema(schema); } _columnCount = _schema.size(); _reader.setSchema(_schema); } @Override public int releaseBuffered(Writer out) throws IOException { return _reader.releaseBuffered(out); } @Override public boolean isClosed() { return _reader.isClosed(); } @Override public void close() throws IOException { _reader.close(); } /* /*************************************************** /* Public API, configuration /*************************************************** */ /** * Method for 
enabling specified CSV feature * (check {@link Feature} for list of features) */ public JsonParser enable(Feature f) { _csvFeatures |= f.getMask(); return this; } /** * Method for disabling specified CSV feature * (check {@link Feature} for list of features) */ public JsonParser disable(Feature f) { _csvFeatures &= ~f.getMask(); return this; } /** * Method for enabling or disabling specified CSV feature * (check {@link Feature} for list of features) */ public JsonParser configure(Feature f, boolean state) { if (state) { enable(f); } else { disable(f); } return this; } /** * Method for checking whether specified CSV {@link Feature} * is enabled. */ public boolean isEnabled(Feature f) { return (_csvFeatures & f.getMask()) != 0; } /** * Accessor for getting active schema definition: it may be * "empty" (no column definitions), but will never be null * since it defaults to an empty schema (and default configuration) *

* NOTE: should be part of JsonParser, will be for 2.0. */ public CsvSchema getSchema() { return _schema; } /* /********************************************************** /* Location info /********************************************************** */ @Override public JsonStreamContext getParsingContext() { return _parsingContext; } @Override public JsonLocation getTokenLocation() { return _reader.getTokenLocation(); } @Override public JsonLocation getCurrentLocation() { return _reader.getCurrentLocation(); } @Override public Object getInputSource() { return _reader.getInputSource(); } /* /********************************************************** /* Parsing /********************************************************** */ @Override public String getCurrentName() throws IOException, JsonParseException { return _currentName; } @Override public JsonToken nextToken() throws IOException, JsonParseException { _binaryValue = null; switch (_state) { case STATE_DOC_START: return (_currToken = _handleStartDoc()); case STATE_RECORD_START: return (_currToken = _handleRecordStart()); case STATE_NEXT_ENTRY: return (_currToken = _handleNextEntry()); case STATE_NAMED_VALUE: return (_currToken = _handleNamedValue()); case STATE_UNNAMED_VALUE: return (_currToken = _handleUnnamedValue()); case STATE_DOC_END: _reader.close(); if (_parsingContext.inRoot()) { return null; } // should always be in array, actually... but: boolean inArray = _parsingContext.inArray(); _parsingContext = _parsingContext.getParent(); return inArray ? JsonToken.END_ARRAY : JsonToken.END_OBJECT; default: throw new IllegalStateException(); } } /* /********************************************************** /* Parsing, helper methods /********************************************************** */ /** * Method called to handle details of initializing things to return * the very first token. 
*/ protected JsonToken _handleStartDoc() throws IOException, JsonParseException { // First things first: are we expecting header line? If so, read, process if (_schema.useHeader()) { _readHeaderLine(); } // and if we are to skip the first data line, skip it if (_schema.skipFirstDataRow()) { _reader.skipLine(); } /* Only one real complication, actually; empy documents (zero bytes). * Those have no entries. Should be easy enough to detect like so: */ if (!_reader.hasMoreInput()) { _state = STATE_DOC_END; // but even empty sequence must still be wrapped in logical array if (isEnabled(Feature.WRAP_AS_ARRAY)) { _parsingContext = _reader.childArrayContext(_parsingContext); return JsonToken.START_ARRAY; } return null; } if (isEnabled(Feature.WRAP_AS_ARRAY)) { _parsingContext = _reader.childArrayContext(_parsingContext); _state = STATE_RECORD_START; return JsonToken.START_ARRAY; } // otherwise, same as regular new entry... return _handleRecordStart(); } protected JsonToken _handleRecordStart() throws IOException, JsonParseException { _columnIndex = 0; if (_columnCount == 0) { // no schema; exposed as an array _state = STATE_UNNAMED_VALUE; _parsingContext = _reader.childArrayContext(_parsingContext); return JsonToken.START_ARRAY; } // otherwise, exposed as an Object _parsingContext = _reader.childObjectContext(_parsingContext); _state = STATE_NEXT_ENTRY; return JsonToken.START_OBJECT; } protected JsonToken _handleNextEntry() throws IOException, JsonParseException { // NOTE: only called when we do have real Schema String next = _reader.nextString(); if (next == null) { // end of record or input... 
_parsingContext = _parsingContext.getParent(); // let's handle EOF or linefeed if (!_reader.startNewLine()) { _state = STATE_DOC_END; } else { // no, just end of record _state = STATE_RECORD_START; } return JsonToken.END_OBJECT; } _state = STATE_NAMED_VALUE; _currentValue = next; if (_columnIndex >= _columnCount) { _reportError("Too many entries: expected at most "+_columnCount+" (value ("+next.length()+" chars) \""+next+"\")"); } _currentName = _schema.column(_columnIndex).getName(); return JsonToken.FIELD_NAME; } protected JsonToken _handleNamedValue() throws IOException, JsonParseException { _state = STATE_NEXT_ENTRY; ++_columnIndex; return JsonToken.VALUE_STRING; } protected JsonToken _handleUnnamedValue() throws IOException, JsonParseException { String next = _reader.nextString(); if (next == null) { // end of record or input... _parsingContext = _parsingContext.getParent(); if (!_reader.startNewLine()) { // end of whole thing... _state = STATE_DOC_END; } else { // no, just end of record _state = STATE_RECORD_START; } return JsonToken.END_ARRAY; } // state remains the same _currentValue = next; ++_columnIndex; return JsonToken.VALUE_STRING; } /** * Method called to process the expected header line */ protected void _readHeaderLine() throws IOException, JsonParseException { /* Two separate cases: * * (a) We already have a Schema with columns; if so, header will be skipped * (b) Otherwise, need to find column definitions; empty one is not acceptable */ if (_schema.size() > 0) { // case (a); skip all/any while (_reader.nextString() != null) { } return; } // case (b); read all String name; // base setting on existing schema, but drop columns CsvSchema.Builder builder = _schema.rebuild().clearColumns(); while ((name = _reader.nextString()) != null) { // one more thing: always trim names, regardless of config settings name = name.trim(); // See if "old" schema defined type; if so, use that type... 
CsvSchema.Column prev = _schema.column(name); if (prev != null) { builder.addColumn(name, prev.getType()); } else { builder.addColumn(name); } } // Ok: did we get any columns? CsvSchema newSchema = builder.build(); int size = newSchema.size(); if (size < 2) { // 1 just because we may get 'empty' header name String first = (size == 0) ? "" : newSchema.column(0).getName().trim(); if (first.length() == 0) { _reportError("Empty header line: can not bind data"); } } // otherwise we will use what we got setSchema(builder.build()); } /* /********************************************************** /* String value handling /********************************************************** */ // For now we do not store char[] representation... @Override public boolean hasTextCharacters() { return _textBuffer.hasTextAsCharacters(); } @Override public String getText() throws IOException, JsonParseException { return _currentValue; } @Override public char[] getTextCharacters() throws IOException, JsonParseException { return _textBuffer.contentsAsArray(); } @Override public int getTextLength() throws IOException, JsonParseException { return _textBuffer.size(); } @Override public int getTextOffset() throws IOException, JsonParseException { return 0; } /* /********************************************************************** /* Binary (base64) /********************************************************************** */ @Override public byte[] getBinaryValue(Base64Variant variant) throws IOException, JsonParseException { if (_binaryValue == null) { if (_currToken != JsonToken.VALUE_STRING) { _reportError("Current token ("+_currToken+") not VALUE_STRING, can not access as binary"); } ByteArrayBuilder builder = _getByteArrayBuilder(); _decodeBase64(_currentValue, builder, variant); _binaryValue = builder.toByteArray(); } return _binaryValue; } /* /********************************************************************** /* Number accessors 
/********************************************************************** */ @Override public NumberType getNumberType() throws IOException, JsonParseException { return _reader.getNumberType(); } @Override public Number getNumberValue() throws IOException, JsonParseException { return _reader.getNumberValue(); } @Override public int getIntValue() throws IOException, JsonParseException { return _reader.getIntValue(); } @Override public long getLongValue() throws IOException, JsonParseException { return _reader.getLongValue(); } @Override public BigInteger getBigIntegerValue() throws IOException, JsonParseException { return _reader.getBigIntegerValue(); } @Override public float getFloatValue() throws IOException, JsonParseException { return _reader.getFloatValue(); } @Override public double getDoubleValue() throws IOException, JsonParseException { return _reader.getDoubleValue(); } @Override public BigDecimal getDecimalValue() throws IOException, JsonParseException { return _reader.getDecimalValue(); } /* /********************************************************************** /* Helper methods from base class /********************************************************************** */ @Override protected void _handleEOF() throws JsonParseException { // I don't think there's problem with EOFs usually; except maybe in quoted stuff? 
_reportInvalidEOF(": expected closing quote character"); } /* /********************************************************************** /* Helper methods for CsvReader /********************************************************************** */ // must be (re)defined to make package-accessible public void _reportCsvError(String msg) throws JsonParseException { super._reportError(msg); } public void _reportUnexpectedCsvChar(int ch, String msg) throws JsonParseException { super._reportUnexpectedChar(ch, msg); } /* /********************************************************************** /* Internal methods /********************************************************************** */ public ByteArrayBuilder _getByteArrayBuilder() { if (_byteArrayBuilder == null) { _byteArrayBuilder = new ByteArrayBuilder(); } else { _byteArrayBuilder.reset(); } return _byteArrayBuilder; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy