org.python.modules._csv.PyReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jython-slim Show documentation
Show all versions of jython-slim Show documentation
Jython is an implementation of the high-level, dynamic, object-oriented
language Python written in 100% Pure Java, and seamlessly integrated with
the Java platform. It thus allows you to run Python on any Java platform.
/* Copyright (c) Jython Developers */
package org.python.modules._csv;
import org.python.core.PyIterator;
import org.python.core.PyList;
import org.python.core.PyObject;
import org.python.core.PyString;
import org.python.core.PyType;
import org.python.core.Visitproc;
import org.python.expose.ExposedGet;
import org.python.expose.ExposedType;
/**
 * CSV file reader.
 *
 * Analogous to CPython's _csv.c::ReaderObj struct.
 *
 * <p>Each call to {@link #__iternext__()} pulls lines from the underlying input iterator and
 * feeds them, one character at a time, through a small state machine ({@link ParserState})
 * until one complete record has been collected. A NUL character ({@code '\0'}) is used
 * internally as the end-of-line sentinel, which is why it is rejected inside the input itself.
 */
@ExposedType(name = "_csv.reader", doc = PyReader.reader_doc)
public class PyReader extends PyIterator {

    public static final PyType TYPE = PyType.fromClass(PyReader.class);

    public static final String reader_doc =
        "CSV reader\n" +
        "\n" +
        "Reader objects are responsible for reading and parsing tabular data\n" +
        "in CSV format.\n";

    /** Initial capacity of the field StringBuilder. */
    private static final int INITIAL_BUILDER_CAPACITY = 4096;

    /** Parsing Dialect. */
    @ExposedGet
    public PyDialect dialect;

    /** The current line number: incremented once for every line taken from the input. */
    @ExposedGet
    public int line_num = 0;

    /** The underlying input iterator. */
    private final PyObject input_iter;

    /** Current CSV parse state. */
    private ParserState state = ParserState.START_RECORD;

    /** Field list for current record. */
    private PyList fields = new PyList();

    /** Builder accumulating the characters of the field currently being parsed. */
    private final StringBuilder field = new StringBuilder(INITIAL_BUILDER_CAPACITY);

    /** Whether the field should be treated as numeric. */
    private boolean numeric_field = false;

    /**
     * Creates a reader over the given line source.
     *
     * @param input_iter object whose {@code __iternext__()} yields the lines to parse
     * @param dialect the dialect controlling delimiter, quoting and escaping rules
     */
    public PyReader(PyObject input_iter, PyDialect dialect) {
        this.input_iter = input_iter;
        this.dialect = dialect;
    }

    /**
     * Parses and returns the next record.
     *
     * <p>Lines are consumed from the input iterator until the state machine is back in
     * {@code START_RECORD}; a record may therefore span several input lines (for example when
     * a quoted field contains a line break).
     *
     * @return a {@link PyList} holding the fields of the next record, or {@code null} once the
     *         input iterator is exhausted
     */
    public PyObject __iternext__() {
        parse_reset();
        do {
            PyObject lineobj = input_iter.__iternext__();
            if (lineobj == null) {
                // End of input OR exception
                if (field.length() != 0 || state == ParserState.IN_QUOTED_FIELD) {
                    // Input ended in the middle of a field.
                    if (dialect.strict) {
                        throw _csv.Error("unexpected end of data");
                    }
                    // Lenient mode: emit what has been collected so far as the last record.
                    parse_save_field();
                    break;
                }
                return null;
            }

            line_num++;
            String line = lineobj.toString();
            int linelen = line.length();

            for (int i = 0; i < linelen; i++) {
                char c = line.charAt(i);
                if (c == '\0') {
                    // '\0' is reserved as the internal end-of-line marker (see below).
                    throw _csv.Error("line contains NULL byte");
                }
                parse_process_char(c);
            }
            // Signal end of line to the state machine.
            parse_process_char('\0');
        } while (state != ParserState.START_RECORD);

        // Hand the finished record to the caller and start a fresh list so the returned
        // object is never mutated by later parsing.
        PyObject record = this.fields;
        this.fields = new PyList();
        return record;
    }

    /**
     * Advances the state machine by one character.
     *
     * @param c the next input character, or {@code '\0'} to signal the end of the current line
     */
    @SuppressWarnings("fallthrough")
    private void parse_process_char(char c) {
        switch (state) {

            case START_RECORD:
                // start of record
                if (c == '\0') {
                    // empty line - return []
                    break;
                } else if (c == '\n' || c == '\r') {
                    state = ParserState.EAT_CRNL;
                    break;
                }
                // normal character - handle as START_FIELD
                state = ParserState.START_FIELD;
                // *** fallthru ***

            case START_FIELD:
                // expecting field
                if (c == '\n' || c == '\r' || c == '\0') {
                    // save empty field - return [fields]
                    parse_save_field();
                    state = c == '\0' ? ParserState.START_RECORD : ParserState.EAT_CRNL;
                } else if (c == dialect.quotechar && dialect.quoting != QuoteStyle.QUOTE_NONE) {
                    // start quoted field
                    state = ParserState.IN_QUOTED_FIELD;
                } else if (c == dialect.escapechar) {
                    // possible escaped character
                    state = ParserState.ESCAPED_CHAR;
                } else if (c == ' ' && dialect.skipinitialspace) {
                    // ignore space at start of field
                    ;
                } else if (c == dialect.delimiter) {
                    // save empty field
                    parse_save_field();
                } else {
                    // begin new unquoted field
                    if (dialect.quoting == QuoteStyle.QUOTE_NONNUMERIC) {
                        // unquoted fields are converted to float when saved
                        numeric_field = true;
                    }
                    parse_add_char(c);
                    state = ParserState.IN_FIELD;
                }
                break;

            case ESCAPED_CHAR:
                if (c == '\0') {
                    // escape character at end of line: store a newline
                    c = '\n';
                }
                parse_add_char(c);
                state = ParserState.IN_FIELD;
                break;

            case IN_FIELD:
                // in unquoted field
                if (c == '\n' || c == '\r' || c == '\0') {
                    // end of line - return [fields]
                    parse_save_field();
                    state = c == '\0' ? ParserState.START_RECORD : ParserState.EAT_CRNL;
                } else if (c == dialect.escapechar) {
                    // possible escaped character
                    state = ParserState.ESCAPED_CHAR;
                } else if (c == dialect.delimiter) {
                    // save field - wait for new field
                    parse_save_field();
                    state = ParserState.START_FIELD;
                } else {
                    // normal character - save in field
                    parse_add_char(c);
                }
                break;

            case IN_QUOTED_FIELD:
                // in quoted field
                if (c == '\0') {
                    // end of line inside quotes: stay in this state so the record
                    // continues on the next input line
                    ;
                } else if (c == dialect.escapechar) {
                    // Possible escape character
                    state = ParserState.ESCAPE_IN_QUOTED_FIELD;
                } else if (c == dialect.quotechar && dialect.quoting != QuoteStyle.QUOTE_NONE) {
                    if (dialect.doublequote) {
                        // doublequote; " represented by ""
                        state = ParserState.QUOTE_IN_QUOTED_FIELD;
                    } else {
                        // end of quote part of field
                        state = ParserState.IN_FIELD;
                    }
                } else {
                    // normal character - save in field
                    parse_add_char(c);
                }
                break;

            case ESCAPE_IN_QUOTED_FIELD:
                if (c == '\0') {
                    // escape character at end of line: store a newline
                    c = '\n';
                }
                parse_add_char(c);
                state = ParserState.IN_QUOTED_FIELD;
                break;

            case QUOTE_IN_QUOTED_FIELD:
                // doublequote - seen a quote in an quoted field
                if (dialect.quoting != QuoteStyle.QUOTE_NONE && c == dialect.quotechar) {
                    // save "" as "
                    parse_add_char(c);
                    state = ParserState.IN_QUOTED_FIELD;
                } else if (c == dialect.delimiter) {
                    // save field - wait for new field
                    parse_save_field();
                    state = ParserState.START_FIELD;
                } else if (c == '\n' || c == '\r' || c == '\0') {
                    // end of line - return [fields]
                    parse_save_field();
                    state = c == '\0' ? ParserState.START_RECORD : ParserState.EAT_CRNL;
                } else if (!dialect.strict) {
                    // lenient: keep the character and carry on as an unquoted field
                    parse_add_char(c);
                    state = ParserState.IN_FIELD;
                } else {
                    // illegal
                    throw _csv.Error(String.format("'%c' expected after '%c'",
                                                   dialect.delimiter, dialect.quotechar));
                }
                break;

            case EAT_CRNL:
                if (c == '\n' || c == '\r') {
                    ;
                } else if (c == '\0') {
                    state = ParserState.START_RECORD;
                } else {
                    String err = "new-line character seen in unquoted field - do you need to "
                            + "open the file in universal-newline mode?";
                    throw _csv.Error(err);
                }
                break;
        }
    }

    /**
     * Prepares the parser for a new record.
     */
    private void parse_reset() {
        fields = new PyList();
        state = ParserState.START_RECORD;
        numeric_field = false;
        // Discard any partial field left behind when a previous call ended with an
        // exception; otherwise those characters would leak into the next record.
        field.setLength(0);
    }

    /**
     * Appends the field collected so far to the current record and clears the builder.
     * When the field was flagged as numeric it is converted with {@code __float__()} first.
     */
    private void parse_save_field() {
        PyObject value = new PyString(field.toString());
        if (numeric_field) {
            numeric_field = false;
            value = value.__float__();
        }
        fields.append(value);

        // Reuse the same builder for the next field instead of allocating a new one.
        field.setLength(0);
    }

    /**
     * Appends one character to the field being built.
     *
     * @param c the character to append
     */
    private void parse_add_char(char c) {
        // Checked before appending: raises once the field already holds
        // _csv.field_limit characters.
        if (field.length() >= _csv.field_limit) {
            throw _csv.Error(String.format("field larger than field limit (%d)",
                                           _csv.field_limit));
        }
        field.append(c);
    }

    /**
     * State of the CSV reader.
     */
    private enum ParserState {
        START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD, IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD,
        QUOTE_IN_QUOTED_FIELD, EAT_CRNL;
    }


    /* Traverseproc implementation */

    /**
     * Visits the dialect, the input iterator and the current field list, after whatever the
     * superclass visits. Stops at, and returns, the first non-zero visit result.
     */
    @Override
    public int traverse(Visitproc visit, Object arg) {
        int retVal = super.traverse(visit, arg);
        if (retVal != 0) {
            return retVal;
        }
        if (dialect != null) {
            retVal = visit.visit(dialect, arg);
            if (retVal != 0) {
                return retVal;
            }
        }
        if (input_iter != null) {
            retVal = visit.visit(input_iter, arg);
            if (retVal != 0) {
                return retVal;
            }
        }
        return fields != null ? visit.visit(fields, arg) : 0;
    }

    /**
     * Reports whether {@code ob} is one of the objects this reader holds directly
     * (the field list, the dialect, the input iterator, or anything the superclass reports).
     *
     * @param ob the candidate referent; {@code null} is never considered referenced
     * @return {@code true} if this reader directly refers to {@code ob}
     */
    @Override
    public boolean refersDirectlyTo(PyObject ob) {
        // The null guard must reject null, not require it: a null field must not make
        // refersDirectlyTo(null) true, and real referents must be recognised.
        return ob != null && (ob == fields || ob == dialect
            || ob == input_iter || super.refersDirectlyTo(ob));
    }
}