// org.opensearch.ingest.common.CsvParser (Maven / Gradle / Ivy)
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.ingest.common;
import org.opensearch.ingest.IngestDocument;
/**
 * Hand-written state machine that splits a single CSV line into fields and writes each
 * field into an {@link IngestDocument} under the corresponding header name.
 *
 * <p>Fields are assigned to {@code headers} in order; parsing stops as soon as every header
 * has received a field. A field whose text is empty is written as {@code emptyValue} when
 * that is non-null and is skipped otherwise. Only space and tab count as whitespace.
 *
 * <p>The parser keeps its position in mutable instance fields ({@code currentHeader},
 * {@code startIndex}, {@code state}) that are never reset, so an instance is meant to
 * process one line.
 */
final class CsvParser {

    private static final char LF = '\n';
    private static final char CR = '\r';
    private static final char SPACE = ' ';
    private static final char TAB = '\t';

    /** Parser states; the state determines how the character at {@code currentIndex} is handled. */
    private enum State {
        /** Before a field: no quote or non-whitespace character has been seen yet. */
        START,
        /** Inside a field that did not begin with the quote character. */
        UNQUOTED,
        /** Inside a quoted field, looking for the next quote character. */
        QUOTED,
        /** Just after a quote inside a quoted field: it is either an escape or the closing quote. */
        QUOTED_END
    }

    private final char quote;
    private final char separator;
    private final boolean trim;
    private final String[] headers;
    private final Object emptyValue;
    private final IngestDocument ingestDocument;
    /** Accumulates the pieces of a quoted field that contains escaped (doubled) quotes. */
    private final StringBuilder builder = new StringBuilder();
    private State state = State.START;
    private String line;
    /** Index into {@code headers} of the next field to be written. */
    private int currentHeader = 0;
    /** Index in {@code line} where the not-yet-copied part of the current field begins. */
    private int startIndex = 0;
    private int length;
    private int currentIndex;

    /**
     * @param ingestDocument document that receives the parsed fields
     * @param quote          character that opens and closes quoted fields
     * @param separator      character that separates fields
     * @param trim           whether whitespace around unquoted fields is dropped
     * @param headers        target field names, in column order
     * @param emptyValue     value stored for empty fields, or {@code null} to skip them
     */
    CsvParser(IngestDocument ingestDocument, char quote, char separator, boolean trim, String[] headers, Object emptyValue) {
        this.ingestDocument = ingestDocument;
        this.quote = quote;
        this.separator = separator;
        this.trim = trim;
        this.headers = headers;
        this.emptyValue = emptyValue;
    }

    /**
     * Parses {@code line} and writes its fields into the ingest document.
     *
     * @param line the CSV line to parse
     * @throws IllegalArgumentException if a quoted field is not closed, if an unquoted field
     *                                  contains a quote, CR or LF, or if anything other than
     *                                  whitespace or a separator follows a closing quote
     */
    void process(String line) {
        this.line = line;
        length = line.length();
        // Each handler consumes as many characters as it can; a handler returning true means
        // parsing is finished (all headers filled, or end of line already dealt with).
        for (currentIndex = 0; currentIndex < length; currentIndex++) {
            switch (state) {
                case START:
                    if (processStart()) {
                        return;
                    }
                    break;
                case UNQUOTED:
                    if (processUnquoted()) {
                        return;
                    }
                    break;
                case QUOTED:
                    processQuoted();
                    break;
                case QUOTED_END:
                    if (processQuotedEnd()) {
                        return;
                    }
                    break;
            }
        }
        // we've reached end of string, we need to handle last field
        switch (state) {
            case UNQUOTED:
                // Trailing whitespace of the last field must be trimmed exactly like it is
                // for a field that is terminated by a separator.
                setField(unquotedFieldEnd());
                break;
            case QUOTED_END:
                // The closing quote is the last character of the line; exclude it.
                setField(length - 1);
                break;
            case QUOTED:
                throw new IllegalArgumentException("Unmatched quote");
            // START: the line ended right after a separator (or was empty); no field is written.
        }
    }

    /**
     * Handles characters before the start of a field: skips whitespace, emits empty fields for
     * consecutive separators and switches state when a field actually begins.
     *
     * @return {@code true} if parsing is finished (end of line or all headers filled)
     */
    private boolean processStart() {
        for (; currentIndex < length; currentIndex++) {
            char c = currentChar();
            if (c == quote) {
                state = State.QUOTED;
                builder.setLength(0);
                startIndex = currentIndex + 1;
                return false;
            } else if (c == separator) {
                builder.setLength(0);
                // With trim enabled startIndex == currentIndex here, so the field is empty.
                // Without trim the field is the run of whitespace seen since startIndex.
                boolean done = setField(currentIndex);
                // The next field always starts right after this separator. Merely incrementing
                // startIndex would leave it pointing into un-trimmed whitespace and corrupt
                // the following field.
                startIndex = currentIndex + 1;
                if (done) {
                    return true;
                }
            } else if (isWhitespace(c)) {
                if (trim) {
                    startIndex++;
                }
            } else {
                state = State.UNQUOTED;
                builder.setLength(0);
                return false;
            }
        }
        return true;
    }

    /**
     * Consumes an unquoted field up to the next separator or the end of the line.
     *
     * @return {@code true} if the last header has been filled
     * @throws IllegalArgumentException if the field contains a quote, CR or LF
     */
    private boolean processUnquoted() {
        // Number of consecutive whitespace characters immediately before currentIndex;
        // only counted when trimming so they can be cut off the end of the field.
        int spaceCount = 0;
        for (; currentIndex < length; currentIndex++) {
            char c = currentChar();
            if (c == LF || c == CR || c == quote) {
                throw new IllegalArgumentException("Illegal character inside unquoted field at " + currentIndex);
            } else if (c == separator) {
                state = State.START;
                if (setField(currentIndex - spaceCount)) {
                    return true;
                }
                startIndex = currentIndex + 1;
                return false;
            } else if (trim && isWhitespace(c)) {
                spaceCount++;
            } else {
                spaceCount = 0;
            }
        }
        // End of line: the field is written by process().
        return false;
    }

    /** Advances to the next quote character inside a quoted field, if there is one. */
    private void processQuoted() {
        for (; currentIndex < length; currentIndex++) {
            if (currentChar() == quote) {
                state = State.QUOTED_END;
                break;
            }
        }
    }

    /**
     * Handles the character following a quote inside a quoted field. A second quote is an
     * escaped quote and the field continues; otherwise the field is closed and only whitespace
     * may follow before the next separator.
     *
     * @return {@code true} if parsing is finished (end of line or all headers filled)
     * @throws IllegalArgumentException if a non-whitespace, non-separator character follows
     *                                  the closing quote
     */
    private boolean processQuotedEnd() {
        char c = currentChar();
        if (c == quote) {
            // Doubled quote: copy the text before the first quote plus a single quote character.
            builder.append(line, startIndex, currentIndex - 1).append(quote);
            startIndex = currentIndex + 1;
            state = State.QUOTED;
            return false;
        }
        // The field is stored once, at the first character after the closing quote.
        boolean shouldSetField = true;
        for (; currentIndex < length; currentIndex++) {
            c = currentChar();
            if (c == separator) {
                if (shouldSetField && setField(currentIndex - 1)) {
                    return true;
                }
                startIndex = currentIndex + 1;
                state = State.START;
                return false;
            } else if (isWhitespace(c)) {
                if (shouldSetField) {
                    if (setField(currentIndex - 1)) {
                        return true;
                    }
                    shouldSetField = false;
                }
            } else {
                throw new IllegalArgumentException("character '" + c + "' after quoted field at " + currentIndex);
            }
        }
        return true;
    }

    private char currentChar() {
        return line.charAt(currentIndex);
    }

    private boolean isWhitespace(char c) {
        return c == SPACE || c == TAB;
    }

    /**
     * Computes the exclusive end index of an unquoted field that runs to the end of the line,
     * dropping trailing whitespace when trimming is enabled.
     */
    private int unquotedFieldEnd() {
        int end = length;
        if (trim) {
            while (end > startIndex && isWhitespace(line.charAt(end - 1))) {
                end--;
            }
        }
        return end;
    }

    /**
     * Stores the current field, which ends (exclusively) at {@code endIndex}, under the current
     * header and advances to the next header.
     *
     * @param endIndex exclusive end index of the field text in {@code line}
     * @return {@code true} if the last header has just been filled
     */
    private boolean setField(int endIndex) {
        String value;
        if (builder.length() == 0) {
            value = line.substring(startIndex, endIndex);
        } else {
            // Quoted field with escaped quotes: append the remaining tail to the collected parts.
            value = builder.append(line, startIndex, endIndex).toString();
        }
        if (value.length() > 0) {
            ingestDocument.setFieldValue(headers[currentHeader], value);
        } else if (emptyValue != null) {
            ingestDocument.setFieldValue(headers[currentHeader], emptyValue);
        }
        currentHeader++;
        return currentHeader == headers.length;
    }
}