All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opensearch.ingest.common.CsvParser Maven / Gradle / Ivy

/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.ingest.common;

import org.opensearch.ingest.IngestDocument;

/**
 * Single-pass state machine that splits one CSV line into fields and writes each field into an
 * {@link IngestDocument} under the header found at the same position in {@code headers}.
 *
 * <p>Parsing stops as soon as every header has received a field; any remaining input is ignored.
 * A line with fewer fields than headers simply leaves the remaining headers untouched.
 *
 * <p>The parser keeps its position and state in instance fields that are never reset, so an
 * instance is meant to process a single line.
 */
final class CsvParser {

    private static final char LF = '\n';
    private static final char CR = '\r';
    private static final char SPACE = ' ';
    private static final char TAB = '\t';

    /** Position of the parser relative to the field currently being read. */
    private enum State {
        /** Before the first significant character of a field. */
        START,
        /** Inside a field that did not start with the quote character. */
        UNQUOTED,
        /** Inside a quoted field, after the opening quote. */
        QUOTED,
        /** Just after a quote seen inside a quoted field: either the closing quote or the first half of an escaped quote. */
        QUOTED_END
    }

    private final char quote;
    private final char separator;
    private final boolean trim;
    private final String[] headers;
    private final Object emptyValue;
    private final IngestDocument ingestDocument;
    /** Accumulates the pieces of a quoted field that contains escaped (doubled) quotes. */
    private final StringBuilder builder = new StringBuilder();
    private State state = State.START;
    private String line;
    /** Index into {@code headers} of the next field to be written. */
    private int currentHeader = 0;
    /** Index in {@code line} where the not-yet-emitted part of the current field begins. */
    private int startIndex = 0;
    private int length;
    private int currentIndex;

    /**
     * @param ingestDocument document that receives the parsed fields
     * @param quote          character that opens and closes a quoted field; doubled inside a quoted field it stands for itself
     * @param separator      character that separates fields
     * @param trim           whether spaces and tabs around unquoted fields are dropped
     * @param headers        target field names, in column order
     * @param emptyValue     value written for empty fields; when {@code null} empty fields are not written at all
     */
    CsvParser(IngestDocument ingestDocument, char quote, char separator, boolean trim, String[] headers, Object emptyValue) {
        this.ingestDocument = ingestDocument;
        this.quote = quote;
        this.separator = separator;
        this.trim = trim;
        this.headers = headers;
        this.emptyValue = emptyValue;
    }

    /**
     * Parses {@code line} and writes its fields into the ingest document.
     *
     * @param line the CSV line to parse
     * @throws IllegalArgumentException if a quoted field is never closed, if an unquoted field contains a
     *                                  quote, CR or LF, or if a closing quote is followed by anything other
     *                                  than whitespace or a separator
     */
    void process(String line) {
        this.line = line;
        length = line.length();
        // Each handler consumes as many characters as it can and leaves currentIndex on the last
        // character it handled; the loop increment then moves past that character.
        for (currentIndex = 0; currentIndex < length; currentIndex++) {
            switch (state) {
                case START:
                    if (processStart()) {
                        return;
                    }
                    break;
                case UNQUOTED:
                    if (processUnquoted()) {
                        return;
                    }
                    break;
                case QUOTED:
                    processQuoted();
                    break;
                case QUOTED_END:
                    if (processQuotedEnd()) {
                        return;
                    }
                    break;
            }
        }

        // we've reached end of string, we need to handle last field
        switch (state) {
            case UNQUOTED:
                // Trailing whitespace of the last field must be dropped here: the counter used by
                // processUnquoted() only applies when a separator terminates the field.
                setField(trim ? trimmedEnd(length) : length);
                break;
            case QUOTED_END:
                // The last character of the line is the closing quote; exclude it.
                setField(length - 1);
                break;
            case QUOTED:
                throw new IllegalArgumentException("Unmatched quote");
            default:
                // START: the line ended before another field began, so no further field is written.
                break;
        }
    }

    /**
     * Handles characters that precede a field: skips whitespace, emits a field for every separator
     * met before any content, and switches to {@code QUOTED} or {@code UNQUOTED} when content begins.
     *
     * @return {@code true} if parsing is finished (all headers filled or end of line reached)
     */
    private boolean processStart() {
        for (; currentIndex < length; currentIndex++) {
            char c = currentChar();
            if (c == quote) {
                state = State.QUOTED;
                builder.setLength(0);
                startIndex = currentIndex + 1;
                return false;
            } else if (c == separator) {
                // The field ends right before this separator. With trim enabled startIndex has
                // followed currentIndex, so the field is empty; with trim disabled it consists of
                // the whitespace seen since the previous separator.
                builder.setLength(0);
                if (setField(currentIndex)) {
                    return true;
                }
                // The next field always starts right after the separator just consumed.
                startIndex = currentIndex + 1;
            } else if (isWhitespace(c)) {
                if (trim) {
                    startIndex++;
                }
            } else {
                state = State.UNQUOTED;
                builder.setLength(0);
                return false;
            }
        }
        return true;
    }

    /**
     * Consumes the remainder of an unquoted field up to the next separator or the end of the line.
     *
     * @return {@code true} if the field just emitted filled the last header
     * @throws IllegalArgumentException if the field contains a quote, CR or LF
     */
    private boolean processUnquoted() {
        // Number of consecutive whitespace characters directly before the current position;
        // only counted when trimming so they can be excluded from the field.
        int spaceCount = 0;
        for (; currentIndex < length; currentIndex++) {
            char c = currentChar();
            if (c == LF || c == CR || c == quote) {
                throw new IllegalArgumentException("Illegal character inside unquoted field at " + currentIndex);
            } else if (c == separator) {
                state = State.START;
                if (setField(currentIndex - spaceCount)) {
                    return true;
                }
                startIndex = currentIndex + 1;
                return false;
            } else if (trim && isWhitespace(c)) {
                spaceCount++;
            } else {
                spaceCount = 0;
            }
        }
        // End of line: process() emits the last field.
        return false;
    }

    /** Advances to the next quote character inside a quoted field, if there is one. */
    private void processQuoted() {
        for (; currentIndex < length; currentIndex++) {
            if (currentChar() == quote) {
                state = State.QUOTED_END;
                break;
            }
        }
    }

    /**
     * Decides what the quote seen by {@link #processQuoted()} meant. A second quote right after it is
     * an escaped quote and the field continues; otherwise the field is closed and only whitespace may
     * follow before the next separator or the end of the line.
     *
     * @return {@code true} if parsing is finished (all headers filled or end of line reached)
     * @throws IllegalArgumentException if a non-whitespace, non-separator character follows the closing quote
     */
    private boolean processQuotedEnd() {
        char c = currentChar();
        if (c == quote) {
            // Escaped quote: keep the text before the first quote plus one literal quote.
            builder.append(line, startIndex, currentIndex - 1).append(quote);
            startIndex = currentIndex + 1;
            state = State.QUOTED;
            return false;
        }
        // The field is emitted once, at the first character after the closing quote.
        boolean shouldSetField = true;
        for (; currentIndex < length; currentIndex++) {
            c = currentChar();
            if (c == separator) {
                if (shouldSetField && setField(currentIndex - 1)) {
                    return true;
                }
                startIndex = currentIndex + 1;
                state = State.START;
                return false;
            } else if (isWhitespace(c)) {
                if (shouldSetField) {
                    if (setField(currentIndex - 1)) {
                        return true;
                    }
                    shouldSetField = false;
                }
            } else {
                throw new IllegalArgumentException("character '" + c + "' after quoted field at " + currentIndex);
            }
        }
        return true;
    }

    private char currentChar() {
        return line.charAt(currentIndex);
    }

    private boolean isWhitespace(char c) {
        return c == SPACE || c == TAB;
    }

    /**
     * Returns {@code endIndex} moved left past any spaces and tabs that directly precede it,
     * never going below {@code startIndex}.
     */
    private int trimmedEnd(int endIndex) {
        int end = endIndex;
        while (end > startIndex && isWhitespace(line.charAt(end - 1))) {
            end--;
        }
        return end;
    }

    /**
     * Emits the current field, which spans {@code [startIndex, endIndex)} of the line, prefixed by
     * whatever has been accumulated in {@code builder}. A non-empty value is written as is; an empty
     * value is replaced by {@code emptyValue}, or skipped when {@code emptyValue} is {@code null}.
     *
     * @param endIndex exclusive end of the field inside the line
     * @return {@code true} if this field was assigned to the last header
     */
    private boolean setField(int endIndex) {
        String value;
        if (builder.length() == 0) {
            value = line.substring(startIndex, endIndex);
        } else {
            value = builder.append(line, startIndex, endIndex).toString();
        }
        if (value.length() > 0) {
            ingestDocument.setFieldValue(headers[currentHeader], value);
        } else if (emptyValue != null) {
            ingestDocument.setFieldValue(headers[currentHeader], emptyValue);
        }
        currentHeader++;
        return currentHeader == headers.length;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy