com.xceptance.common.util.CsvUtilsDecode Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of xlt Show documentation
XLT (Xceptance LoadTest) is an extensive load and performance test tool developed and maintained by Xceptance.
The newest version!
/*
 * Copyright (c) 2005-2024 Xceptance Software Technologies GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.xceptance.common.util;

import java.text.ParseException;

import com.xceptance.xlt.api.util.SimpleArrayList;
import com.xceptance.xlt.api.util.XltCharBuffer;

/**
 * The {@link CsvUtilsDecode} class provides helper methods to decode values from the CSV format. This is the high
 * performance and most efficient method. It will avoid copying data at all cost and move through the cache very
 * efficiently.
 * 
 * Note: This version is no longer the fastest one possible. It suffers from a JIT issue when it sees quoted lines before
 * regular lines. It has been replaced with {@link CsvLineDecoder}. But the new version only supports a comma as
 * delimiter for performance reasons, hence this class still has value.
 *
 * @author René Schwietzke
 * @since 7.0.0
 */
public final class CsvUtilsDecode
{
    /**
     * Character constant representing a comma.
     */
    private static final char COMMA = ',';

    /**
     * Character constant representing a double quote.
     */
    private static final char QUOTE_CHAR = '"';

    /**
     * Default constructor. Declared private to prevent external instantiation.
     */
    private CsvUtilsDecode()
    {
    }

    /**
     * Decodes the given CSV-encoded data record and returns the plain unquoted fields. A comma is used as field
     * separator and the result list is created with an initial capacity of 50.
     *
     * @param s
     *            the CSV-encoded data record
     * @return the plain fields
     */
    public static SimpleArrayList<XltCharBuffer> parse(final String s)
    {
        return parse(new SimpleArrayList<>(50), XltCharBuffer.valueOf(s), COMMA);
    }

    // Our bit flags for the parser. They are combined in a single int, hence each flag uses its own bit.

    /** No flag set. Not referenced by the parser, kept to document the bit layout. */
    @SuppressWarnings("unused")
    private static final int NONE = 0;

    /** Set while the parser is inside a quoted section. The flag is maintained but never read back. */
    private static final int IN_QUOTES = 1;

    /** Set at the very beginning of a field, i.e. no character of the field has been consumed yet. */
    private static final int START_MODE = 2;

    /** Not referenced by the parser, kept to document the bit layout. */
    @SuppressWarnings("unused")
    private static final int SEPARATOR_EXPECTED = 4;

    /** Set when the most recently consumed character was a field separator. */
    private static final int LAST_WAS_SEPARATOR = 8;

    /** Set when a closing quote has just been consumed. */
    private static final int JUST_LEFT_QUOTES = 16;

    /** Set when there is a pending field that has to be added once the input is exhausted. */
    private static final int COPY_REST = 32;

    /**
     * Decodes the given CSV-encoded data record using the given field separator and appends the plain unquoted fields
     * to the passed list.
     * <p>
     * Note: The source buffer is modified in place. Whenever quotes are dropped, the remaining characters of that
     * field are moved towards the start of the field via {@link XltCharBuffer#put}. The fields added to the list are
     * obtained via {@link XltCharBuffer#viewFromTo}, so do not reuse the content of <code>src</code> afterwards.
     * <p>
     * Special cases:
     * <ul>
     * <li>An empty record yields a single empty field.</li>
     * <li>A trailing separator yields an additional empty field at the end.</li>
     * <li>A quoted section that is never closed is handled softly: everything up to the end of the record becomes
     * part of the field, no error is raised.</li>
     * <li>Characters between a closing quote and the next separator are not skipped but become part of the
     * field.</li>
     * </ul>
     *
     * @param result
     *            a list to append to, for memory efficiency, we hand one in instead of creating our own
     * @param src
     *            the buffer to read from, it will be modified
     * @param fieldSeparator
     *            the field separator to use
     * @return the passed list with the decoded fields appended
     */
    public static SimpleArrayList<XltCharBuffer> parse(final SimpleArrayList<XltCharBuffer> result, final XltCharBuffer src,
                                                       final char fieldSeparator)
    {
        final int size = src.length();

        int state = START_MODE;
        int pos = 0; // read cursor
        int start = 0; // begin of the current field
        int offset = 0; // write cursor, equals the (exclusive) end of the current field

        Main:
        while (pos < size)
        {
            final char c = src.charAt(pos);

            // we got at least one character, so there is something to flush at the end unless
            // a separator tells us otherwise further down
            state = state | COPY_REST;
            state = state & ~LAST_WAS_SEPARATOR;

            // do we have a quote?
            if (c == QUOTE_CHAR)
            {
                // our first quote because this all handled in the inner loop
                state = state | IN_QUOTES;

                pos++;

                // when we just started, we want to move the offset as well to avoid
                // too much copying
                if ((state & START_MODE) == START_MODE)
                {
                    offset++; // we also change the offset to avoid copying too much
                    start = offset;
                }

                // now read until we leave this state again or exhaust the buffer
                InQuotes:
                while (pos < size)
                {
                    final char c2 = src.charAt(pos);

                    if (c2 == QUOTE_CHAR)
                    {
                        // this is either the end quote or a quoted quote

                        // peek
                        // NOTE(review): 0 is treated as "nothing left" here, so a literal NUL character
                        // right after a quote would presumably be handled like the end of the buffer
                        final char peekedChar = src.peakAhead(pos + 1);
                        if (peekedChar == 0)
                        {
                            // there is nothing anymore, break here, pos is right because
                            // we just peeked and did not move the cursor
                            state = state | COPY_REST;

                            break Main;
                        }

                        // we have not reached the end, let's see what we got
                        if (peekedChar == QUOTE_CHAR)
                        {
                            // this is a quoted quote, preserve it and jump over
                            // it, but move the offset only by one because now
                            // everything has to be moved until we reach the end OR
                            // worse... more of these
                            src.put(offset, c2);

                            offset++;
                            pos += 2;

                            continue InQuotes;
                        }

                        // we have to jump over the quote, we are at the end
                        pos++;

                        state = state & ~(IN_QUOTES | START_MODE);
                        state = state | JUST_LEFT_QUOTES;

                        continue Main;
                    }

                    // ok, no quote, so this is just text in quotes, move it up only if
                    // read and write cursor have drifted apart
                    if (pos != offset)
                    {
                        src.put(offset, c2);
                    }
                    pos++;
                    offset++;

                    state = state & ~START_MODE;
                }

                // if we got here, quotes does not close
                // deal with it in a soft way at the moment and copy the rest
                state = state | COPY_REST;

                break Main;
            }

            if (c == fieldSeparator)
            {
                // if we just left the quotes, just reset because that is expected
                // or we already skipped a few characters
                state = state & ~(JUST_LEFT_QUOTES | COPY_REST);

                // we saw a separator and we start a new col
                state = state | (LAST_WAS_SEPARATOR | START_MODE);

                // we need the data
                result.add(src.viewFromTo(start, offset));

                pos++;
                offset = pos;
                start = pos;

                continue Main;
            }

            // Skipping whitespace or garbage between a closing quote and the next separator is not
            // supported because we are in control of our data and don't have whitespaces around the
            // separator. Such characters simply become part of the field.

            // move the char up if we have to
            if (pos != offset)
            {
                src.put(offset, c);
            }

            pos++;
            offset++;

            state = state & ~START_MODE;
        }

        // There is a rest to copy
        if ((state & COPY_REST) == COPY_REST)
        {
            result.add(src.viewFromTo(start, offset));
        }
        else if ((state & LAST_WAS_SEPARATOR) == LAST_WAS_SEPARATOR)
        {
            // we had a separator at the end, so we have an empty string to add
            result.add(XltCharBuffer.empty());
        }
        else if (size == 0)
        {
            // the rare case if an empty string
            result.add(XltCharBuffer.empty());
        }

        return result;
    }
}